Import Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Scrape Webpage for reviews of Ryanair

In [7]:
base_url = "https://www.airlinequality.com/airline-reviews/ryanair"
pages = 15
page_size = 100

reviews = []

# for i in range(1, pages + 1):
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    
    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
Scraping page 11
   ---> 1100 total reviews
Scraping page 12
   ---> 1200 total reviews
Scraping page 13
   ---> 1300 total reviews
Scraping page 14
   ---> 1400 total reviews
Scraping page 15
   ---> 1500 total reviews


Make Dataframe of Variable - Reviews

In [14]:
df = pd.DataFrame()
df["reviews"] = reviews
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Another good affordable fli...
1,✅ Trip Verified | Really impressed! You get wh...
2,✅ Trip Verified | I should like to review my ...
3,✅ Trip Verified | Flight left the gate ahead o...
4,Not Verified | Booked a fight from Copenhagen ...


In [17]:
df.to_csv("reviews.csv")

Clean the data where items such as 'Trip Verified' is removed

In [31]:
reviews = pd.read_csv("reviews.csv")

In [32]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [34]:
reviews.head(20)

Unnamed: 0.1,Unnamed: 0,reviews
0,0,✅ Trip Verified | Another good affordable fli...
1,1,✅ Trip Verified | Really impressed! You get wh...
2,2,✅ Trip Verified | I should like to review my ...
3,3,✅ Trip Verified | Flight left the gate ahead o...
4,4,Not Verified | Booked a fight from Copenhagen ...
5,5,Not Verified | The flight itself is operated ...
6,6,✅ Trip Verified | Staff is rude and has no ma...
7,7,✅ Trip Verified | Ryanair ground service staf...
8,8,Not Verified | I wanted to check in online a ...
9,9,Not Verified | Krakow to Tirana with Ryanair's...


In [35]:
reviews.shape

(1500, 2)

In [37]:
reviews.isnull().sum()

Unnamed: 0    0
reviews       0
dtype: int64

In [38]:
reviews.drop(columns = 'Unnamed: 0' , inplace = True)

In [39]:
reviews.head(10)

Unnamed: 0,reviews
0,✅ Trip Verified | Another good affordable fli...
1,✅ Trip Verified | Really impressed! You get wh...
2,✅ Trip Verified | I should like to review my ...
3,✅ Trip Verified | Flight left the gate ahead o...
4,Not Verified | Booked a fight from Copenhagen ...
5,Not Verified | The flight itself is operated ...
6,✅ Trip Verified | Staff is rude and has no ma...
7,✅ Trip Verified | Ryanair ground service staf...
8,Not Verified | I wanted to check in online a ...
9,Not Verified | Krakow to Tirana with Ryanair's...


Dataframe has Raw and Clean Raw data

In [40]:
import re

reviews['clean_reviews'] = reviews['reviews'].apply(lambda x: re.sub(r'✅ Trip Verified \| | Not Verified \|', '', x))

In [41]:
reviews.head()

Unnamed: 0,reviews,clean_reviews
0,✅ Trip Verified | Another good affordable fli...,Another good affordable flight with Ryanair. ...
1,✅ Trip Verified | Really impressed! You get wh...,"Really impressed! You get what you pay for, th..."
2,✅ Trip Verified | I should like to review my ...,I should like to review my flight from Faro t...
3,✅ Trip Verified | Flight left the gate ahead o...,"Flight left the gate ahead of schedule, fare w..."
4,Not Verified | Booked a fight from Copenhagen ...,Not Verified | Booked a fight from Copenhagen ...


In [42]:
reviews.drop(columns = 'reviews' , inplace = True)

In [43]:
reviews.head()

Unnamed: 0,clean_reviews
0,Another good affordable flight with Ryanair. ...
1,"Really impressed! You get what you pay for, th..."
2,I should like to review my flight from Faro t...
3,"Flight left the gate ahead of schedule, fare w..."
4,Not Verified | Booked a fight from Copenhagen ...


In [44]:
!pip install TextBlob

from textblob import TextBlob

reviews['sentiment'] = reviews['clean_reviews'].apply(lambda x: TextBlob(x).sentiment.polarity)

Collecting TextBlob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: TextBlob
Successfully installed TextBlob-0.17.1


In [45]:
reviews.head()

Unnamed: 0,clean_reviews,sentiment
0,Another good affordable flight with Ryanair. ...,0.561111
1,"Really impressed! You get what you pay for, th...",0.374688
2,I should like to review my flight from Faro t...,0.06131
3,"Flight left the gate ahead of schedule, fare w...",0.295
4,Not Verified | Booked a fight from Copenhagen ...,0.090909


Categories reviews based on criteria 

In [54]:
categories = ['service', 'food', 'comfort', 'staff', 'punctuality', 'timing', 'luggage']

def categorize_review(review):
    for category in categories:
        if category in review:
            return category
    return 'other'

reviews['category'] = reviews['clean_reviews'].apply(categorize_review)

In [55]:
reviews.head(20)

Unnamed: 0,clean_reviews,sentiment,category,sentiment_label,count
0,Another good affordable flight with Ryanair. ...,0.561111,staff,positive,1
1,"Really impressed! You get what you pay for, th...",0.374688,other,positive,1
2,I should like to review my flight from Faro t...,0.06131,other,positive,1
3,"Flight left the gate ahead of schedule, fare w...",0.295,other,positive,1
4,Not Verified | Booked a fight from Copenhagen ...,0.090909,service,positive,1
5,Not Verified | The flight itself is operated ...,-0.112292,other,negative,1
6,"Staff is rude and has no manners, let alone b...",0.037143,other,positive,1
7,Ryanair ground service staff is really bad. I...,-0.7,service,negative,1
8,Not Verified | I wanted to check in online a ...,-0.152083,other,negative,1
9,Not Verified | Krakow to Tirana with Ryanair's...,0.277797,comfort,positive,1


In [56]:
category_sentiments = reviews.groupby('category')['sentiment'].mean()
positive_reviews = reviews[reviews['sentiment'] > 0]
negative_reviews = reviews[reviews['sentiment'] < 0]

In [57]:
# Assign "positive" or "negative" based on sentiment score
reviews['sentiment_label'] = reviews['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

In [58]:
reviews.head(20)

Unnamed: 0,clean_reviews,sentiment,category,sentiment_label,count
0,Another good affordable flight with Ryanair. ...,0.561111,staff,positive,1
1,"Really impressed! You get what you pay for, th...",0.374688,other,positive,1
2,I should like to review my flight from Faro t...,0.06131,other,positive,1
3,"Flight left the gate ahead of schedule, fare w...",0.295,other,positive,1
4,Not Verified | Booked a fight from Copenhagen ...,0.090909,service,positive,1
5,Not Verified | The flight itself is operated ...,-0.112292,other,negative,1
6,"Staff is rude and has no manners, let alone b...",0.037143,other,positive,1
7,Ryanair ground service staff is really bad. I...,-0.7,service,negative,1
8,Not Verified | I wanted to check in online a ...,-0.152083,other,negative,1
9,Not Verified | Krakow to Tirana with Ryanair's...,0.277797,comfort,positive,1


In [59]:
# Calculate the counts of each sentiment label
sentiment_counts = reviews['sentiment_label'].value_counts()

# Calculate the percentage of positive and negative sentiment
positive_percentage = (sentiment_counts.get('positive', 0) / len(reviews)) * 100
negative_percentage = (sentiment_counts.get('negative', 0) / len(reviews)) * 100

print(f"Percentage of positive sentiment: {positive_percentage:.2f}%")
print(f"Percentage of negative sentiment: {negative_percentage:.2f}%")

Percentage of positive sentiment: 58.07%
Percentage of negative sentiment: 40.67%


In [60]:
reviews['count'] = 1
reviews.groupby('category').sum('count')

Unnamed: 0_level_0,sentiment,count
category,Unnamed: 1_level_1,Unnamed: 2_level_1
comfort,7.765866,95
food,10.70771,94
luggage,2.584169,101
other,23.558816,551
punctuality,0.005912,2
service,5.993382,442
staff,4.657858,212
timing,-0.023313,3


In [61]:
# Create a DataFrame with category and sentiment_label columns
category_sentiment_counts = reviews.groupby(['category', 'sentiment_label'])['sentiment_label'].count().reset_index(name='count')

# Pivot the table to create separate columns for positive and negative counts
pivot_table = category_sentiment_counts.pivot(index='category', columns='sentiment_label', values='count')

# Fill NaN values with 0 (in case a category has only one sentiment label)
pivot_table.fillna(0, inplace=True)

# Calculate the total reviews for each category
pivot_table['total_reviews'] = pivot_table['negative'] + pivot_table['positive']

# Reset the index to have 'category' as a regular column
pivot_table.reset_index(inplace=True)

# If a category has no negative or positive reviews, replace NaN with 0
pivot_table['negative'].fillna(0, inplace=True)
pivot_table['positive'].fillna(0, inplace=True)

print(pivot_table)

sentiment_label     category  negative  neutral  positive  total_reviews
0                    comfort      33.0      0.0      62.0           95.0
1                       food      23.0      0.0      71.0           94.0
2                    luggage      45.0      1.0      55.0          100.0
3                      other     204.0     11.0     336.0          540.0
4                punctuality       1.0      0.0       1.0            2.0
5                    service     203.0      5.0     234.0          437.0
6                      staff     100.0      2.0     110.0          210.0
7                     timing       1.0      0.0       2.0            3.0


In [72]:
sentiment_cat_ryanair = pd.DataFrame(pivot_table)

In [73]:
sentiment_cat_ryanair.drop(columns = 'neutral' , inplace = True)

In [74]:
sentiment_cat_ryanair.head()

sentiment_label,category,negative,positive,total_reviews
0,comfort,33.0,62.0,95.0
1,food,23.0,71.0,94.0
2,luggage,45.0,55.0,100.0
3,other,204.0,336.0,540.0
4,punctuality,1.0,1.0,2.0


In [75]:
sentiment_cat_ryanair = sentiment_cat.sort_values('total_reviews' , ascending = False)

In [76]:
sentiment_cat_ryanair.head(10)

sentiment_label,category,negative,positive,total_reviews
3,other,204.0,336.0,540.0
5,service,203.0,234.0,437.0
6,staff,100.0,110.0,210.0
2,luggage,45.0,55.0,100.0
0,comfort,33.0,62.0,95.0
1,food,23.0,71.0,94.0
7,timing,1.0,2.0,3.0
4,punctuality,1.0,1.0,2.0


Scrape Webpage for reviews of Aer Lingus

In [77]:
base_url = "https://www.airlinequality.com/airline-reviews/aer-lingus"
pages = 15
page_size = 100

reviews_aerlingus = []

# for i in range(1, pages + 1):
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews_aerlingus.append(para.get_text())
    
    print(f"   ---> {len(reviews_aerlingus)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 994 total reviews
Scraping page 11
   ---> 994 total reviews
Scraping page 12
   ---> 994 total reviews
Scraping page 13
   ---> 994 total reviews
Scraping page 14
   ---> 994 total reviews
Scraping page 15
   ---> 994 total reviews


In [78]:
df = pd.DataFrame()
df["reviews_aerlingus"] = reviews_aerlingus
df.head()

Unnamed: 0,reviews_aerlingus
0,✅ Trip Verified | Disgusting service from sta...
1,✅ Trip Verified | Disgraceful experience in Mu...
2,"✅ Trip Verified | Flew on EI-EIL, an old airc..."
3,✅ Trip Verified | My girlfriend and I had a 9...
4,Not Verified | The check-in staff at Mancheste...


In [81]:
df.to_csv("reviews_aerlingus.csv")

In [82]:
reviews_aerlingus = pd.read_csv("reviews_aerlingus.csv")

In [83]:
reviews_aerlingus.head()

Unnamed: 0.1,Unnamed: 0,reviews_aerlingus
0,0,✅ Trip Verified | Disgusting service from sta...
1,1,✅ Trip Verified | Disgraceful experience in Mu...
2,2,"✅ Trip Verified | Flew on EI-EIL, an old airc..."
3,3,✅ Trip Verified | My girlfriend and I had a 9...
4,4,Not Verified | The check-in staff at Mancheste...


In [98]:
reviews_aerlingus.isnull().sum()

reviews_aerlingus          0
clean_reviews_aerlingus    0
category                   0
dtype: int64

In [99]:
reviews_aerlingus.head()

Unnamed: 0,reviews_aerlingus,clean_reviews_aerlingus,category
0,✅ Trip Verified | Disgusting service from sta...,"Disgusting service from start to finish, trea...",service
1,✅ Trip Verified | Disgraceful experience in Mu...,Disgraceful experience in Munich. Only one per...,other
2,"✅ Trip Verified | Flew on EI-EIL, an old airc...","Flew on EI-EIL, an old aircraft that is in ho...",other
3,✅ Trip Verified | My girlfriend and I had a 9...,My girlfriend and I had a 9 hours flight with...,food
4,Not Verified | The check-in staff at Mancheste...,Not Verified | The check-in staff at Mancheste...,staff


In [100]:
reviews_aerlingus.drop(columns = 'reviews_aerlingus' , inplace = True)

In [101]:
reviews_aerlingus.head()

Unnamed: 0,clean_reviews_aerlingus,category
0,"Disgusting service from start to finish, trea...",service
1,Disgraceful experience in Munich. Only one per...,other
2,"Flew on EI-EIL, an old aircraft that is in ho...",other
3,My girlfriend and I had a 9 hours flight with...,food
4,Not Verified | The check-in staff at Mancheste...,staff


In [103]:
reviews_aerlingus['sentiment'] = reviews_aerlingus['clean_reviews_aerlingus'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [107]:
reviews_aerlingus.head()

Unnamed: 0,clean_reviews_aerlingus,category,sentiment
0,"Disgusting service from start to finish, trea...",service,-0.27
1,Disgraceful experience in Munich. Only one per...,other,0.285227
2,"Flew on EI-EIL, an old aircraft that is in ho...",other,-0.425
3,My girlfriend and I had a 9 hours flight with...,food,0.0
4,Not Verified | The check-in staff at Mancheste...,staff,0.070833


In [108]:
category_sentiments = reviews_aerlingus.groupby('category')['sentiment'].mean()
positive_reviews = reviews_aerlingus[reviews_aerlingus['sentiment'] > 0]
negative_reviews = reviews_aerlingus[reviews_aerlingus['sentiment'] < 0]

In [109]:
# Assign "positive" or "negative" based on sentiment score
reviews_aerlingus['sentiment_label'] = reviews_aerlingus['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

In [110]:
reviews_aerlingus.head()

Unnamed: 0,clean_reviews_aerlingus,category,sentiment,sentiment_label
0,"Disgusting service from start to finish, trea...",service,-0.27,negative
1,Disgraceful experience in Munich. Only one per...,other,0.285227,positive
2,"Flew on EI-EIL, an old aircraft that is in ho...",other,-0.425,negative
3,My girlfriend and I had a 9 hours flight with...,food,0.0,neutral
4,Not Verified | The check-in staff at Mancheste...,staff,0.070833,positive


In [111]:
# Calculate the counts of each sentiment label
sentiment_counts = reviews_aerlingus['sentiment_label'].value_counts()

# Calculate the percentage of positive and negative sentiment
positive_percentage = (sentiment_counts.get('positive', 0) / len(reviews_aerlingus)) * 100
negative_percentage = (sentiment_counts.get('negative', 0) / len(reviews_aerlingus)) * 100

print(f"Percentage of positive sentiment: {positive_percentage:.2f}%")
print(f"Percentage of negative sentiment: {negative_percentage:.2f}%")

Percentage of positive sentiment: 70.42%
Percentage of negative sentiment: 29.07%


In [112]:
reviews_aerlingus['count'] = 1
reviews_aerlingus.groupby('category').sum('count')

Unnamed: 0_level_0,sentiment,count
category,Unnamed: 1_level_1,Unnamed: 2_level_1
comfort,9.993652,69
food,20.449729,134
luggage,0.640125,44
other,21.382791,249
service,43.605091,392
staff,10.469725,105
timing,0.137315,1
