## Airline Review Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value For Money,Recommended,Airlines,author,Verified,review_length
0,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1.0,2.0,2.0,1.0,2.0,2.0,no,ryanair,Y Chen,0,108
1,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2.0,4.0,2.0,1.0,1.0,3.0,no,ryanair,Diego Perez,0,594
2,5.0,United Kingdom,03-18-2024,Luton to Faro and return Faro to Luton 10 da...,Couple Leisure,Economy Class,Luton to Faro,March 2024,1.0,3.0,2.0,3.0,1.0,2.0,no,ryanair,15 reviews\n\n\n\nRichard Hodges,1,211
3,1.0,Canada,03-16-2024,Very bad airline. I spent 100 pounds check i...,Family Leisure,Economy Class,Edinburgh to Stansted,March 2024,1.0,1.0,2.0,3.0,1.0,2.0,no,ryanair,G Han,1,84
4,7.0,United Kingdom,03-10-2024,"When things go wrong with low-cost carriers,...",Solo Leisure,Economy Class,Stansted to Sofia,March 2024,3.0,4.0,2.0,3.0,3.0,5.0,yes,ryanair,R Vines,1,111


### Sentiment Analysis Using RoBERTa

In [12]:
model_path = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
roberta_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# a function that takes text and model to calculate probability of each sentiment
def sentiment_analyzer(text, model):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = np.round(softmax(scores), 2)
    scores_dict = {"neg": scores[0], "neu": scores[1], "pos": scores[2]}
    return scores_dict

In [13]:
df["probabilities"] = df["review"].apply(sentiment_analyzer, model=roberta_model)

In [14]:
# Apply the RoBERTa function
probabilities = df["probabilities"].apply(pd.Series)
df = df.join(probabilities)
df = df.drop("probabilities", axis=1)

# Calculate the polarity for each text
polarity_weights = torch.tensor([-1, 0, 1])
probs = torch.tensor(df[["neg", "neu", "pos"]].values)
polarity = polarity_weights * probs
polarity = polarity.sum(dim=-1)
polarity_scaled = nn.Tanh()(polarity)

# Assign labels for each text based on polarity
df["roberta_polarity"] = polarity_scaled.numpy()
df["roberta_sentiment"] = pd.cut(df["roberta_polarity"],
                                  bins=[-1.0, -0.25, 0.25, 1.0],
                                  labels=["Negative", "Neutral", "Positive"])
df = df.drop(["neu", "neg", "pos"], axis=1)

In [28]:
df.roberta_sentiment.value_counts()

Negative    8296
Positive    3215
Neutral      665
Name: roberta_sentiment, dtype: int64

 Saving the Progress

In [15]:
df.to_csv('Sentiment.csv', index=False) 

## Geocoding API (NOMINATIM): Obtaining Central Latitude and Longitude of Passenger's Country of Origin

In [2]:
df1 = pd.read_csv('Sentiment.csv')
df1.head()

Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,...,Inflight Entertainment,Ground Service,Value For Money,Recommended,Airlines,author,Verified,review_length,roberta_polarity,roberta_sentiment
0,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1.0,2.0,...,1.0,2.0,2.0,no,ryanair,Y Chen,0,108,-0.721132,Negative
1,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2.0,4.0,...,1.0,1.0,3.0,no,ryanair,Diego Perez,0,594,-0.652707,Negative
2,5.0,United Kingdom,03-18-2024,Luton to Faro and return Faro to Luton 10 da...,Couple Leisure,Economy Class,Luton to Faro,March 2024,1.0,3.0,...,3.0,1.0,2.0,no,ryanair,15 reviews\n\n\n\nRichard Hodges,1,211,-0.37136,Negative
3,1.0,Canada,03-16-2024,Very bad airline. I spent 100 pounds check i...,Family Leisure,Economy Class,Edinburgh to Stansted,March 2024,1.0,1.0,...,3.0,1.0,2.0,no,ryanair,G Han,1,84,-0.735222,Negative
4,7.0,United Kingdom,03-10-2024,"When things go wrong with low-cost carriers,...",Solo Leisure,Economy Class,Stansted to Sofia,March 2024,3.0,4.0,...,3.0,3.0,5.0,yes,ryanair,R Vines,1,111,-0.089758,Neutral


In [30]:
!pip install pycountry



### Generating country code based on country name 

In [3]:

import pycountry 
def alpha3code(column):
    CODE=[]
    for country in column:
        try:
            code=pycountry.countries.get(name=country).alpha_2
            CODE.append(code)
        except:
            CODE.append('None')
    return CODE

df1['CODE']=alpha3code(df1.country)
df1.head()

Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,...,Ground Service,Value For Money,Recommended,Airlines,author,Verified,review_length,roberta_polarity,roberta_sentiment,CODE
0,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1.0,2.0,...,2.0,2.0,no,ryanair,Y Chen,0,108,-0.721132,Negative,IT
1,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2.0,4.0,...,1.0,3.0,no,ryanair,Diego Perez,0,594,-0.652707,Negative,ES
2,5.0,United Kingdom,03-18-2024,Luton to Faro and return Faro to Luton 10 da...,Couple Leisure,Economy Class,Luton to Faro,March 2024,1.0,3.0,...,1.0,2.0,no,ryanair,15 reviews\n\n\n\nRichard Hodges,1,211,-0.37136,Negative,GB
3,1.0,Canada,03-16-2024,Very bad airline. I spent 100 pounds check i...,Family Leisure,Economy Class,Edinburgh to Stansted,March 2024,1.0,1.0,...,1.0,2.0,no,ryanair,G Han,1,84,-0.735222,Negative,CA
4,7.0,United Kingdom,03-10-2024,"When things go wrong with low-cost carriers,...",Solo Leisure,Economy Class,Stansted to Sofia,March 2024,3.0,4.0,...,3.0,5.0,yes,ryanair,R Vines,1,111,-0.089758,Neutral,GB


### Using Nominatim to get location 

In [4]:
from geopy.geocoders import Nominatim

# Initialize geocoder
geolocator = Nominatim(user_agent="Data-2")

# Get unique country codes
codes = df1['CODE'].unique()

# Function to geocode a single country code
def geocode_country_code(code):
    location = geolocator.geocode(code)
    if location:
        return pd.Series({'CODE': code, 'Latitude': location.latitude, 'Longitude': location.longitude})
    else:
        return pd.Series({'CODE': code, 'Latitude': None, 'Longitude': None})

# Create DataFrame with latitude and longitude pairs for each country code
geocoded_data = pd.DataFrame([geocode_country_code(code) for code in codes])
geocoded_data.head()

Unnamed: 0,CODE,Latitude,Longitude
0,IT,42.638426,12.674297
1,ES,39.326068,-4.837979
2,GB,54.702354,-3.276575
3,CA,61.066692,-107.991707
4,PL,52.215933,19.134422


In [5]:
# Merge geocoded data with original DataFrame
merged_df = pd.merge(df1, geocoded_data, on='CODE', how='left')

In [11]:
merged_df.to_csv('Final_csv.csv')

In [10]:
merged_df.shape

(12176, 24)

In [12]:
df = pd.read_csv('Final_csv.csv')

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,...,Recommended,Airlines,author,Verified,review_length,roberta_polarity,roberta_sentiment,CODE,Latitude,Longitude
0,0,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1.0,...,no,ryanair,Y Chen,0,108,-0.721132,Negative,IT,42.638426,12.674297
1,1,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2.0,...,no,ryanair,Diego Perez,0,594,-0.652707,Negative,ES,39.326068,-4.837979
2,2,5.0,United Kingdom,03-18-2024,Luton to Faro and return Faro to Luton 10 da...,Couple Leisure,Economy Class,Luton to Faro,March 2024,1.0,...,no,ryanair,15 reviews\n\n\n\nRichard Hodges,1,211,-0.37136,Negative,GB,54.702354,-3.276575
3,3,1.0,Canada,03-16-2024,Very bad airline. I spent 100 pounds check i...,Family Leisure,Economy Class,Edinburgh to Stansted,March 2024,1.0,...,no,ryanair,G Han,1,84,-0.735222,Negative,CA,61.066692,-107.991707
4,4,7.0,United Kingdom,03-10-2024,"When things go wrong with low-cost carriers,...",Solo Leisure,Economy Class,Stansted to Sofia,March 2024,3.0,...,yes,ryanair,R Vines,1,111,-0.089758,Neutral,GB,54.702354,-3.276575


In [None]:
# sentiment , recommended encode 