## Airline Review Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned airline reviews and preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# written out by an earlier to_csv call — confirm before relying on or dropping it.
df = pd.read_csv("cleaned_data.csv")
df.head(n=5)

Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value For Money,Recommended,Airlines,author,Verified,review_length
0,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,Manchester to Milan,March 2024,1.0,2.0,2.0,1.0,2.0,2.0,no,ryanair,Y Chen,0,108
1,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,Marrakech to Sevilla,March 2024,2.0,4.0,2.0,1.0,1.0,3.0,no,ryanair,Diego Perez,0,594
2,5.0,United Kingdom,03-18-2024,Luton to Faro and return Faro to Luton 10 da...,Couple Leisure,Economy Class,Luton to Faro,March 2024,1.0,3.0,2.0,3.0,1.0,2.0,no,ryanair,15 reviews\n\n\n\nRichard Hodges,1,211
3,1.0,Canada,03-16-2024,Very bad airline. I spent 100 pounds check i...,Family Leisure,Economy Class,Edinburgh to Stansted,March 2024,1.0,1.0,2.0,3.0,1.0,2.0,no,ryanair,G Han,1,84
4,7.0,United Kingdom,03-10-2024,"When things go wrong with low-cost carriers,...",Solo Leisure,Economy Class,Stansted to Sofia,March 2024,3.0,4.0,2.0,3.0,3.0,5.0,yes,ryanair,R Vines,1,111


### Sentiment Analysis Using RoBERTa

In [12]:
# Checkpoint identifier shared by every from_pretrained call below.
model_path = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the configuration, the tokenizer and the sequence-classification model
# for that checkpoint.
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
roberta_model = AutoModelForSequenceClassification.from_pretrained(model_path)

def sentiment_analyzer(text, model):
    """Return rounded sentiment probabilities for one piece of text.

    The text is encoded with the notebook-level ``tokenizer`` (truncated to at
    most 512 tokens), passed through ``model``, and the first row of the first
    output element is converted to probabilities with softmax and rounded to
    two decimals.

    Parameters
    ----------
    text : str
        The text to score.
    model : callable
        Invoked as ``model(**encoded_input)``. Element ``[0][0]`` of its output
        must be a tensor holding at least three scores.

    Returns
    -------
    dict
        ``{"neg": ..., "neu": ..., "pos": ...}`` built from score positions
        0, 1 and 2 respectively.
        NOTE(review): this assumes the checkpoint orders its labels as
        negative, neutral, positive — confirm against the model card.
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # Inference only: disable autograd so no gradient graph is recorded for
    # each of the many reviews this function is applied to.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = np.round(softmax(scores), 2)
    scores_dict = {"neg": scores[0], "neu": scores[1], "pos": scores[2]}
    return scores_dict

In [13]:
# Score every review; each entry of the new column is the dict returned by
# sentiment_analyzer for that review.
df["probabilities"] = df["review"].apply(
    lambda review_text: sentiment_analyzer(review_text, model=roberta_model)
)

In [14]:
# Expand the per-review score dicts into "neg" / "neu" / "pos" columns and
# replace the dict column with them.
probabilities = df["probabilities"].apply(pd.Series)
df = df.join(probabilities).drop(columns="probabilities")

# Polarity per review = (-1 * neg) + (0 * neu) + (1 * pos), squashed with tanh.
polarity_weights = torch.tensor([-1, 0, 1])
probs = torch.tensor(df[["neg", "neu", "pos"]].values)
polarity = (polarity_weights * probs).sum(dim=-1)
polarity_scaled = torch.tanh(polarity)

# Store the squashed polarity and bucket it into three sentiment labels:
# (-1.0, -0.25] -> Negative, (-0.25, 0.25] -> Neutral, (0.25, 1.0] -> Positive.
df["roberta_polarity"] = polarity_scaled.numpy()
df["roberta_sentiment"] = pd.cut(
    df["roberta_polarity"],
    bins=[-1.0, -0.25, 0.25, 1.0],
    labels=["Negative", "Neutral", "Positive"],
)

# The raw probability columns are no longer needed once the labels exist.
df = df.drop(columns=["neu", "neg", "pos"])

In [28]:
# Number of reviews assigned to each sentiment label.
df["roberta_sentiment"].value_counts()

Negative    8296
Positive    3215
Neutral      665
Name: roberta_sentiment, dtype: int64

### Saving the Progress

In [15]:
# Write the labelled reviews to disk without the DataFrame index.
df.to_csv("Final_csv.csv", index=False)