In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import os

#Regex 
import re         #can be used to work with Regular Expressions.    #check if a string contains the specified search pattern

In [6]:
cwd = os.getcwd()

df=pd.read_csv(cwd+ "/BA_reviews.csv" , index_col=0)

In [7]:
df.head(5)

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Had to cancel my flight month...,5.0,1st July 2024,Canada
1,✅ Trip Verified | Flight cancelled with no rea...,1.0,30th June 2024,United Kingdom
2,✅ Trip Verified | This is a route I fly regula...,1.0,26th June 2024,United Kingdom
3,✅ Trip Verified | While BA may have made some...,6.0,23rd June 2024,Canada
4,✅ Trip Verified | British Airways new Club Sui...,3.0,23rd June 2024,Canada


In [13]:
#checking the null values
df['reviews'].isnull().sum() , df['stars'].isnull().sum() , df['date'].isnull().sum() , df['country'].isnull().sum()

(0, 3, 0, 2)

We will create a column which mentions if the user is verified or not.

In [16]:
df['verified']= df.reviews.str.contains(' Trip Verified ')
df.head(5)

Unnamed: 0,reviews,stars,date,country,verified
0,Not Verified | Had to cancel my flight month...,5.0,1st July 2024,Canada,False
1,✅ Trip Verified | Flight cancelled with no rea...,1.0,30th June 2024,United Kingdom,True
2,✅ Trip Verified | This is a route I fly regula...,1.0,26th June 2024,United Kingdom,True
3,✅ Trip Verified | While BA may have made some...,6.0,23rd June 2024,Canada,True
4,✅ Trip Verified | British Airways new Club Sui...,3.0,23rd June 2024,Canada,True


#### CLEANING REVIEWS

We will extract the review column into a separate dataframe and clean it for semantic/text analysis

In [27]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer #It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words.
from nltk.corpus import stopwords     #['stop', 'the', 'to', 'and', 'a', 'in', 'it', 'is', 'I', 'that', 'had', 'on', 'for', 'were', 'was'] words like this are removed, which just causes the extra storage.

lemma = WordNetLemmatizer()

#create an empty list to collect cleaned data corpus
corpus =[]

reviews_data= df.reviews.str.strip('✅ Trip Verified | ')         #removing trip verified

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev= re.sub('^a-zA-Z' , ' ' , rev ) #[^a-zA-Z] means any character that IS NOT a-z OR A-Z
                                        #re.sub() function replaces one or many matches with a string in the given text.
    rev= rev.lower()
    rev=rev.split()        #splited in seperate seperate words in " "
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev=" ".join(rev)
    corpus.append(rev)

In [31]:
df['corpus']=corpus
df.head(6)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Had to cancel my flight month...,5.0,1st July 2024,Canada,False,verified | cancel flight month advance due cha...
1,✅ Trip Verified | Flight cancelled with no rea...,1.0,30th June 2024,United Kingdom,True,flight cancelled reason given le 24 h departur...
2,✅ Trip Verified | This is a route I fly regula...,1.0,26th June 2024,United Kingdom,True,route fly regularly. used first class security...
3,✅ Trip Verified | While BA may have made some...,6.0,23rd June 2024,Canada,True,ba may made positive improvement club world pr...
4,✅ Trip Verified | British Airways new Club Sui...,3.0,23rd June 2024,Canada,True,british airway new club suite marked improveme...
5,"✅ Trip Verified | Four very pleasant, on time...",5.0,18th June 2024,United Kingdom,True,"four pleasant, time flight friendly, helpful s..."


Cleaning\ changing the datatype so to avoid the decimal value in stars

In [33]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [38]:
df['stars'].mode()[0]

1.0

In [36]:
df['stars'].fillna(df['stars'].mode()[0], inplace=True)      #filled the null values with mode

In [46]:
df['stars'].isnull().sum() 

0

In [53]:
df['stars'].unique()

array([ 5,  1,  6,  3,  9,  2,  8,  7,  4, 10], dtype=int64)

In [44]:
df['stars']= df['stars'].astype('int64' )    #changed the datatype

In [45]:
df.head(5)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Had to cancel my flight month...,5,1st July 2024,Canada,False,verified | cancel flight month advance due cha...
1,✅ Trip Verified | Flight cancelled with no rea...,1,30th June 2024,United Kingdom,True,flight cancelled reason given le 24 h departur...
2,✅ Trip Verified | This is a route I fly regula...,1,26th June 2024,United Kingdom,True,route fly regularly. used first class security...
3,✅ Trip Verified | While BA may have made some...,6,23rd June 2024,Canada,True,ba may made positive improvement club world pr...
4,✅ Trip Verified | British Airways new Club Sui...,3,23rd June 2024,Canada,True,british airway new club suite marked improveme...


Cleaning/Fromat date 

In [51]:
df.date = pd.to_datetime(df.date,format='mixed')
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Had to cancel my flight month...,5,2024-07-01,Canada,False,verified | cancel flight month advance due cha...
1,✅ Trip Verified | Flight cancelled with no rea...,1,2024-06-30,United Kingdom,True,flight cancelled reason given le 24 h departur...
2,✅ Trip Verified | This is a route I fly regula...,1,2024-06-26,United Kingdom,True,route fly regularly. used first class security...
3,✅ Trip Verified | While BA may have made some...,6,2024-06-23,Canada,True,ba may made positive improvement club world pr...
4,✅ Trip Verified | British Airways new Club Sui...,3,2024-06-23,Canada,True,british airway new club suite marked improveme...


CLEANING/COUNTRY

In [55]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     2
verified    0
corpus      0
dtype: int64

In [56]:
#we can drop those two null values , means we can drop those two rows!

In [57]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [58]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     0
verified    0
corpus      0
dtype: int64

In [59]:
df.shape

(3498, 6)

In [62]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Had to cancel my flight month...,5,2024-07-01,Canada,False,verified | cancel flight month advance due cha...
1,✅ Trip Verified | Flight cancelled with no rea...,1,2024-06-30,United Kingdom,True,flight cancelled reason given le 24 h departur...
2,✅ Trip Verified | This is a route I fly regula...,1,2024-06-26,United Kingdom,True,route fly regularly. used first class security...
3,✅ Trip Verified | While BA may have made some...,6,2024-06-23,Canada,True,ba may made positive improvement club world pr...
4,✅ Trip Verified | British Airways new Club Sui...,3,2024-06-23,Canada,True,british airway new club suite marked improveme...
...,...,...,...,...,...,...
3493,We flew BA from London to Dulles DC 10/8/2014....,1,2014-10-12,United States,False,flew ba london dulles dc 10/8/2014. unfortunat...
3494,B787. Just returned from a London - Toronto - ...,9,2014-10-12,United Kingdom,False,b787. returned london - toronto - london fligh...
3495,I travelled to Hong Kong with British Airways ...,1,2014-10-12,United Kingdom,False,travelled hong kong british airway heathrow 77...
3496,LGW to SZG. Absolutely useless for a scheduled...,4,2014-10-08,United Kingdom,False,lgw szg. absolutely useless scheduled service ...


Export the cleaned data

In [64]:
df.to_csv(cwd + '\cleaned_reviews.csv')