In [60]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [61]:
cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [62]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | The plane was extremely dir...,5.0,15th January 2024,Ireland
1,Not Verified | Overall journey wasn’t bad howe...,1.0,12th January 2024,United Kingdom
2,✅ Trip Verified | Overall very satisfied. Gro...,4.0,12th January 2024,United Kingdom
3,✅ Trip Verified | As always when I fly BA it ...,9.0,9th January 2024,Spain
4,✅ Trip Verified | First time using BA busines...,1.0,7th January 2024,United Kingdom


In [63]:
df.tail()

Unnamed: 0,reviews,stars,date,country
3729,YYZ to LHR - July 2012 - I flew overnight in p...,8.0,29th August 2012,Canada
3730,LHR to HAM. Purser addresses all club passenge...,2.0,28th August 2012,United Kingdom
3731,My son who had worked for British Airways urge...,7.0,12th October 2011,United Kingdom
3732,London City-New York JFK via Shannon on A318 b...,1.0,11th October 2011,United States
3733,SIN-LHR BA12 B747-436 First Class. Old aircraf...,10.0,9th October 2011,United Kingdom


In [64]:
df['IsVerified'] = df.reviews.str.contains("Trip Verified")

In [65]:
df

Unnamed: 0,reviews,stars,date,country,IsVerified
0,✅ Trip Verified | The plane was extremely dir...,5.0,15th January 2024,Ireland,True
1,Not Verified | Overall journey wasn’t bad howe...,1.0,12th January 2024,United Kingdom,False
2,✅ Trip Verified | Overall very satisfied. Gro...,4.0,12th January 2024,United Kingdom,True
3,✅ Trip Verified | As always when I fly BA it ...,9.0,9th January 2024,Spain,True
4,✅ Trip Verified | First time using BA busines...,1.0,7th January 2024,United Kingdom,True
...,...,...,...,...,...
3729,YYZ to LHR - July 2012 - I flew overnight in p...,8.0,29th August 2012,Canada,False
3730,LHR to HAM. Purser addresses all club passenge...,2.0,28th August 2012,United Kingdom,False
3731,My son who had worked for British Airways urge...,7.0,12th October 2011,United Kingdom,False
3732,London City-New York JFK via Shannon on A318 b...,1.0,11th October 2011,United States,False


### Removing prefix

In [66]:
prefixes_to_remove = ["✅ Trip Verified |", "Not Verified |"]

reviews_data = df.reviews.copy()

for prefix in prefixes_to_remove:
    reviews_data = reviews_data.str.replace(prefix, "")
    df.reviews = reviews_data

In [67]:
df.head()

Unnamed: 0,reviews,stars,date,country,IsVerified
0,The plane was extremely dirty with chocolate...,5.0,15th January 2024,Ireland,True
1,Overall journey wasn’t bad however at the end...,1.0,12th January 2024,United Kingdom,False
2,Overall very satisfied. Ground staff member ...,4.0,12th January 2024,United Kingdom,True
3,As always when I fly BA it was a total shamb...,9.0,9th January 2024,Spain,True
4,First time using BA business class but we we...,1.0,7th January 2024,United Kingdom,True


In [68]:
df

Unnamed: 0,reviews,stars,date,country,IsVerified
0,The plane was extremely dirty with chocolate...,5.0,15th January 2024,Ireland,True
1,Overall journey wasn’t bad however at the end...,1.0,12th January 2024,United Kingdom,False
2,Overall very satisfied. Ground staff member ...,4.0,12th January 2024,United Kingdom,True
3,As always when I fly BA it was a total shamb...,9.0,9th January 2024,Spain,True
4,First time using BA business class but we we...,1.0,7th January 2024,United Kingdom,True
...,...,...,...,...,...
3729,YYZ to LHR - July 2012 - I flew overnight in p...,8.0,29th August 2012,Canada,False
3730,LHR to HAM. Purser addresses all club passenge...,2.0,28th August 2012,United Kingdom,False
3731,My son who had worked for British Airways urge...,7.0,12th October 2011,United Kingdom,False
3732,London City-New York JFK via Shannon on A318 b...,1.0,11th October 2011,United States,False


In [69]:
df.dtypes

reviews        object
stars         float64
date           object
country        object
IsVerified       bool
dtype: object

### Converting Date to Datetime format

In [70]:
from dateutil import parser

df['date'] = df['date'].apply(lambda x: parser.parse(x))


In [71]:
df.date

0      2024-01-15
1      2024-01-12
2      2024-01-12
3      2024-01-09
4      2024-01-07
          ...    
3729   2012-08-29
3730   2012-08-28
3731   2011-10-12
3732   2011-10-11
3733   2011-10-09
Name: date, Length: 3734, dtype: datetime64[ns]

In [72]:
df.isnull().value_counts()

reviews  stars  date   country  IsVerified
False    False  False  False    False         3727
         True   False  False    False            5
         False  False  True     False            2
Name: count, dtype: int64

In [42]:
df.country.isnull().sum()

2

In [43]:
df.stars.isnull().sum()

5

In [44]:
df[df.stars.isnull()]

Unnamed: 0,reviews,stars,date,country,IsVerified
3186,Travelled Business A380 LHR-HK and returned on...,,2015-01-26,United Kingdom,False
3321,First time with BA (a code share flight for JA...,,2014-11-20,Australia,False
3340,BA 83 from LHR to LAX first time on Airbus A38...,,2014-11-12,United States,False
3579,BA026 7th July Seat 66B. Arrived HK off a conn...,,2014-07-08,United Kingdom,False
3606,Flew from LHR to Hong Kong April 13th 2014 BA ...,,2014-06-25,United Kingdom,False


### Removing rows with Null values

In [73]:
df.drop(df[(df.country.isnull() == True)].index, axis=0, inplace=True)

In [74]:
df.drop(df[(df.stars.isnull() == True)].index, axis=0, inplace=True)

In [75]:
df.isnull().value_counts()

reviews  stars  date   country  IsVerified
False    False  False  False    False         3727
Name: count, dtype: int64

In [76]:
df

Unnamed: 0,reviews,stars,date,country,IsVerified
0,The plane was extremely dirty with chocolate...,5.0,2024-01-15,Ireland,True
1,Overall journey wasn’t bad however at the end...,1.0,2024-01-12,United Kingdom,False
2,Overall very satisfied. Ground staff member ...,4.0,2024-01-12,United Kingdom,True
3,As always when I fly BA it was a total shamb...,9.0,2024-01-09,Spain,True
4,First time using BA business class but we we...,1.0,2024-01-07,United Kingdom,True
...,...,...,...,...,...
3729,YYZ to LHR - July 2012 - I flew overnight in p...,8.0,2012-08-29,Canada,False
3730,LHR to HAM. Purser addresses all club passenge...,2.0,2012-08-28,United Kingdom,False
3731,My son who had worked for British Airways urge...,7.0,2011-10-12,United Kingdom,False
3732,London City-New York JFK via Shannon on A318 b...,1.0,2011-10-11,United States,False


### Removing special characters

In [77]:
import re

# Define a function to clean the text
def clean(text):
# Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', str(text))
    return text

# Cleaning the text in the review column
df['Cleaned Reviews'] = df['reviews'].apply(clean)
df.head()

Unnamed: 0,reviews,stars,date,country,IsVerified,Cleaned Reviews
0,The plane was extremely dirty with chocolate...,5.0,2024-01-15,Ireland,True,The plane was extremely dirty with chocolate ...
1,Overall journey wasn’t bad however at the end...,1.0,2024-01-12,United Kingdom,False,Overall journey wasn t bad however at the end...
2,Overall very satisfied. Ground staff member ...,4.0,2024-01-12,United Kingdom,True,Overall very satisfied Ground staff member at...
3,As always when I fly BA it was a total shamb...,9.0,2024-01-09,Spain,True,As always when I fly BA it was a total shambl...
4,First time using BA business class but we we...,1.0,2024-01-07,United Kingdom,True,First time using BA business class but we wer...


In [78]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Removing stopwords

In [80]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['withoutstopwords'] = df['Cleaned Reviews'].apply(remove_stopwords)

In [81]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)
    
df['lemmatized_column'] = df['withoutstopwords'].apply(lemmatize_text)

In [82]:
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', word)[0][0]
    return word

def stemmer(phrase):
    stemmed_phrase = [stem(word) for word in phrase]
    return stemmed_phrase

# Apply stemmer to DataFrame column
df['stemmed_text'] = df['lemmatized_column'].apply(lambda x: stemmer(x.split()))
df['stemmed_sentence'] = df['stemmed_text'].apply(lambda x: ' '.join(x))

In [83]:
df.head()

Unnamed: 0,reviews,stars,date,country,IsVerified,Cleaned Reviews,withoutstopwords,lemmatized_column,stemmed_text,stemmed_sentence
0,The plane was extremely dirty with chocolate...,5.0,2024-01-15,Ireland,True,The plane was extremely dirty with chocolate ...,plane extremely dirty chocolate smudged mine c...,plane extremely dirty chocolate smudged mine c...,"[plane, extreme, dirty, chocolate, smudg, mine...",plane extreme dirty chocolate smudg mine child...
1,Overall journey wasn’t bad however at the end...,1.0,2024-01-12,United Kingdom,False,Overall journey wasn t bad however at the end...,Overall journey bad however end baggage arriva...,Overall journey bad however end baggage arriva...,"[Overall, journey, bad, however, end, baggage,...",Overall journey bad however end baggage arriva...
2,Overall very satisfied. Ground staff member ...,4.0,2024-01-12,United Kingdom,True,Overall very satisfied Ground staff member at...,Overall satisfied Ground staff member YVR extr...,Overall satisfied Ground staff member YVR extr...,"[Overall, satisfi, Ground, staff, member, YVR,...",Overall satisfi Ground staff member YVR extrem...
3,As always when I fly BA it was a total shamb...,9.0,2024-01-09,Spain,True,As always when I fly BA it was a total shambl...,always fly BA total shambles booked Manchester...,always fly BA total shamble booked Manchester ...,"[alway, f, BA, total, shamble, book, Mancheste...",alway f BA total shamble book Manchester Londo...
4,First time using BA business class but we we...,1.0,2024-01-07,United Kingdom,True,First time using BA business class but we wer...,First time using BA business class pleased ser...,First time using BA business class pleased ser...,"[First, time, us, BA, busines, clas, pleas, se...",First time us BA busines clas pleas service re...


In [84]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")