##### We need to clean the data which we downloaded

In [1]:
#import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sb
import os
import re
import matplotlib.pyplot as plt 

In [2]:
#load data from csv file
df = pd.read_csv("reviews.csv")
df.head()

Unnamed: 0,Reviews,Stars,Date,Country
0,Not Verified | Only the second time flying BA ...,5,14th May 2023,United States
1,✅ Trip Verified | I wasn't going to bother rev...,3,14th May 2023,United Kingdom
2,✅ Trip Verified | I booked business class tic...,3,13th May 2023,United States
3,✅ Trip Verified | I will never travel with Br...,1,8th May 2023,Australia
4,✅ Trip Verified | I am already in Portugal so...,1,6th May 2023,United Kingdom


Create another column whch says the trip is verified or not from the review coulmn

In [3]:
df['Verified'] = df.Reviews.str.contains("Trip Verified")
df['Verified'] 

0       False
1        True
2        True
3        True
4        True
        ...  
2495    False
2496    False
2497    False
2498    False
2499    False
Name: Verified, Length: 2500, dtype: bool

##### cleaning reviews

In [5]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.Reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [6]:
df['corpus'] = corpus

In [7]:
df.head()

Unnamed: 0,Reviews,Stars,Date,Country,Verified,corpus
0,Not Verified | Only the second time flying BA ...,5,14th May 2023,United States,False,verified second time flying ba first time posi...
1,✅ Trip Verified | I wasn't going to bother rev...,3,14th May 2023,United Kingdom,True,going bother reviewing flight seem perpetual d...
2,✅ Trip Verified | I booked business class tic...,3,13th May 2023,United States,True,booked business class ticket fianc reschedule ...
3,✅ Trip Verified | I will never travel with Br...,1,8th May 2023,Australia,True,never travel british airway spent business cla...
4,✅ Trip Verified | I am already in Portugal so...,1,6th May 2023,United Kingdom,True,already portugal contacted today cancelled ret...


#### Cleaning/Format date

In [8]:
df.dtypes

Reviews     object
Stars        int64
Date        object
Country     object
Verified      bool
corpus      object
dtype: object

In [9]:
#convert date to datetime format
df.Date = pd.to_datetime(df.Date)

In [10]:
df.Date.head()

0   2023-05-14
1   2023-05-14
2   2023-05-13
3   2023-05-08
4   2023-05-06
Name: Date, dtype: datetime64[ns]

#### Cleaning the rating with star

In [11]:
df.Stars.unique()

array([ 5,  3,  1,  2, 10,  4,  9,  7,  8,  6], dtype=int64)

In [12]:
df.Stars.value_counts()

1     640
2     310
3     295
8     230
10    190
7     190
5     185
9     178
4     166
6     116
Name: Stars, dtype: int64

#### Check for null Values


In [15]:
df.isnull().value_counts()

Reviews  Stars  Date   Country  Verified  corpus
False    False  False  False    False     False     2500
dtype: int64

In [16]:
df.Country.isnull().value_counts()


False    2500
Name: Country, dtype: int64

In [17]:
df.head()

Unnamed: 0,Reviews,Stars,Date,Country,Verified,corpus
0,Not Verified | Only the second time flying BA ...,5,2023-05-14,United States,False,verified second time flying ba first time posi...
1,✅ Trip Verified | I wasn't going to bother rev...,3,2023-05-14,United Kingdom,True,going bother reviewing flight seem perpetual d...
2,✅ Trip Verified | I booked business class tic...,3,2023-05-13,United States,True,booked business class ticket fianc reschedule ...
3,✅ Trip Verified | I will never travel with Br...,1,2023-05-08,Australia,True,never travel british airway spent business cla...
4,✅ Trip Verified | I am already in Portugal so...,1,2023-05-06,United Kingdom,True,already portugal contacted today cancelled ret...


In [18]:
# cleaned data is ready for visualisation
df.to_csv("reviews_cleaned.csv",index=False)