In [3]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [4]:
# load the raw reviews, using the first CSV column as the DataFrame index
raw_reviews_path = "./data/reviews.csv"
df = pd.read_csv(raw_reviews_path, index_col=0)

# preview the first rows
df.head()

Unnamed: 0,review,stars,date,country
0,✅ Trip Verified | I was flying to Warsaw for ...,1.0,2022-12-03,United States
1,✅ Trip Verified | Booked a BA holiday to Marr...,9.0,2022-11-30,United Kingdom
2,✅ Trip Verified | Extremely sub-par service. H...,2.0,2022-11-28,United States
3,✅ Trip Verified | I virtually gave up on Brit...,7.0,2022-11-26,United Kingdom
4,✅ Trip Verified | I was pleasantly surprised ...,7.0,2022-11-25,Canada


## Data Cleaning

In [5]:
# character count of every review
# NOTE(review): the length is taken from the raw text, so it also counts any
# verification marker that is stripped from the review further down
df["review_length"] = df["review"].map(len)

In [6]:
# flag each review whose raw text contains the "Trip Verified" marker
df["verified"] = ["Trip Verified" in text for text in df["review"]]

In [7]:
# work on an independent (deep) copy so the loaded frame `df` stays untouched
df_clean = df.copy(deep=True)

# preview the first 5 rows
df_clean.head(5)

Unnamed: 0,review,stars,date,country,review_length,verified
0,✅ Trip Verified | I was flying to Warsaw for ...,1.0,2022-12-03,United States,1387,True
1,✅ Trip Verified | Booked a BA holiday to Marr...,9.0,2022-11-30,United Kingdom,487,True
2,✅ Trip Verified | Extremely sub-par service. H...,2.0,2022-11-28,United States,798,True
3,✅ Trip Verified | I virtually gave up on Brit...,7.0,2022-11-26,United Kingdom,473,True
4,✅ Trip Verified | I was pleasantly surprised ...,7.0,2022-11-25,Canada,655,True


In [8]:
def strip_verification_markers(text):
    """Return `text` with the verified / not-verified markers removed."""
    without_verified = text.replace("✅ Trip Verified | ", "")
    return without_verified.replace("Not Verified | ", "")


# remove both markers from every review in a single pass
df_clean["review"] = df_clean["review"].map(strip_verification_markers)

# check first 5 rows
df_clean.head()

Unnamed: 0,review,stars,date,country,review_length,verified
0,I was flying to Warsaw for one day of meeting...,1.0,2022-12-03,United States,1387,True
1,"Booked a BA holiday to Marrakech, after posti...",9.0,2022-11-30,United Kingdom,487,True
2,Extremely sub-par service. Highlights: No onli...,2.0,2022-11-28,United States,798,True
3,I virtually gave up on British Airways about ...,7.0,2022-11-26,United Kingdom,473,True
4,I was pleasantly surprised that the airline c...,7.0,2022-11-25,Canada,655,True


In [9]:
# NOTE: the NLTK "stopwords" and "wordnet" corpora must be available locally
# for this cell to run
lemma = WordNetLemmatizer()

# build the stop-word set ONCE; the previous version rebuilt it for every
# single token of every review, which is loop-invariant work
english_stopwords = set(stopwords.words("english"))

# anything that is not an ASCII letter is turned into a space
non_letter_pattern = re.compile('[^a-zA-Z]')


def clean_review_text(review):
    """Normalise one review into a space-joined string of lemmatized tokens.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, lemmatize the
    remaining tokens and join them back together with single spaces.

    Parameters
    ----------
    review : str
        Review text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    tokens = non_letter_pattern.sub(' ', review).lower().split()
    return " ".join(
        lemma.lemmatize(token) for token in tokens if token not in english_stopwords
    )


# cleaned data corpus, one entry per review, in the same order as df_clean
corpus = [clean_review_text(rev) for rev in df_clean['review']]

In [10]:
# attach the cleaned corpus as a new "corpus" column (one entry per review)
df_clean = df_clean.assign(corpus=corpus)

# preview the first 5 rows
df_clean.head(5)

Unnamed: 0,review,stars,date,country,review_length,verified,corpus
0,I was flying to Warsaw for one day of meeting...,1.0,2022-12-03,United States,1387,True,flying warsaw one day meeting would flying bac...
1,"Booked a BA holiday to Marrakech, after posti...",9.0,2022-11-30,United Kingdom,487,True,booked ba holiday marrakech posting negative r...
2,Extremely sub-par service. Highlights: No onli...,2.0,2022-11-28,United States,798,True,extremely sub par service highlight online mea...
3,I virtually gave up on British Airways about ...,7.0,2022-11-26,United Kingdom,473,True,virtually gave british airway three year ago w...
4,I was pleasantly surprised that the airline c...,7.0,2022-11-25,Canada,655,True,pleasantly surprised airline could maintain st...


In [11]:
# check the dtype of every column to spot any that still need conversion
# (e.g. a date column that is still stored as plain objects)
df_clean.dtypes

review            object
stars            float64
date              object
country           object
review_length      int64
verified            bool
corpus            object
dtype: object

In [12]:
# parse the "date" column into pandas datetimes
df_clean = df_clean.assign(date=pd.to_datetime(df_clean["date"]))

In [13]:
# count the missing values in each column
df_clean.isna().sum()

review           0
stars            0
date             0
country          0
review_length    0
verified         0
corpus           0
dtype: int64

In [16]:
# write the cleaned frame (index included) to disk
clean_reviews_path = "./data/reviews_clean.csv"
df_clean.to_csv(clean_reviews_path, index=True)