# DATA CLEANING

## Importing libraries

In [1]:
import pandas as pd
import os
# Importing regex
import re

In [2]:
cwd = os.getcwd()
df = pd.read_csv(cwd+"/data/BA_reviews.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | A nightmare journey courtesy o...,5.0,8th September 2024,United Kingdom
1,✅ Trip Verified | Absolutely atrocious. LHR-OR...,1.0,6th September 2024,United Kingdom
2,✅ Trip Verified | As someone who flies relentl...,1.0,2nd September 2024,United Kingdom
3,✅ Trip Verified | Flew with British Airways ...,4.0,1st September 2024,United Kingdom
4,✅ Trip Verified | Straightforward check in T...,2.0,30th August 2024,United Kingdom


In [4]:
df["verified"] = df.reviews.str.contains("Trip Verified")

In [5]:
df["verified"]

0       False
1        True
2        True
3        True
4        True
        ...  
4395     True
4396     True
4397     True
4398    False
4399    False
Name: verified, Length: 4400, dtype: bool

## Clening reviews

#### Clean reviews for semantic analysis

In [6]:
# lemmatization of words
from nltk.stem  import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅  Trip verified")

#create an empty list to collect cleaned data corpus

corpus = []

# Make it lowercase and remove punctutation
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [7]:
df['corpus'] = corpus

In [8]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | A nightmare journey courtesy o...,5.0,8th September 2024,United Kingdom,False,verified nightmare journey courtesy british ai...
1,✅ Trip Verified | Absolutely atrocious. LHR-OR...,1.0,6th September 2024,United Kingdom,True,verified absolutely atrocious lhr ord lhr roun...
2,✅ Trip Verified | As someone who flies relentl...,1.0,2nd September 2024,United Kingdom,True,verified someone fly relentlessly british airw...
3,✅ Trip Verified | Flew with British Airways ...,4.0,1st September 2024,United Kingdom,True,verified flew british airway club europe satur...
4,✅ Trip Verified | Straightforward check in T...,2.0,30th August 2024,United Kingdom,True,verified straightforward check new site club c...


## Cleaning/Format date

In [9]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [10]:
def remove_ordinal_suffixes(date_str):
    # Remove 'st', 'nd', 'rd', 'th'
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

# Apply the function to your 'date' column
df['date'] = df['date'].apply(remove_ordinal_suffixes)

# Now convert to datetime
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')

# Verify the result
print(df['date'])

0      2024-09-08
1      2024-09-06
2      2024-09-02
3      2024-09-01
4      2024-08-30
          ...    
4395   2019-10-09
4396   2019-10-09
4397   2019-10-09
4398   2019-10-08
4399   2019-10-08
Name: date, Length: 4400, dtype: datetime64[ns]


## Cleaning ratings with stars

In [11]:
df.stars.unique()

array([ 5.,  1.,  4.,  2.,  8.,  9., 10.,  3.,  6.,  7., nan])

In [12]:
# Ensure the column is of string type
df['stars'] = df['stars'].astype(str)

# Remove leading and trailing whitespaces including \n and \t
df['stars'] = df['stars'].str.strip()

# Verify the result
print(df['stars'])

0        5.0
1        1.0
2        1.0
3        4.0
4        2.0
        ... 
4395     1.0
4396     1.0
4397    10.0
4398     2.0
4399     2.0
Name: stars, Length: 4400, dtype: object


In [13]:
df.stars.value_counts()

stars
1.0     1207
2.0      506
3.0      490
8.0      411
10.0     349
9.0      339
7.0      310
5.0      290
4.0      290
6.0      205
nan        3
Name: count, dtype: int64

In [14]:
# drop the rows where the value of ratings is NaN
df.drop(df[df.stars == "nan"].index, axis=0, inplace=True)

In [15]:
#check the unique values again
df.stars.unique()

array(['5.0', '1.0', '4.0', '2.0', '8.0', '9.0', '10.0', '3.0', '6.0',
       '7.0'], dtype=object)

## Check for NULL values

In [16]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     4396
                       True     False     False        1
Name: count, dtype: int64

In [17]:
df.country.value_counts()

country
United Kingdom        2670
United States          572
Australia              167
Canada                 146
Germany                 82
                      ... 
Egypt                    1
Laos                     1
Indonesia                1
Bahrain                  1
Dominican Republic       1
Name: count, Length: 70, dtype: int64

In [18]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [19]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     4396
Name: count, dtype: int64

In [20]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | A nightmare journey courtesy o...,5.0,2024-09-08,United Kingdom,False,verified nightmare journey courtesy british ai...
1,✅ Trip Verified | Absolutely atrocious. LHR-OR...,1.0,2024-09-06,United Kingdom,True,verified absolutely atrocious lhr ord lhr roun...
2,✅ Trip Verified | As someone who flies relentl...,1.0,2024-09-02,United Kingdom,True,verified someone fly relentlessly british airw...
3,✅ Trip Verified | Flew with British Airways ...,4.0,2024-09-01,United Kingdom,True,verified flew british airway club europe satur...
4,✅ Trip Verified | Straightforward check in T...,2.0,2024-08-30,United Kingdom,True,verified straightforward check new site club c...
...,...,...,...,...,...,...
4391,✅ Trip Verified | Istanbul to New York via Lon...,1.0,2019-10-09,United States,True,verified istanbul new york via london literall...
4392,✅ Trip Verified | Yesterday my husband and I ...,1.0,2019-10-09,United Kingdom,True,verified yesterday husband decided come home r...
4393,✅ Trip Verified | Flight from London to Athens...,10.0,2019-10-09,United Kingdom,True,verified flight london athens british airway o...
4394,Not Verified | Short 45 minute flight north a...,2.0,2019-10-08,United Kingdom,False,verified short minute flight north south briti...


In [21]:
# export the cleaned data

df.to_csv(cwd + "/data/cleaned-BA-reviews.csv")