# Data Cleaning

#### Importing dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re

In [2]:
# Load the scraped reviews and discard the unnamed leading column
# (presumably an index saved with the CSV — verify against the scraper output).
df = pd.read_csv("BA_reviews.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Preview the first five rows to check that the file loaded as expected.
df.head(n=5)

Unnamed: 0,reviews,date,country,ratings
0,✅ Trip Verified | I have come to boarding and...,28th January 2024,Ukraine,3
1,✅ Trip Verified | Stinking nappies being chang...,26th January 2024,United Kingdom,2
2,✅ Trip Verified | Worst service ever. Lost bag...,23rd January 2024,Germany,1
3,✅ Trip Verified | BA 246 21JAN 2023 Did not a...,21st January 2024,United Kingdom,6
4,✅ Trip Verified | Not a great experience. I co...,18th January 2024,United Kingdom,3


In [4]:
# Print column dtypes and non-null counts to spot missing values and wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3742 entries, 0 to 3741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  3742 non-null   object
 1   date     3742 non-null   object
 2   country  3740 non-null   object
 3   ratings  3742 non-null   object
dtypes: object(4)
memory usage: 117.1+ KB


In [5]:
# Count missing values per column.
df.isna().sum()

reviews    0
date       0
country    2
ratings    0
dtype: int64

#### There are 2 missing values in the country column; let's drop those rows

In [6]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [7]:
# Check the (rows, columns) count after dropping rows with missing values.
df.shape

(3740, 4)

In [8]:
# Number of distinct review texts (a missing value, if any, counts as one value).
df['reviews'].nunique(dropna=False)

3731

In [9]:
# Frequency of each raw date string, most common first.
df['date'].value_counts()

19th January 2015     26
20th November 2014    18
28th October 2014     14
11th January 2015     12
22nd October 2014     12
                      ..
28th February 2019     1
2nd March 2019         1
4th March 2019         1
7th March 2019         1
9th October 2011       1
Name: date, Length: 1852, dtype: int64

#### The date column has object dtype; we need to convert it to datetime

In [10]:
# Parse the date strings (e.g. "28th January 2024") into datetime values.
df = df.assign(date=pd.to_datetime(df['date']))

In [11]:
# Confirm that the date column now has a datetime dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3740 entries, 0 to 3741
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   reviews  3740 non-null   object        
 1   date     3740 non-null   datetime64[ns]
 2   country  3740 non-null   object        
 3   ratings  3740 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 146.1+ KB


In [12]:
# Show the most recent and the oldest review date, one per line.
print(df['date'].max(), df['date'].min(), sep="\n")

2024-01-28 00:00:00
2011-10-09 00:00:00


In [13]:
# Frequency of each raw rating value; exposes malformed entries.
df['ratings'].value_counts()

1                                875
2                                427
3                                409
8                                367
10                               325
9                                311
7                                309
4                                250
5                                232
6                                193
\n\t\t\t\t\t\t\t\t\t\t\t\t\t5     37
None                               5
Name: ratings, dtype: int64

#### Removing '\n' and '\t' characters from the ratings column

In [14]:
# Strip surrounding whitespace so scraped values such as "\n\t\t...5" collapse
# into the plain "5" rating. The argument of str.strip is a set of characters,
# so the default (all whitespace) covers both '\n' and '\t'.
# A stray trailing character after the call used to make this cell a SyntaxError.
df['ratings'] = df['ratings'].str.strip()

In [15]:
# Re-check the rating frequencies after stripping whitespace.
df['ratings'].value_counts()

1       875
2       427
3       409
8       367
10      325
9       311
7       309
5       269
4       250
6       193
None      5
Name: ratings, dtype: int64

#### Removing "None" values from the ratings column

In [19]:
# Drop the rows whose rating is the literal string "None" and rebind df to the result.
df = df.drop(index=df.index[df['ratings'] == "None"])

In [22]:
# Check the (rows, columns) count after removing the "None" ratings.
df.shape

(3735, 4)

In [23]:
# Number of reviews per reviewer country, most common first.
df['country'].value_counts()

United Kingdom           2352
United States             417
Australia                 159
Canada                    116
Germany                    66
                         ... 
Costa Rica                  1
Cayman Islands              1
Panama                      1
Saint Kitts and Nevis       1
Oman                        1
Name: country, Length: 72, dtype: int64

### Cleaning reviews for further analysis

In [26]:
# Utilizing the NLTK library for text pre-processing
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Lemmatizer instance reused for every word
lemma = WordNetLemmatizer()

# Build the stop-word set once; rebuilding it for every word is loop-invariant
# work that only slows the cell down.
english_stopwords = set(stopwords.words("english"))

# Remove the verification marker as a whole *prefix*. str.strip("✅ Trip Verified |")
# treats its argument as a set of characters and removes any of them from both
# ends of the text, which also eats real review content
# (e.g. a review starting "Terrible ..." became "ble ...").
# NOTE(review): only the "Trip Verified" marker visible in df.head() is handled;
# other marker variants, if the data has any, are left in the text.
verified_prefix = r"^\s*(?:✅\s*)?Trip Verified\s*\|\s*"
reviews_data = df['reviews'].str.replace(verified_prefix, "", regex=True).str.strip()

# Cleaned reviews, in the same order as the rows of df
corpus = []

# For each review: replace non-letters with spaces, lower-case, drop stop words,
# lemmatize the remaining words and join them back into one string.
for review in reviews_data:
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    words = [lemma.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(" ".join(words))

In [27]:
# Attach the cleaned text as a new column; corpus is in the same row order as df.
df = df.assign(corpus=corpus)

In [28]:
# Display the frame to compare the raw reviews with the new corpus column.
df

Unnamed: 0,reviews,date,country,ratings,corpus
0,✅ Trip Verified | I have come to boarding and...,2024-01-28,Ukraine,3,come boarding cabin luggage taken plane full a...
1,✅ Trip Verified | Stinking nappies being chang...,2024-01-26,United Kingdom,2,stinking nappy changed business cabin througho...
2,✅ Trip Verified | Worst service ever. Lost bag...,2024-01-23,Germany,1,worst service ever lost baggage delayed flight...
3,✅ Trip Verified | BA 246 21JAN 2023 Did not a...,2024-01-21,United Kingdom,6,ba jan appreciate unprofessional attitude pilo...
4,✅ Trip Verified | Not a great experience. I co...,2024-01-18,United Kingdom,3,great experience could check online two separa...
...,...,...,...,...,...
3737,Flew LHR - VIE return operated by bmi but BA a...,2012-08-29,United Kingdom,8,flew lhr vie return operated bmi ba aircraft a...
3738,LHR to HAM. Purser addresses all club passenge...,2012-08-28,United Kingdom,2,lhr ham purser address club passenger name boa...
3739,My son who had worked for British Airways urge...,2011-10-12,United Kingdom,7,son worked british airway urged fly british ai...
3740,London City-New York JFK via Shannon on A318 b...,2011-10-11,United States,1,london city new york jfk via shannon really ni...


In [30]:
# Final check: no column should contain missing values before saving.
df.isna().sum()

reviews    0
date       0
country    0
ratings    0
corpus     0
dtype: int64

In [31]:
# Persist the cleaned data. The index is written too (the pandas default), so
# the file gains an unnamed leading column when it is read back.
df.to_csv("Cleaned_BA_reviews.csv", index=True)