In [1]:
# Mount Google Drive at /content/drive so the CSV files used below can be
# read and written through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

# Path of the raw British Airways review export on the mounted Drive.
reviews_csv_path = '/content/drive/MyDrive/Internship/Sem-5/TASK-1/BA_reviews.csv'

# Read the raw reviews into the working frame and dump its text representation.
data = pd.read_csv(reviews_csv_path)
print(data)

                                                reviews  \
0     ✅ Trip Verified |  As always when I fly BA it ...   
1     ✅ Trip Verified |  First time using BA busines...   
2     Not Verified |  Extremely rude ground service....   
3     ✅ Trip Verified |  My son and I flew to Geneva...   
4     ✅ Trip Verified |  For the price paid (bought ...   
...                                                 ...   
3413  Travelled from SYD to London return. Poor serv...   
3414  VIE to LHR - Seats were of the new configurati...   
3415  NCL-LHR-LAX / LAS-LHR-NCL. NCL-LHR A320. Comfo...   
3416  Traveled to Rome with British Airways. On the ...   
3417  BA26 22/6/2014. During my trip to the UK I fee...   

                              stars                date         country  
0     \n\t\t\t\t\t\t\t\t\t\t\t\t\t5    9th January 2024           Spain  
1                                 1    7th January 2024  United Kingdom  
2                                 9    3rd January 2024   United Stat

In [5]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | As always when I fly BA it ...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,9th January 2024,Spain
1,✅ Trip Verified | First time using BA busines...,1,7th January 2024,United Kingdom
2,Not Verified | Extremely rude ground service....,9,3rd January 2024,United States
3,✅ Trip Verified | My son and I flew to Geneva...,6,2nd January 2024,China
4,✅ Trip Verified | For the price paid (bought ...,1,29th December 2023,United Kingdom


In [6]:
# Boolean flag: True when the review text carries the "Trip Verified" banner.
# The needle is a plain literal, so it is matched without regex interpretation.
data['verified'] = data['reviews'].str.contains("Trip Verified", regex=False)
data['verified']

0        True
1        True
2       False
3        True
4        True
        ...  
3413    False
3414    False
3415    False
3416    False
3417    False
Name: verified, Length: 3418, dtype: bool

In [8]:
# Build a cleaned text corpus from the raw reviews.
# NLTK supplies the lemmatizer (reduces words to their base/root form) and the
# English stop-word list.
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import re

# Fetch the corpora the lemmatizer and stop-word list need, so the cell also
# works on a fresh runtime. The call is a no-op when they are already present.
# NOTE(review): some NLTK releases additionally need "omw-1.4" — confirm for
# the pinned version.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

lemma = WordNetLemmatizer()

# Remove the leading verification banner ("✅ Trip Verified |" / "Not Verified |").
# Series.str.strip(chars) must NOT be used for this: it treats its argument as
# a set of characters and strips any of them from BOTH ends, which eats real
# letters of the review (e.g. "Travelled ..." -> "avelled ...").
reviews_data = data.reviews.str.replace(
    r"^\s*(?:✅\s*)?(?:Trip Verified|Not Verified)\s*\|\s*", "", regex=True
)

# Build the stop-word set once instead of once per word inside the loop.
english_stopwords = set(stopwords.words("english"))

# Collect one cleaned string per review.
corpus = []

# For each review: keep letters only, lower-case, tokenize on whitespace,
# drop stop words, lemmatize the remaining tokens and re-join them.
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in english_stopwords]
    rev = " ".join(rev)
    corpus.append(rev)

In [9]:
# Attach the cleaned text as a new 'corpus' column (one entry per review).
data = data.assign(corpus=corpus)

In [None]:
from textblob import TextBlob

# Remove the leading verification banner ("✅ Trip Verified |" / "Not Verified |").
# Series.str.strip(chars) must NOT be used for this: it treats its argument as
# a set of characters and strips any of them from BOTH ends of the text, which
# removes real letters of the review before it is scored.
reviews_data = data.reviews.str.replace(
    r"^\s*(?:✅\s*)?(?:Trip Verified|Not Verified)\s*\|\s*", "", regex=True
)


def get_sentiment(text):
    """Classify ``text`` by the sign of its TextBlob polarity.

    Args:
        text: The review text to score.

    Returns:
        "Positive" when the polarity is above zero, "Negative" when it is
        below zero, and "Neutral" when it is exactly zero.
    """
    # Polarity is a float within [-1.0, 1.0]: -1.0 is very negative,
    # 1.0 is very positive.
    sentiment_polarity = TextBlob(text).sentiment.polarity

    if sentiment_polarity > 0:
        return "Positive"
    if sentiment_polarity < 0:
        return "Negative"
    return "Neutral"


# One label per review, in the same order as the rows of `data`.
sentiment = [get_sentiment(review) for review in reviews_data]

In [None]:
# Attach the per-review sentiment labels as a new 'sentiment' column.
data = data.assign(sentiment=sentiment)

In [None]:
# Inspect the dtype of every column.
data.dtypes

reviews      object
stars        object
date         object
country      object
verified       bool
sentiment    object
dtype: object

In [None]:
# Parse the textual review dates into pandas datetimes, then re-check dtypes.
data['date'] = pd.to_datetime(data['date'])
data.dtypes

reviews              object
stars                object
date         datetime64[ns]
country              object
verified               bool
sentiment            object
dtype: object

In [None]:
# List the distinct raw values present in the 'stars' column.
data['stars'].unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '9', '6', '8', '2', '5', '3',
       '10', '4', '7', 'None'], dtype=object)

In [None]:
# Remove the scraped whitespace padding around the rating (e.g. "\n\t\t...5").
# str.strip takes a *set of characters*, not a prefix, so spelling out the run
# of tabs is redundant; the argument-less form strips all surrounding
# whitespace (newlines, tabs, spaces, carriage returns).
data['stars'] = data['stars'].str.strip()

In [None]:
# Frequency of each rating value after stripping.
data['stars'].value_counts()

1       826
2       395
3       389
8       332
10      281
9       272
7       271
5       244
4       234
6       171
None      3
Name: stars, dtype: int64

In [None]:
# Drop, in place, the rows whose rating is the literal string "None",
# then confirm which rating values remain.
unrated_rows = data.index[data['stars'] == "None"]
data.drop(index=unrated_rows, inplace=True)
data['stars'].unique()

array(['5', '1', '9', '6', '8', '2', '3', '10', '4', '7'], dtype=object)

In [None]:
# Count rows by their per-column missing-value pattern.
data.isna().value_counts()

reviews  stars  date   country  verified  sentiment
False    False  False  False    False     False        3413
                       True     False     False           2
dtype: int64

In [None]:
# How many rows have a missing country?
data['country'].isna().value_counts()

False    3413
True        2
Name: country, dtype: int64

In [None]:
# Drop, in place, every row whose country is missing.
data.dropna(subset=['country'], inplace=True)

In [None]:
# (rows, columns) of the frame at this point.
data.shape

(3413, 6)

In [None]:
# Renumber the rows 0..n-1 after the drops above. reset_index returns a NEW
# frame (it does not modify `data` unless inplace=True), so the result must be
# assigned back; otherwise `data` keeps its gapped index.
data = data.reset_index(drop=True)
data

Unnamed: 0,reviews,stars,date,country,verified,sentiment
0,✅ Trip Verified | As always when I fly BA it ...,5,2024-01-09,Spain,True,Positive
1,✅ Trip Verified | First time using BA busines...,1,2024-01-07,United Kingdom,True,Positive
2,Not Verified | Extremely rude ground service....,9,2024-01-03,United States,False,Negative
3,✅ Trip Verified | My son and I flew to Geneva...,6,2024-01-02,China,True,Negative
4,✅ Trip Verified | For the price paid (bought ...,1,2023-12-29,United Kingdom,True,Positive
...,...,...,...,...,...,...
3408,Travelled from SYD to London return. Poor serv...,8,2014-10-08,Australia,False,Negative
3409,VIE to LHR - Seats were of the new configurati...,6,2014-10-08,United Kingdom,False,Positive
3410,NCL-LHR-LAX / LAS-LHR-NCL. NCL-LHR A320. Comfo...,3,2014-10-05,United Kingdom,False,Positive
3411,Traveled to Rome with British Airways. On the ...,3,2014-10-05,United Kingdom,False,Positive


In [None]:
# Destination of the cleaned dataset on the mounted Drive.
cleaned_csv_path = '/content/drive/MyDrive/Internship/Sem-5/TASK-1/cleaned_data.csv'

# Persist the cleaned frame without writing the row index as a column.
data.to_csv(cleaned_csv_path, index=False)