# Library Loading

In [1]:
#Core
import pandas as pd
import numpy as np

#Language Detection
from langdetect import detect

#Noise Removal
import string
import re
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

#display adjust
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
import warnings
warnings.filterwarnings("ignore")

In [2]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

# Input Path

In [3]:
hotel_name = 'Britannia International Hotel Canary Wharf'

In [4]:
data_kaggle = "Hotel_Reviews_Kaggle.csv"
data_booking = "Britannia International Hotel Canary Wharf.csv"
data_tripadvisor = 'tripadvisor_processed.csv'

# Data Loading

In [5]:
df_kaggle = pd.read_csv(data_kaggle)
df_booking = pd.read_csv(data_booking)
df_tripadvisor = pd.read_csv(data_tripadvisor)

In [6]:
df_kaggle = df_kaggle[df_kaggle.Hotel_Name == hotel_name]
df_kaggle.reset_index(inplace = True, drop = True)

# Customized Cleaning

In [7]:
#No Negativity

In [8]:
terms = ["There are no comments available for this review",
         "Not Available",
         "No comments",
         "Nothing to comment on here.",
         "No comment at this time.",
         "No comments",
         "This review is hidden because it doesn't meet our guidelines.",
         "I had no complaints",
         "No Negativity"]

In [9]:
for term in terms:
    df_booking['Negative'] = np.where((df_booking['Negative'] == term),None,df_booking['Negative'] )
    df_booking['Positive'] = np.where((df_booking['Positive'] == term),None,df_booking['Positive'] )
    
    df_kaggle['Negative_Review'] = np.where((df_kaggle['Negative_Review'] == term),None,df_kaggle['Negative_Review'] )
    df_kaggle['Positive_Review'] = np.where((df_kaggle['Positive_Review'] == term),None,df_kaggle['Positive_Review'] )
    
    df_tripadvisor['Review'] = np.where((df_tripadvisor['Review'] == term),None,df_tripadvisor['Review'] )


In [10]:
#stopword customization

lang_stopwords = stopwords.words('english')

#add words that aren't in the NLTK stopwords list
#new_stopwords = []
#lang_stopwords = lang_stopwords.union(new_stopwords)

#remove words that are in NLTK stopwords list
not_stopwords = {'not'} 
lang_stopwords = set([word for word in lang_stopwords if word not in not_stopwords])


# Language Detection

In [11]:
#Scan Through Negative
for i in range(len(df_kaggle)):
    try:
        if detect(df_kaggle.Negative_Review[i]) != 'en':
            df_kaggle.drop(i,inplace = True)
    except:
        pass
df_kaggle.reset_index(drop = True,inplace = True)       

#Scan Through Positve
for i in range(len(df_kaggle)):
    try:
        if detect(df_kaggle.Positive_Review[i]) != 'en':
            df_kaggle.drop(i,inplace = True)
    except:
        pass

df_kaggle.reset_index(drop = True,inplace = True)

In [12]:
#Scan Through Positve
for i in range(len(df_booking)):
    try:
        if detect(df_booking.Positive_Review[i]) != 'en':
            df_booking.drop(i,inplace = True)
    except:
        pass

df_booking.reset_index(drop = True,inplace = True)

#Scan Through Negative
for i in range(len(df_booking)):
    try:
        if detect(df_booking.Negative[i]) != 'en':
            df_booking.drop(i,inplace = True)
    except:
        pass

df_booking.reset_index(drop = True,inplace = True)     

# Bulk Cleaning

In [13]:
def text_tokenized(text):
    return nltk.word_tokenize(text)

def noise_removal(each_row):
    if each_row != None:
        try:
            each_row = each_row.lower() #convert words to lowercase
            each_row = re.sub(r'\d+', '', each_row) #remove_numbers
            each_row = re.sub(r'[^a-zA-z0-9.,!?/:;\"\'\s]' , '', each_row) #remove special character
            each_row = " ".join([contractions.fix(word) for word in each_row.split()]) #expand contraction
            each_row = each_row.translate(str.maketrans(' ', ' ', string.punctuation)) #remove puntuation
            each_row = " ".join(each_row.split()) #remove whitespace

            #all tasks that requires tokenization first
            each_row = text_tokenized(each_row)
            each_row = [lemmatizer.lemmatize(w, "v") for w in each_row] #lemmmatization
            each_row = [w for w in each_row if w not in lang_stopwords] #stopwords removal
            each_row = " ".join(each_row)
        except:
            pass
    return each_row

## Applying

In [14]:
df_kaggle['Cleaned_Negative_Review'] = df_kaggle['Negative_Review'].apply(noise_removal)
df_kaggle['Cleaned_Positive_Review'] = df_kaggle['Positive_Review'].apply(noise_removal)

In [15]:
df_booking['Cleaned_Negative'] = df_booking['Negative'].apply(noise_removal)
df_booking['Cleaned_Positive'] = df_booking['Positive'].apply(noise_removal)

In [16]:
df_tripadvisor['Cleaned_Review'] = df_tripadvisor['Review'].apply(noise_removal)

## Save Data for Word Cloud

In [17]:
df_kaggle.to_csv('Britannia_Kaggle.csv')
df_booking.to_csv('Britannia_Booking.csv')
df_tripadvisor.to_csv('Britannia_Tripadvisor.csv')

## Merge and Truncate columns for Model

```
Column Category:
1 = Positive
0 = Negative
```

# df_tripadvisor

In [18]:
df_tripadvisor.rename(columns = {'Positive':'Category'},inplace = True)
df_tripadvisor.drop(columns=['Date','Rating'],axis=1,inplace=True)
df_tripadvisor.head(3)

Unnamed: 0,Review,Category,Cleaned_Review
0,All ok except the cost of parking We had a bud...,1,ok except cost park budget night stay well set...
1,Bad experience overall I had the worse experie...,0,bad experience overall worse experience ever f...
2,Better than expected stayed as part of a natio...,1,better expect stay part national coach trip pl...


# df_kaggle

In [19]:
df_kaggle.drop(columns=['Hotel_Address', 
                        'Additional_Number_of_Scoring', 
                        'Review_Date',
                        'Average_Score', 
                        'Hotel_Name', 
                        'Reviewer_Nationality',
                        'Review_Total_Negative_Word_Counts',
                        'Total_Number_of_Reviews', 
                        'Review_Total_Positive_Word_Counts',
                        'Total_Number_of_Reviews_Reviewer_Has_Given', 
                        'Reviewer_Score', 
                        'Tags',
                        'days_since_review',
                        'lat', 
                        'lng'],axis=1,inplace=True)

In [20]:
df1 = df_kaggle.copy() #set to contain only positive => drop Neg column => category column = 1
df2 = df_kaggle.copy() #set to contain only negative => drop Pos column => category column = 0

df1.drop(columns=['Negative_Review','Cleaned_Negative_Review'],axis=1,inplace=True)
df2.drop(columns=['Positive_Review','Cleaned_Positive_Review'],axis=1,inplace=True)

df1['Category'] = 1
df2['Category'] = 0

df1.rename(columns = {'Positive_Review':'Review'}, inplace = True)
df1.rename(columns = {'Cleaned_Positive_Review':'Cleaned_Review'}, inplace = True)

df2.rename(columns = {'Negative_Review':'Review'}, inplace = True)
df2.rename(columns = {'Cleaned_Negative_Review':'Cleaned_Review'}, inplace = True)

df_kaggle_merge = pd.concat([df1, df2],ignore_index = True)

In [21]:
df_kaggle_merge.tail(3)

Unnamed: 0,Review,Cleaned_Review,Category
7267,I had no complaints,complaints,0
7268,Really shabby and run down hotel Needs a tota...,really shabby run hotel need total refurbish j...,0
7269,Stains on the carpet peeling wallpaper scruff...,stain carpet peel wallpaper scruffy scuff bedr...,0


# df_booking

In [22]:
df_booking.drop(columns=['Unnamed: 0', 
                         'Name', 
                         'Country', 
                         'Room_stayed', 
                         'Date_stayed',
                         'Trip_type', 
                         'Review_date', 
                         'Review_title', 
                         'Reviewer_score'],axis=1,inplace=True)

In [23]:
df3 = df_booking.copy() #set to contain only positive => drop Neg column => category column = 1
df4 = df_booking.copy() #set to contain only negative => drop Pos column => category column = 0

df3.drop(columns=['Negative','Cleaned_Negative'],axis=1,inplace=True)
df4.drop(columns=['Positive','Cleaned_Positive'],axis=1,inplace=True)

df3['Category'] = 1
df4['Category'] = 0

df3.rename(columns = {'Positive':'Review'}, inplace = True)
df3.rename(columns = {'Cleaned_Positive':'Cleaned_Review'}, inplace = True)

df4.rename(columns = {'Negative':'Review'}, inplace = True)
df4.rename(columns = {'Cleaned_Negative':'Cleaned_Review'}, inplace = True)

df3 = df3.dropna()
df4 = df4.dropna()

df_booking_merge = pd.concat([df3, df4],ignore_index = True)

# Create Training Data

In [24]:
df_booking_merge.head(3)

Unnamed: 0,Review,Cleaned_Review,Category
0,Room quality.. hotel staff.. the bar..,room quality hotel staff bar,1
1,"The bed, size of room, the bath tub feature in...",bed size room bath tub feature bathroom location,1
2,I was made very welcome at the britania Hotel ...,make welcome britania hotel exerlent reception...,1


In [25]:
df_kaggle_merge.head(3)

Unnamed: 0,Review,Cleaned_Review,Category
0,The location was excellent for getting to the O2,location excellent get,1
1,The house keeping lady made my boyfriends day...,house keep lady make boyfriends day funny,1
2,The bed was OK,bed ok,1


In [26]:
train = pd.concat([df_booking_merge, df_kaggle_merge],ignore_index = True)

In [27]:
test = df_tripadvisor

In [28]:
train.to_csv('train_data.csv')
test.to_csv('test_data.csv')