In [127]:
# These have to be installed through running "pip install -r requirements.txt"
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


nltk.download('all')  


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Isakr/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Isakr/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Isakr/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Isakr/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Isakr/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

True

In [128]:
# Storing the csv file as a pandas dataframe
df = pd.read_csv('Hotel_Reviews.csv')

df = df.sample(frac=0.001, random_state=42)

In [129]:
def replace(cell):
    if cell == "No Positive" or cell == "No Negative":
        return ""
    return cell



df['Negative_Review'] = df['Negative_Review'].apply(replace)
df['Positive_Review'] = df["Positive_Review"].apply(replace)

In [130]:

def combine_names(row):
    return f"{row['Positive_Review']} {row['Negative_Review']}"


df["Review"] = df.apply(combine_names, axis="columns")

In [131]:
df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Tags", "days_since_review", "lat", "lng", "Hotel_Name", "Total_Number_of_Reviews", "Negative_Review", "Positive_Review"], axis = "columns")
df

Unnamed: 0,Reviewer_Score,Review
488440,9.6,Hotel was great clean friendly staff free bre...
274649,8.8,No tissue paper box was present at the room
374688,7.9,Nice welcoming and service Pillows
404352,10.0,Everything including the nice upgrade The Hot...
451596,9.6,Lovely hotel v welcoming staff
...,...,...
406925,9.6,The location of the Hotel is very good
278979,8.8,Great breakfast easy access to city metro
203818,7.5,Top location Small room
482918,3.3,The Air conditioning did not work in the roo...


In [132]:

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

# Apply the preprocessing function to your DataFrame
df['Review'] = df['Review'].apply(preprocess_text)

In [None]:
# Sentiment analysis

analyzer = SentimentIntensityAnalyzer()

# Returns a 1 if analyzers recognizes it as positive and 0 if not
def sentiment(text):
    result = analyzer.polarity_scores(text)
    sentiment = 1 if result["pos"] > 0 else 0
    return sentiment

#This function will return a more detailed sentiment analyze with float numbers
def sentiment_detailed(text):
    result = analyzer.polarity_scores(text)
    return result

df["Sentiment"] = df['Review'].apply(sentiment)
df["Sentiments"] = df["Review"].apply(sentiment_detailed)
df = pd.concat([df.drop(['Sentiments'], axis=1), df['Sentiments'].apply(pd.Series)], axis=1)

df

Unnamed: 0,Reviewer_Score,Review,0,Sentiment,neg,neu,pos,compound,neg.1,neu.1,pos.1,compound.1
488440,9.6,hotel great clean friendly staff free breakfas...,1,1,0.016,0.534,0.451,0.9892,0.016,0.534,0.451,0.9892
274649,8.8,tissue paper box present room,0,0,0.000,1.000,0.000,0.0000,0.000,1.000,0.000,0.0000
374688,7.9,nice welcoming service pillow,1,1,0.000,0.260,0.740,0.6908,0.000,0.260,0.740,0.6908
404352,10.0,everything including nice upgrade hotel revamp...,1,1,0.000,0.439,0.561,0.9153,0.000,0.439,0.561,0.9153
451596,9.6,lovely hotel v welcoming staff,1,1,0.000,0.230,0.770,0.7717,0.000,0.230,0.770,0.7717
...,...,...,...,...,...,...,...,...,...,...,...,...
406925,9.6,location hotel good,1,1,0.000,0.408,0.592,0.4404,0.000,0.408,0.592,0.4404
278979,8.8,great breakfast easy access city metro,1,1,0.000,0.364,0.636,0.7906,0.000,0.364,0.636,0.7906
203818,7.5,top location small room,1,1,0.000,0.625,0.375,0.2023,0.000,0.625,0.375,0.2023
482918,3.3,air conditioning work room like sleeping oven ...,1,1,0.170,0.551,0.279,0.2893,0.170,0.551,0.279,0.2893


Unnamed: 0,Reviewer_Score,Review,0
488440,9.6,hotel great clean friendly staff free breakfas...,1
274649,8.8,tissue paper box present room,0
374688,7.9,nice welcoming service pillow,1
404352,10.0,everything including nice upgrade hotel revamp...,1
451596,9.6,lovely hotel v welcoming staff,1
...,...,...,...
406925,9.6,location hotel good,1
278979,8.8,great breakfast easy access city metro,1
203818,7.5,top location small room,1
482918,3.3,air conditioning work room like sleeping oven ...,1
