In [14]:
# These have to be installed through running "pip install -r requirements.txt"
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud # to visualize text
from PIL import Image # import imag
from gensim.test.utils import common_texts # training corpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument #  representing documents as a vector
from sklearn.feature_extraction.text import TfidfVectorizer # converts a collection of raw documents into a matrix

In [15]:
# Storing the csv file as a pandas dataframe and sample 50/50 of the positive and negative scores
df = pd.read_csv('Hotel_Reviews.csv')

positive_reviews = df[df['Reviewer_Score'] > 7]
negative_reviews = df[df['Reviewer_Score'] < 7]

negative_sample = negative_reviews.sample(n=1000, random_state=42)
positive_sample = positive_reviews.sample(n=1000, random_state=42)

balanced_df = pd.concat([positive_sample, negative_sample])
df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_df['Reviewer_Score'].apply(lambda x: 'positive' if x > 5 else 'negative').value_counts())

Reviewer_Score
positive    1641
negative     359
Name: count, dtype: int64


In [16]:
def replace(cell):
    if cell == "No Positive" or cell == "No Negative":
        return ""
    return cell



df['Negative_Review'] = df['Negative_Review'].apply(replace)
df['Positive_Review'] = df["Positive_Review"].apply(replace)

In [17]:

def combine_names(row):
    return f"{row['Positive_Review']} {row['Negative_Review']}"


df["Review"] = df.apply(combine_names, axis="columns")

In [18]:
df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Tags", "days_since_review", "lat", "lng", "Hotel_Name", "Total_Number_of_Reviews", "Negative_Review", "Positive_Review"], axis = "columns")
df

Unnamed: 0,Reviewer_Score,Review
0,3.3,Breakfast ok Receptionists were rude
1,10.0,Every member of staff was so friendly and hel...
2,5.4,gm letter in c in because of renovation the w...
3,8.3,It looks new and fancy Ok value for money T...
4,5.4,The colors were cool The photographer for t...
...,...,...
1995,6.7,Two staff members stood out in particular Nob...
1996,6.7,Location was good Stuff were very frendly Ro...
1997,10.0,The facility is luxurious and a good location...
1998,6.7,The breakfast was lovely and fresh with a wid...


In [19]:

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

# Apply the preprocessing function to your DataFrame
df['Review'] = df['Review'].apply(preprocess_text)

In [20]:
# Sentiment analysis

analyzer = SentimentIntensityAnalyzer()

# Returns a 1 if analyzers recognizes it as positive and 0 if not
def sentiment(text):
    result = analyzer.polarity_scores(text)
    sentiment = 1 if result["pos"] > 0 else 0
    return sentiment

#This function will return a more detailed sentiment analyze with float numbers
def sentiment_detailed(text):
    result = analyzer.polarity_scores(text)
    return result

df["Sentiment"] = df['Review'].apply(sentiment)
df["Sentiments"] = df["Review"].apply(sentiment_detailed)
df = pd.concat([df.drop(['Sentiments'], axis=1), df['Sentiments'].apply(pd.Series)], axis=1)

df

#https://medium.com/@amalia.wulandiari/sentiment-analysis-hotel-review-python-nltk-846738de727f

Unnamed: 0,Reviewer_Score,Review,Sentiment,neg,neu,pos,compound
0,3.3,breakfast ok receptionist rude,1,0.417,0.278,0.306,-0.2023
1,10.0,every member staff friendly helpful room spaci...,1,0.000,0.385,0.615,0.8622
2,5.4,gm letter c renovation wonderfull location hou...,1,0.120,0.648,0.231,0.3400
3,8.3,look new fancy ok value money hotel near beach...,1,0.063,0.718,0.218,0.6597
4,5.4,color cool photographer website particularly s...,1,0.000,0.680,0.320,0.5719
...,...,...,...,...,...,...,...
1995,6.7,two staff member stood particular nobel fatou ...,1,0.216,0.645,0.138,-0.7003
1996,6.7,location good stuff frendly room small leaking...,1,0.000,0.756,0.244,0.7003
1997,10.0,facility luxurious good location swimming pool,1,0.000,0.633,0.367,0.4404
1998,6.7,breakfast lovely fresh wide range item room ti...,1,0.145,0.550,0.305,0.4939


In [21]:
# Create TaggedDocument objects
docs = [TaggedDocument(doc.split(' '), [i]) for i, doc in enumerate(df['Review'])]

# Train the Doc2Vec model
model = Doc2Vec(docs, vector_size=5, window=2, min_count=1, workers=4)

# Infer vectors for each document
doc2vec_vectors = df['Review'].apply(lambda x: model.infer_vector(x.split(' ')))

# Convert the list of vectors into a DataFrame
doc2vec_df = pd.DataFrame(doc2vec_vectors.tolist())

# Add column names
doc2vec_df.columns = ['doc2vec_vector_' + str(i) for i in range(doc2vec_df.shape[1])]

# Concatenate with the original DataFrame
df = pd.concat([df, doc2vec_df], axis=1)

# Display the final DataFrame
df

Unnamed: 0,Reviewer_Score,Review,Sentiment,neg,neu,pos,compound,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,3.3,breakfast ok receptionist rude,1,0.417,0.278,0.306,-0.2023,0.056754,-0.006518,0.080449,0.040124,0.044553
1,10.0,every member staff friendly helpful room spaci...,1,0.000,0.385,0.615,0.8622,-0.073647,0.200109,0.047026,-0.009943,0.056442
2,5.4,gm letter c renovation wonderfull location hou...,1,0.120,0.648,0.231,0.3400,0.296592,0.314013,0.505645,-0.070788,-0.183819
3,8.3,look new fancy ok value money hotel near beach...,1,0.063,0.718,0.218,0.6597,0.241297,0.435850,0.783921,-0.069761,-0.337804
4,5.4,color cool photographer website particularly s...,1,0.000,0.680,0.320,0.5719,0.120873,0.459287,0.772370,-0.110016,-0.355163
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6.7,two staff member stood particular nobel fatou ...,1,0.216,0.645,0.138,-0.7003,0.590166,0.779675,1.487334,-0.065088,-0.591001
1996,6.7,location good stuff frendly room small leaking...,1,0.000,0.756,0.244,0.7003,0.051193,-0.003677,0.106787,0.022848,0.063555
1997,10.0,facility luxurious good location swimming pool,1,0.000,0.633,0.367,0.4404,0.021459,0.119512,0.129025,-0.036852,-0.020545
1998,6.7,breakfast lovely fresh wide range item room ti...,1,0.145,0.550,0.305,0.4939,0.124030,0.304036,0.579687,-0.030205,-0.276359


In [22]:
# Adding TF-IDF values to find out the importance of words in the reviews
tfidf = TfidfVectorizer(min_df = 10) # Initialize the TfidfVectorizer with a minimum document frequency of 10
tfidf_result = tfidf.fit_transform(df['Review']).toarray() # Fit and transform the 'Review' column to compute TF-IDF values
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names_out())# Convert the TF-IDF matrix into a DataFrame
tfidf_df.columns = ['word_' + str(x) for x in tfidf_df.columns] # Rename columns to avoid conflicts with existing column names
tfidf_df.index = df.index # Align the index of the TF-IDF DataFrame with the original DataFrame
df = pd.concat([df, tfidf_df], axis=1) # Concatenate the original DataFrame with the TF-IDF DataFrame
df.head()

Unnamed: 0,Reviewer_Score,Review,Sentiment,neg,neu,pos,compound,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,...,word_worked,word_working,word_worn,word_worst,word_worth,word_would,word_wrong,word_year,word_yet,word_young
0,3.3,breakfast ok receptionist rude,1,0.417,0.278,0.306,-0.2023,0.056754,-0.006518,0.080449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10.0,every member staff friendly helpful room spaci...,1,0.0,0.385,0.615,0.8622,-0.073647,0.200109,0.047026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.4,gm letter c renovation wonderfull location hou...,1,0.12,0.648,0.231,0.34,0.296592,0.314013,0.505645,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.3,look new fancy ok value money hotel near beach...,1,0.063,0.718,0.218,0.6597,0.241297,0.43585,0.783921,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249714,0.0,0.0
4,5.4,color cool photographer website particularly s...,1,0.0,0.68,0.32,0.5719,0.120873,0.459287,0.77237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df['posneg'] = df['Reviewer_Score'].apply(lambda x: 0 if x < 5 else 1)
df

Unnamed: 0,Reviewer_Score,Review,Sentiment,neg,neu,pos,compound,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,...,word_working,word_worn,word_worst,word_worth,word_would,word_wrong,word_year,word_yet,word_young,posneg
0,3.3,breakfast ok receptionist rude,1,0.417,0.278,0.306,-0.2023,0.056754,-0.006518,0.080449,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0
1,10.0,every member staff friendly helpful room spaci...,1,0.000,0.385,0.615,0.8622,-0.073647,0.200109,0.047026,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1
2,5.4,gm letter c renovation wonderfull location hou...,1,0.120,0.648,0.231,0.3400,0.296592,0.314013,0.505645,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1
3,8.3,look new fancy ok value money hotel near beach...,1,0.063,0.718,0.218,0.6597,0.241297,0.435850,0.783921,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.249714,0.0,0.0,1
4,5.4,color cool photographer website particularly s...,1,0.000,0.680,0.320,0.5719,0.120873,0.459287,0.772370,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6.7,two staff member stood particular nobel fatou ...,1,0.216,0.645,0.138,-0.7003,0.590166,0.779675,1.487334,...,0.0,0.0,0.000000,0.0,0.107839,0.185844,0.000000,0.0,0.0,1
1996,6.7,location good stuff frendly room small leaking...,1,0.000,0.756,0.244,0.7003,0.051193,-0.003677,0.106787,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1
1997,10.0,facility luxurious good location swimming pool,1,0.000,0.633,0.367,0.4404,0.021459,0.119512,0.129025,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1
1998,6.7,breakfast lovely fresh wide range item room ti...,1,0.145,0.550,0.305,0.4939,0.124030,0.304036,0.579687,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,1


In [27]:
# Calculate Positive and Negative Review

train_pos = df[ df['posneg'] == 1]
train_pos = train_pos['Review']
train_neg = df[ df['posneg'] == 0]
train_neg = train_neg['Review']

In [None]:
# Generate WordCloud function for negative and positive reviews
# using mask/shape = cloud
# source: https://medium.com/@amalia.wulandiari/sentiment-analysis-hotel-review-python-nltk-846738de727f

def wordCloud_generator(data, color, color_map):

    
    wordcloud = WordCloud(width = 1000, height = 1000,
                          background_color =color,
                          min_font_size = 12,
                          colormap= color_map,
                          mask = wave_mask
                         ).generate(' '.join(data.values))
    
    # plot the WordCloud image                        
    plt.figure(figsize = (10, 10), facecolor = None) 
    plt.imshow(wordcloud, interpolation='bilinear') 
    plt.axis('off')
    
    plt.show() 

In [32]:
# WordCloud for Positive Review

wordCloud_generator(train_pos, 'white', 'ocean')

NameError: name 'wave_mask' is not defined

In [None]:
# Wordcloud for Negative Review

wordCloud_generator(train_neg, 'white', 'Reds')