In [32]:
# These have to be installed through running "pip install -r requirements.txt"
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [40]:
# Storing the csv file as a pandas dataframe and sample 50/50 of the positive and negative scores
df = pd.read_csv('Hotel_Reviews.csv')

positive_reviews = df[df['Reviewer_Score'] > 7]
negative_reviews = df[df['Reviewer_Score'] < 7]

negative_sample = negative_reviews.sample(n=1000, random_state=42)
positive_sample = positive_reviews.sample(n=1000, random_state=42)

balanced_df = pd.concat([positive_sample, negative_sample])
df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_df['Reviewer_Score'].apply(lambda x: 'positive' if x > 5 else 'negative').value_counts())

Reviewer_Score
positive    1641
negative     359
Name: count, dtype: int64


In [41]:
def replace(cell):
    if cell == "No Positive" or cell == "No Negative":
        return ""
    return cell



df['Negative_Review'] = df['Negative_Review'].apply(replace)
df['Positive_Review'] = df["Positive_Review"].apply(replace)

In [42]:

def combine_names(row):
    return f"{row['Positive_Review']} {row['Negative_Review']}"


df["Review"] = df.apply(combine_names, axis="columns")

In [43]:
df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Tags", "days_since_review", "lat", "lng", "Hotel_Name", "Total_Number_of_Reviews", "Negative_Review", "Positive_Review"], axis = "columns")
df

Unnamed: 0,Reviewer_Score,Review
0,3.3,Breakfast ok Receptionists were rude
1,10.0,Every member of staff was so friendly and hel...
2,5.4,gm letter in c in because of renovation the w...
3,8.3,It looks new and fancy Ok value for money T...
4,5.4,The colors were cool The photographer for t...
...,...,...
1995,6.7,Two staff members stood out in particular Nob...
1996,6.7,Location was good Stuff were very frendly Ro...
1997,10.0,The facility is luxurious and a good location...
1998,6.7,The breakfast was lovely and fresh with a wid...


In [44]:

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

# Apply the preprocessing function to your DataFrame
df['Review'] = df['Review'].apply(preprocess_text)

In [45]:
# Sentiment analysis

analyzer = SentimentIntensityAnalyzer()

# Returns a 1 if analyzers recognizes it as positive and 0 if not
def sentiment(text):
    result = analyzer.polarity_scores(text)
    sentiment = 1 if result["pos"] > 0 else 0
    return sentiment

#This function will return a more detailed sentiment analyze with float numbers
def sentiment_detailed(text):
    result = analyzer.polarity_scores(text)
    return result

df["Sentiment"] = df['Review'].apply(sentiment)
#df["Sentiments"] = df["Review"].apply(sentiment_detailed)
#df = pd.concat([df.drop(['Sentiments'], axis=1), df['Sentiments'].apply(pd.Series)], axis=1)

df

#https://medium.com/@amalia.wulandiari/sentiment-analysis-hotel-review-python-nltk-846738de727f

Unnamed: 0,Reviewer_Score,Review,Sentiment
0,3.3,breakfast ok receptionist rude,1
1,10.0,every member staff friendly helpful room spaci...,1
2,5.4,gm letter c renovation wonderfull location hou...,1
3,8.3,look new fancy ok value money hotel near beach...,1
4,5.4,color cool photographer website particularly s...,1
...,...,...,...
1995,6.7,two staff member stood particular nobel fatou ...,1
1996,6.7,location good stuff frendly room small leaking...,1
1997,10.0,facility luxurious good location swimming pool,1
1998,6.7,breakfast lovely fresh wide range item room ti...,1


In [None]:
df['posneg'] = df['Reviewer_Score'].apply(lambda x: 0 if x < 5 else 1)
df

Unnamed: 0,Reviewer_Score,Review,Sentiment,posneg
0,3.3,breakfast ok receptionist rude,1,0
1,10.0,every member staff friendly helpful room spaci...,1,1
2,5.4,gm letter c renovation wonderfull location hou...,1,1
3,8.3,look new fancy ok value money hotel near beach...,1,1
4,5.4,color cool photographer website particularly s...,1,1
...,...,...,...,...
1995,6.7,two staff member stood particular nobel fatou ...,1,1
1996,6.7,location good stuff frendly room small leaking...,1,1
1997,10.0,facility luxurious good location swimming pool,1,1
1998,6.7,breakfast lovely fresh wide range item room ti...,1,1
