# Hotel Reviews in Europe

__Sentiment analysis__

__Import libraries and dataset__

In [None]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

from sklearn.model_selection import train_test_split

__Load dataset__

In [None]:
# Read the hotel review CSV into a DataFrame (path is relative to the notebook)
dataset_path = "data/Hotel_Reviews.csv"
df = pd.read_csv(dataset_path)

__Explore review comments__

In [None]:
# Preview the two free-text comment columns next to the numeric score
review_columns = ['Negative_Review', 'Positive_Review', 'Reviewer_Score']
df[review_columns].head()

__Distribution of scores__

In [None]:
# Histogram of the reviewer scores, split into n_bins buckets
n_bins = 10
scores = df['Reviewer_Score']

plt.hist(scores, bins=n_bins, edgecolor="black", alpha=0.8)

plt.title('Review scores')
plt.xlabel('Score')
plt.ylabel('Number of reviews')

plt.show()

__Create new score grade feature from the review score__

In [None]:
def calc_grade(row):
    """Map a review's numeric score to a sentiment grade.

    Reads ``row['Reviewer_Score']`` and returns 'Negative' for a score
    below 4, 'Neutral' for a score below 7, and 'Positive' for anything
    else.
    """
    reviewer_score = row['Reviewer_Score']
    # Exclusive upper bounds, checked in ascending order; the first one the
    # score falls under decides the grade.
    for upper_bound, grade in ((4, 'Negative'), (7, 'Neutral')):
        if reviewer_score < upper_bound:
            return grade
    return 'Positive'
    
# Grade every review row by row and store the label in a new column
score_grades = df.apply(calc_grade, axis='columns')
df['Score_Grade'] = score_grades
df[['Reviewer_Score', 'Score_Grade']].head()

__Count number of each grade__

In [None]:
# Number of reviews in each grade, indexed by the grade label
grade_groups = df.groupby(by='Score_Grade')
reviews_by_grade = grade_groups.size()
reviews_by_grade

__Ratio of negative, neutral and positive reviews__

In [None]:
# Pie chart of the share of reviews in each grade.
#
# Wedge styling is keyed by grade name: (colour, explode offset). Labels are
# taken from the index of reviews_by_grade rather than a hard-coded list, so
# each wedge keeps the right label and colour whatever order the grades come
# in, and the chart still draws if a grade has no reviews at all.
grade_styles = {
    'Negative': ('salmon', 0.1),
    'Neutral': ('lightblue', 0.1),
    'Positive': ('lightgreen', 0),
}

grade_labels = list(reviews_by_grade.index)
colors = [grade_styles[grade][0] for grade in grade_labels]
explode = [grade_styles[grade][1] for grade in grade_labels]

title = plt.title('Review grades')

pie = plt.pie(
    reviews_by_grade,
    labels=grade_labels,
    autopct='%1.0f%%',
    colors=colors,
    explode=explode,
    shadow=True)

# Equal aspect ratio keeps the pie circular
plt.axis('equal')
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.5)

plt.show()

__Word analysis for each review grade__

In [None]:
# Concatenate positive and negative comments into one text per sentiment.
# Join with a space: without a separator the last word of one review is glued
# to the first word of the next, producing artificial tokens (the 'NegativeNo'
# stop word below looks like a workaround for exactly that).
positive_comments = df['Positive_Review'].str.cat(sep=' ')
negative_comments = df['Negative_Review'].str.cat(sep=' ')

# Ignore words that either have no meaning or could be both positive and negative
STOPWORDS.update(('Negative', 'NegativeNo', 'hotel', 'didn', 'couldn', 'wasn', 't', 'Nothing', 'one', 'everything', 'asked', 'use', 'got', 'much', 'find', 'found', 's', 'stay', 'stayed'))

# Build wordclouds: green palette for positive text, red/orange for negative
pos_wordcloud = WordCloud(width=1280, height=853, margin=0,
                      colormap='BuGn', background_color='dimgray',
                      stopwords=STOPWORDS).generate(positive_comments)

neg_wordcloud = WordCloud(width=1280, height=853, margin=0,
                      colormap='OrRd', background_color='dimgray',
                      stopwords=STOPWORDS).generate(negative_comments)

In [None]:
# Canvas holding the two word clouds side by side (one row, two panels)
fig = plt.figure(figsize=(10, 10))
rows = 1
columns = 2

# Each panel: the word cloud image to draw and the heading shown above it
panels = (
    (pos_wordcloud, "Positive Reviews"),
    (neg_wordcloud, "Negative Reviews"),
)

for position, (cloud, heading) in enumerate(panels, start=1):
    fig.add_subplot(rows, columns, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(heading)

plt.show()

__Separate train and test datasets__

In [None]:
# Fixed seed so the split is reproducible between runs
RANDOM_SEED = 15

# Features are the two free-text comment columns; the target is the grade
X = df[['Positive_Review', 'Negative_Review']]
y = df['Score_Grade']

# Hold out 20% of the reviews for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print(f'Train size: {X_train.shape[0]}')
print(f'Test size: {X_test.shape[0]}')