In [1]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from nltk.corpus.reader import CategorizedCorpusReader
import nltk
from lime import lime_tabular

In [2]:
# fix random seed for reproducibility
seed = 7
tf.random.set_seed(seed)

In [3]:
df_cleaned = pd.read_excel("all_reviews_cleaned.xlsx", index_col=0)

In [4]:
data = df_cleaned.apply(lambda x: x.map(x.value_counts()))
data

Unnamed: 0,Hotel_Name,City,Country,Reviewer_Origin,Unit_Size,Length_of_stay_nights,Group_Size,Review_Date,Review_Title,Review_Score,Positive,Negative,num_of_people_helpful
0,1830.0,62540,62540,26427.0,4284,137077,170817,6032,1.0,236590,1.0,1.0,668125
1,1830.0,62540,62540,6005.0,4284,179651,170817,825,1276.0,98180,1.0,1.0,668125
2,1830.0,62540,62540,6040.0,4284,179651,170817,973,59463.0,103759,1.0,1.0,668125
3,1830.0,62540,62540,1917.0,10036,137077,108428,973,51238.0,50901,1.0,1.0,668125
4,1830.0,62540,62540,77137.0,3607,251214,220530,973,1.0,50901,91.0,1.0,668125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
774056,1512.0,26079,26079,1868.0,152,179651,108428,1408,16694.0,23528,1.0,1.0,668125
774057,1512.0,26079,26079,3882.0,8450,137077,108428,972,53660.0,19578,1.0,1.0,668125
774058,1512.0,26079,26079,31122.0,8450,137077,220530,972,1.0,19578,1.0,1.0,668125
774059,1512.0,26079,26079,8834.0,8450,137077,274286,972,51238.0,50901,1.0,1.0,668125


In [5]:
# If Reviewers Origin is less than 20 reviewers it is placed into a category called other
# this allows for the NN model to run

df_cleaned["Reviewer_Origin"] = df_cleaned["Reviewer_Origin"].where(data["Reviewer_Origin"] >=20, "other")

In [6]:
df_cleaned[df_cleaned["Reviewer_Origin"] == "other"]

Unnamed: 0,Hotel_Name,City,Country,Reviewer_Origin,Unit_Size,Length_of_stay_nights,Group_Size,Review_Date,Review_Title,Review_Score,Positive,Negative,num_of_people_helpful
482,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Standard Room,3,Solo traveler,2019-12-05,Wonderful,9.2,all the rest.,maybe with too many colours. not a lot of opti...,0
723,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Superior Room,1,Solo traveler,2019-08-13,Good,7.1,the hotel and the location. friendly front des...,extremely poor service at the restaurant. sow ...,0
1029,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Superior Room,3,Solo traveler,2022-03-20,very bad,1.0,nothing,very bad mattress\nshower water switch cold an...,0
1125,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Junior Suite,2,Group,2021-10-24,Amsterdam,2.0,-once reached the hotel 4 p.m my rooms not rea...,-once reached the hotel 4 p.m my rooms not rea...,0
1804,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,5 nights · \n\nNovember 2019,5,Solo traveler,2019-11-25,"Enjoyable, but I think a bit expensive.",6.0,it was close to the train station and that mad...,the room safe-box was broken and although i wa...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
772119,Ibsens Hotel,Copenhagen,Denmark,other,Medium Room,2,Group,2019-10-16,Pleasant,6.2,the room was really quiet. the checkin and our...,toilet is integrated into the shower. not very...,0
772748,Alexandra,Copenhagen,Denmark,other,Standard Double Room,3,Solo traveler,2020-03-05,Very Good,8.0,the room under the roof was really cozy and wa...,"breakfast was rather modest, too few warm opti...",0
772817,Alexandra,Copenhagen,Denmark,other,Standard Single Room,2,Group,2021-11-04,"Overpriced for quality of room you get, but qu...",6.0,rooms are nicely decorated in the style of the...,a newspaper/folder received at check-in contai...,0
773285,Tivoli Hotel,Copenhagen,Denmark,other,Standard Single Room,5,Solo traveler,2019-06-24,Comfortable bed,7.5,set up and comfort of the room,room was not cleaned every day - i broke some ...,0


In [7]:
df_cleaned.dtypes

Hotel_Name                       object
City                             object
Country                          object
Reviewer_Origin                  object
Unit_Size                        object
Length_of_stay_nights             int64
Group_Size                       object
Review_Date              datetime64[ns]
Review_Title                     object
Review_Score                    float64
Positive                         object
Negative                         object
num_of_people_helpful             int64
dtype: object

In [8]:
df_cleaned.columns

Index(['Hotel_Name', 'City', 'Country', 'Reviewer_Origin', 'Unit_Size',
       'Length_of_stay_nights', 'Group_Size', 'Review_Date', 'Review_Title',
       'Review_Score', 'Positive', 'Negative', 'num_of_people_helpful'],
      dtype='object')

In [9]:
# 'Positive', 'Negative' are removed

df_NN = df_cleaned[['City', 'Country', 'Reviewer_Origin',
       'Group_Size', 'Length_of_stay_nights', 'num_of_people_helpful',  
       'Review_Score']]
df_NN.dropna(inplace=True)
df_NN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_NN.dropna(inplace=True)


Unnamed: 0,City,Country,Reviewer_Origin,Group_Size,Length_of_stay_nights,num_of_people_helpful,Review_Score
0,Amsterdam,Netherlands,South Africa,Solo traveler,3,0,10.0
1,Amsterdam,Netherlands,Spain,Solo traveler,2,0,8.0
2,Amsterdam,Netherlands,Poland,Solo traveler,2,0,9.0
3,Amsterdam,Netherlands,Iceland,Group,3,0,7.0
4,Amsterdam,Netherlands,United States of America,Family,1,0,7.0
...,...,...,...,...,...,...,...
774056,Copenhagen,Denmark,Lithuania,Group,2,0,8.8
774057,Copenhagen,Denmark,Czech Republic,Group,3,0,8.3
774058,Copenhagen,Denmark,Australia,Family,3,0,8.3
774059,Copenhagen,Denmark,Thailand,Couple,3,0,7.0


In [10]:
# example of ordinal encoding for a neural network
from tensorflow import keras
from tensorflow.keras import models
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

In [11]:
df_NN[["num_of_people_helpful","Length_of_stay_nights"]] = df_NN[["num_of_people_helpful","Length_of_stay_nights"]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
df_NN.dtypes

City                      object
Country                   object
Reviewer_Origin           object
Group_Size                object
Length_of_stay_nights      int32
num_of_people_helpful      int32
Review_Score             float64
dtype: object

In [13]:
df_NN.columns

Index(['City', 'Country', 'Reviewer_Origin', 'Group_Size',
       'Length_of_stay_nights', 'num_of_people_helpful', 'Review_Score'],
      dtype='object')

In [14]:
X = df_NN[['City', 'Country', 'Reviewer_Origin', 'Group_Size',
       'Length_of_stay_nights', 'num_of_people_helpful'
       ]]
Y = df_NN["Review_Score"]

In [15]:
X_train,X_test,y_train,y_test = train_test_split(df_NN.drop(columns=['Review_Score']),df_NN['Review_Score'],test_size=0.3)
print('Train/Test Sizes : ',X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Train/Test Sizes :  (541842, 6) (232219, 6) (541842,) (232219,)


In [16]:
# Creation of neural network
neural_regressor = models.Sequential(
    [
        keras.layers.Dense(224, activation="relu", input_shape=(224,)),
        keras.layers.Dense(112, activation="relu"),
        keras.layers.Dense(52, activation="relu"),
        keras.layers.Dense(1)
    ]
)

neural_regressor.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               50400     
                                                                 
 dense_1 (Dense)             (None, 112)               25200     
                                                                 
 dense_2 (Dense)             (None, 52)                5876      
                                                                 
 dense_3 (Dense)             (None, 1)                 53        
                                                                 
Total params: 81,529
Trainable params: 81,529
Non-trainable params: 0
_________________________________________________________________


In [17]:
#pip install scikeras

In [18]:
# Calling of scikeras API
from scikeras.wrappers import KerasRegressor

scikeras_regressor = KerasRegressor(model=neural_regressor,
                                    loss="mean_squared_error",
                                    optimizer="adam",
                                    verbose=1,
                                    epochs=50,
                                    batch_size= 40,
                                    warm_start= True
                                  )

In [19]:
# Creation of pipeline process, Seperation of numeric, categorical and ordinal variables

numeric_features = ['Length_of_stay_nights', 'num_of_people_helpful']
numeric_transformer = MinMaxScaler()

categorical_features = ['City', 'Country', 'Reviewer_Origin']
categorical_transformer = OneHotEncoder()

ordinal_features = ['Group_Size']
ordinal_transformer = OrdinalEncoder()


#text_features = ["Positive","Negative"]
#tfid_transformer = TfidfVectorizer(stop_words = 'english', max_features= 5000)


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
        #("text", tfid_transformer, text_features)        
    ], 
    sparse_threshold=0
)


In [20]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scale", StandardScaler()),
        ('Model', scikeras_regressor)
        ]
)

In [21]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060


In [22]:
## Declare Hyperparameters Range
# Tested batch_size: 10, 20, 40, 60, 80, 100
# Tested Optimizers: Adam, SGD
# Tested epochs: 10, 20, 50, 100
# 
# Best Params : {'Model__batch_size': 20, 'Model__epochs': 20, 'Model__optimizer': 'adam
# Test  R^2 : 0.017656446414898386

# Best Params : {'Model__batch_size': 40, 'Model__epochs': 50, 'Model__optimizer': 'adam'}
# Test  R^2 : 0.018041664269548896

params = {
    "Model__batch_size": [10, 20, 40, 60, 80, 100],
    "Model__optimizer": ["adam"],
    "Model__epochs": [10, 20, 50, 100]
}

In [23]:
# Used to find the optimal parameters
# Already run, best one is {'Model__batch_size': 40, 'Model__epochs': 50, 'Model__optimizer': 'adam'}
#grid = GridSearchCV(pipeline, param_grid = params, scoring="r2", n_jobs = 3, verbose=10)

In [28]:
pipeline

In [24]:
# Used to run just the best model, not all of gridsearch
pipeline.fit(X_train, y_train)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [25]:
from sklearn.metrics import mean_squared_error

print("Train MSE : {}".format(mean_squared_error(y_train, pipeline.predict(X_train))))
print("Test  MSE : {}".format(mean_squared_error(y_test, pipeline.predict(X_test))))

Train MSE : 3.9989620076854955
Test  MSE : 4.059540653602671


In [26]:
print("\nTrain R^2 : {}".format(pipeline.score(X_train, y_train)))
print("Test  R^2 : {}".format(pipeline.score(X_test, y_test)))


Train R^2 : 0.02672692975315405
Test  R^2 : 0.016020503102327388


In [None]:
#print("Best Score  : {}".format(grid.best_score_))
#print("Best Params : {}".format(grid.best_params_))

Best Score  : 0.01718233798467461
Best Params : {'Model__batch_size': 40, 'Model__epochs': 50, 'Model__optimizer': 'adam'}


In [None]:
# With out standard scaler
#from sklearn.metrics import mean_squared_error

#print("Train MSE : {}".format(mean_squared_error(y_train, grid.predict(X_train))))
#print("Test  MSE : {}".format(mean_squared_error(y_test, grid.predict(X_test))))

Train MSE : 4.00225861110458
Test  MSE : 4.052882423668592


In [None]:
#print("\nTrain R^2 : {}".format(grid.score(X_train, y_train)))
#print("Test  R^2 : {}".format(grid.score(X_test, y_test)))


Train R^2 : 0.025751127790847428
Test  R^2 : 0.018041664269548896


In [None]:
#scikeras_regressor.model.save("keras_regressor_30hrs")

In [None]:
#grid.get_params

<bound method BaseEstimator.get_params of GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(sparse_threshold=0,
                                                          transformers=[('num',
                                                                         MinMaxScaler(),
                                                                         ['Length_of_stay_nights',
                                                                          'num_of_people_helpful']),
                                                                        ('cat',
                                                                         OneHotEncoder(),
                                                                         ['City',
                                                                          'Country',
                                                                          'Reviewer_Origin']),
                       

In [27]:
allscores= pd.read_csv("gridsearchCV_results.csv", index_col=0)
allscores

Unnamed: 0_level_0,std_fit_time,mean_score_time,std_score_time,param_Model__batch_size,param_Model__epochs,param_Model__optimizer,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,Unnamed: 16
mean_fit_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0.0,3027.447814,63.049185,15.01296,7.648083,20.0,50.0,adam,"{'Model__batch_size': 20, 'Model__epochs': 50,...",0.015833,0.016437,0.019686,0.017207,0.014979,0.016828,0.001605,2.0
1.0,2738.616238,18.25243,9.333782,0.123211,20.0,50.0,sgd,"{'Model__batch_size': 20, 'Model__epochs': 50,...",,,,,,,,18.0
2.0,7561.165366,1988.995056,11.529417,1.729778,20.0,100.0,adam,"{'Model__batch_size': 20, 'Model__epochs': 100...",0.01738,0.010837,0.016469,0.016564,0.015994,0.015449,0.002349,5.0
3.0,5324.621817,35.19113,9.255937,0.143706,20.0,100.0,sgd,"{'Model__batch_size': 20, 'Model__epochs': 100...",,,-5.7e-05,,,,,17.0
4.0,1475.069366,7.383413,5.616255,0.644619,40.0,50.0,adam,"{'Model__batch_size': 40, 'Model__epochs': 50,...",0.017267,0.014205,0.019329,0.018286,0.016825,0.017182,0.001722,1.0
5.0,1360.888494,6.171748,6.124391,0.538755,40.0,50.0,sgd,"{'Model__batch_size': 40, 'Model__epochs': 50,...",,0.012313,,0.012639,0.014305,,,16.0
6.0,8255.756581,6398.756802,5.315254,0.170176,40.0,100.0,adam,"{'Model__batch_size': 40, 'Model__epochs': 100...",0.016909,0.013964,0.019595,0.011924,0.010014,0.014481,0.00343,9.0
7.0,2800.147782,7.992454,5.70306,0.733097,40.0,100.0,sgd,"{'Model__batch_size': 40, 'Model__epochs': 100...",0.004048,,,0.002703,,,,15.0
8.0,1032.205235,23.787302,4.346321,0.47848,60.0,50.0,adam,"{'Model__batch_size': 60, 'Model__epochs': 50,...",0.017426,0.010994,0.017545,0.017307,0.016675,0.015989,0.002516,4.0
9.0,925.460727,12.946176,3.979057,0.273915,60.0,50.0,sgd,"{'Model__batch_size': 60, 'Model__epochs': 50,...",0.010867,0.013756,,0.013898,0.013978,,,14.0
