In [None]:
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from nltk.corpus.reader import CategorizedCorpusReader
import nltk
from lime import lime_tabular

In [None]:
# Fix the random seeds for reproducibility.
# tf.random.set_seed only seeds TensorFlow, so NumPy's global generator is seeded
# as well: scikit-learn falls back to it whenever random_state is left unset.
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Load the cleaned reviews; the first spreadsheet column becomes the row index.
df_cleaned = pd.read_excel("all_reviews_cleaned.xlsx", index_col=0)

In [None]:
# Per-column frequency table: every cell is replaced by the number of times its
# value occurs within that same column.
data = df_cleaned.apply(
    lambda column: column.map(column.value_counts())
)
data

Unnamed: 0,Hotel_Name,City,Country,Reviewer_Origin,Unit_Size,Length_of_stay_nights,Group_Size,Review_Date,Review_Title,Review_Score,Positive,Negative,num_of_people_helpful
0,1830.0,62540,62540,26427.0,4284,137077,170817,6032,1.0,236590,1.0,1.0,668125
1,1830.0,62540,62540,6005.0,4284,179651,170817,825,1276.0,98180,1.0,1.0,668125
2,1830.0,62540,62540,6040.0,4284,179651,170817,973,59463.0,103759,1.0,1.0,668125
3,1830.0,62540,62540,1917.0,10036,137077,108428,973,51238.0,50901,1.0,1.0,668125
4,1830.0,62540,62540,77137.0,3607,251214,220530,973,1.0,50901,91.0,1.0,668125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
774056,1512.0,26079,26079,1868.0,152,179651,108428,1408,16694.0,23528,1.0,1.0,668125
774057,1512.0,26079,26079,3882.0,8450,137077,108428,972,53660.0,19578,1.0,1.0,668125
774058,1512.0,26079,26079,31122.0,8450,137077,220530,972,1.0,19578,1.0,1.0,668125
774059,1512.0,26079,26079,8834.0,8450,137077,274286,972,51238.0,50901,1.0,1.0,668125


In [None]:
# A reviewer origin that appears in fewer than 20 reviews is folded into a single
# category called "other"; this allows the NN model to run.
# Rows whose count is not >= 20 (including rows with no count) are the ones replaced.
origin_is_rare = ~(data["Reviewer_Origin"] >= 20)
df_cleaned["Reviewer_Origin"] = df_cleaned["Reviewer_Origin"].mask(origin_is_rare, "other")

In [None]:
# Show the reviews whose origin was folded into the "other" category.
df_cleaned.loc[df_cleaned["Reviewer_Origin"].eq("other")]

Unnamed: 0,Hotel_Name,City,Country,Reviewer_Origin,Unit_Size,Length_of_stay_nights,Group_Size,Review_Date,Review_Title,Review_Score,Positive,Negative,num_of_people_helpful
482,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Standard Room,3,Solo traveler,2019-12-05,Wonderful,9.2,all the rest.,maybe with too many colours. not a lot of opti...,0
723,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Superior Room,1,Solo traveler,2019-08-13,Good,7.1,the hotel and the location. friendly front des...,extremely poor service at the restaurant. sow ...,0
1029,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Superior Room,3,Solo traveler,2022-03-20,very bad,1.0,nothing,very bad mattress\nshower water switch cold an...,0
1125,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,Junior Suite,2,Group,2021-10-24,Amsterdam,2.0,-once reached the hotel 4 p.m my rooms not rea...,-once reached the hotel 4 p.m my rooms not rea...,0
1804,Park Inn by Radisson Amsterdam City West,Amsterdam,Netherlands,other,5 nights · \n\nNovember 2019,5,Solo traveler,2019-11-25,"Enjoyable, but I think a bit expensive.",6.0,it was close to the train station and that mad...,the room safe-box was broken and although i wa...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
772119,Ibsens Hotel,Copenhagen,Denmark,other,Medium Room,2,Group,2019-10-16,Pleasant,6.2,the room was really quiet. the checkin and our...,toilet is integrated into the shower. not very...,0
772748,Alexandra,Copenhagen,Denmark,other,Standard Double Room,3,Solo traveler,2020-03-05,Very Good,8.0,the room under the roof was really cozy and wa...,"breakfast was rather modest, too few warm opti...",0
772817,Alexandra,Copenhagen,Denmark,other,Standard Single Room,2,Group,2021-11-04,"Overpriced for quality of room you get, but qu...",6.0,rooms are nicely decorated in the style of the...,a newspaper/folder received at check-in contai...,0
773285,Tivoli Hotel,Copenhagen,Denmark,other,Standard Single Room,5,Solo traveler,2019-06-24,Comfortable bed,7.5,set up and comfort of the room,room was not cleaned every day - i broke some ...,0


In [None]:
# Inspect the dtype of every column in the cleaned review data.
df_cleaned.dtypes

Hotel_Name                       object
City                             object
Country                          object
Reviewer_Origin                  object
Unit_Size                        object
Length_of_stay_nights             int64
Group_Size                       object
Review_Date              datetime64[ns]
Review_Title                     object
Review_Score                    float64
Positive                         object
Negative                         object
num_of_people_helpful             int64
dtype: object

In [None]:
# List the column names available in the cleaned review data.
df_cleaned.columns

Index(['Hotel_Name', 'City', 'Country', 'Reviewer_Origin', 'Unit_Size',
       'Length_of_stay_nights', 'Group_Size', 'Review_Date', 'Review_Title',
       'Review_Score', 'Positive', 'Negative', 'num_of_people_helpful'],
      dtype='object')

In [None]:
# Keep only the structured columns used by the neural network (the free-text
# 'Positive' / 'Negative' columns are left out) and drop rows with missing values.
# dropna().copy() replaces dropna(inplace=True): df_NN becomes an independent
# DataFrame rather than a slice of df_cleaned, so neither this cell nor later
# column assignments trigger pandas' SettingWithCopyWarning.
df_NN = df_cleaned[['City', 'Country', 'Reviewer_Origin',
       'Group_Size', 'Length_of_stay_nights', 'num_of_people_helpful', #'Positive', 'Negative',
       'Review_Score']].dropna().copy()
df_NN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,City,Country,Reviewer_Origin,Group_Size,Length_of_stay_nights,num_of_people_helpful,Review_Score
0,Amsterdam,Netherlands,South Africa,Solo traveler,3,0,10.0
1,Amsterdam,Netherlands,Spain,Solo traveler,2,0,8.0
2,Amsterdam,Netherlands,Poland,Solo traveler,2,0,9.0
3,Amsterdam,Netherlands,Iceland,Group,3,0,7.0
4,Amsterdam,Netherlands,United States of America,Family,1,0,7.0
...,...,...,...,...,...,...,...
774056,Copenhagen,Denmark,Lithuania,Group,2,0,8.8
774057,Copenhagen,Denmark,Czech Republic,Group,3,0,8.3
774058,Copenhagen,Denmark,Australia,Family,3,0,8.3
774059,Copenhagen,Denmark,Thailand,Couple,3,0,7.0


In [None]:
# example of ordinal encoding for a neural network
from tensorflow import keras
from tensorflow.keras import models
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Store both count-like features as integers. Rebinding df_NN to the result of
# astype (instead of assigning into a two-column subset) avoids pandas'
# SettingWithCopyWarning; all other columns keep their dtype.
df_NN = df_NN.astype({"num_of_people_helpful": int, "Length_of_stay_nights": int})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
# Check the column dtypes of the modelling frame after the integer cast.
df_NN.dtypes

City                      object
Country                   object
Reviewer_Origin           object
Group_Size                object
Length_of_stay_nights      int64
num_of_people_helpful      int64
Review_Score             float64
dtype: object

In [None]:
# List the columns of the modelling frame.
df_NN.columns

Index(['City', 'Country', 'Reviewer_Origin', 'Group_Size',
       'Length_of_stay_nights', 'num_of_people_helpful', 'Review_Score'],
      dtype='object')

In [None]:
# Feature matrix (every modelling column except the score) and regression target.
# The free-text 'Positive' / 'Negative' columns are not part of the features.
X = df_NN.loc[:, [
    'City',
    'Country',
    'Reviewer_Origin',
    'Group_Size',
    'Length_of_stay_nights',
    'num_of_people_helpful',  # , 'Positive', 'Negative'
]]
Y = df_NN.loc[:, "Review_Score"]

In [None]:
# Hold out 30% of the rows for testing. random_state=seed pins the shuffle so the
# same train/test partition is produced on every run.
X_train,X_test,y_train,y_test = train_test_split(
    df_NN.drop(columns=['Review_Score']),
    df_NN['Review_Score'],
    test_size=0.3,
    random_state=seed,
)
print('Train/Test Sizes : ',X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Train/Test Sizes :  (541842, 6) (232219, 6) (541842,) (232219,)


In [None]:
# Creation of neural network: three ReLU hidden layers (224 -> 112 -> 52 units)
# followed by a single output unit without activation. The first layer declares
# an input of 224 features.
neural_regressor = models.Sequential()
neural_regressor.add(keras.layers.Dense(224, activation="relu", input_shape=(224,)))
neural_regressor.add(keras.layers.Dense(112, activation="relu"))
neural_regressor.add(keras.layers.Dense(52, activation="relu"))
neural_regressor.add(keras.layers.Dense(1))

neural_regressor.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               50400     
                                                                 
 dense_1 (Dense)             (None, 112)               25200     
                                                                 
 dense_2 (Dense)             (None, 52)                5876      
                                                                 
 dense_3 (Dense)             (None, 1)                 53        
                                                                 
Total params: 81,529
Trainable params: 81,529
Non-trainable params: 0
_________________________________________________________________


In [None]:
pip install scikeras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Wrap the Keras network with the scikeras API so it can be used as a
# scikit-learn style regressor.
from scikeras.wrappers import KerasRegressor

regressor_options = {
    "model": neural_regressor,
    "loss": "mean_squared_error",
    "verbose": 0,
    "epochs": 10,
    "warm_start": True,
}
scikeras_regressor = KerasRegressor(**regressor_options)

In [None]:
# Creation of the preprocessing step: numeric, categorical and ordinal variables
# are separated and each group is sent through its own transformer.

numeric_features = ['Length_of_stay_nights', 'num_of_people_helpful']
numeric_transformer = MinMaxScaler()

categorical_features = ['City', 'Country', 'Reviewer_Origin']
# handle_unknown="ignore": a category that was absent when the encoder was fitted
# (e.g. missing from one cross-validation training fold) is encoded as all zeros
# at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

ordinal_features = ['Group_Size']
# NOTE(review): OrdinalEncoder maps the Group_Size labels to integer codes, which
# imposes an ordering on them - confirm that an ordering is intended here.
ordinal_transformer = OrdinalEncoder()


#text_features = ["Positive","Negative"]
#tfid_transformer = TfidfVectorizer()


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("ord", ordinal_transformer, ordinal_features),
        #("text", tfid_transformer, text_features)
    ],
    # sparse_threshold=0 makes the combined output always dense.
    sparse_threshold=0
)


In [None]:
# Chain column preprocessing, standard scaling and the wrapped network into a
# single estimator. The step names are used as prefixes by the hyperparameter grid.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("scale", StandardScaler()),
    ('Model', scikeras_regressor),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
## Hyperparameter ranges for the search
# The "Model__" prefix addresses the pipeline step named "Model".

params = dict(
    Model__batch_size=[20, 40],
    Model__optimizer=["adam", "sgd"],
    Model__epochs=[10, 20],
)

In [None]:
# Exhaustive search over `params`, scored by R^2, with n_jobs=-1 (all processors).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="r2",
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Raise TensorFlow's log threshold to ERROR so lower-severity messages are hidden.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
# Run the grid search on the training split only; the test split stays held out.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
# Report the best cross-validated score and the hyperparameters that achieved it.
print(f"Best Score  : {grid.best_score_}")
print(f"Best Params : {grid.best_params_}")

In [None]:
# Mean squared error of the best estimator found by the grid search.
# NOTE(review): this cell was originally labelled "With out standard scaler", but
# the pipeline defined in this notebook contains a StandardScaler step - confirm
# which configuration these numbers refer to.
from sklearn.metrics import mean_squared_error

# The train/test split produces y_train / y_test (lower case); the previously used
# Y_train / Y_test were never defined and raised a NameError.
print("Train MSE : {}".format(mean_squared_error(y_train, grid.predict(X_train))))
print("Test  MSE : {}".format(mean_squared_error(y_test, grid.predict(X_test))))

In [None]:
# R^2 on both splits; grid.score applies the "r2" scorer passed to GridSearchCV.
# The targets are y_train / y_test as produced by the train/test split
# (Y_train / Y_test were never defined and raised a NameError).
print("\nTrain R^2 : {}".format(grid.score(X_train, y_train)))
print("Test  R^2 : {}".format(grid.score(X_test, y_test)))

In [None]:
# Persist the trained network. GridSearchCV fits clones of the pipeline, so the
# scikeras_regressor wrapper created earlier is never fitted itself and its
# `.model` is just the network that was handed to it. The trained Keras model is
# the `model_` attribute of the "Model" step in the refitted best pipeline.
best_regressor = grid.best_estimator_.named_steps["Model"]
best_regressor.model_.save("keras_regressor")