In [48]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report , confusion_matrix , accuracy_score, f1_score,roc_auc_score, make_scorer

from hyperopt import fmin,hp,tpe,Trials,space_eval,STATUS_OK
from hyperopt.early_stop import no_progress_loss
import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Aspect extraction method 2
## Some important stuff (xgboost used here is version 1.5.1, seems that the newer versions cannot accept negative numbers as class labels)

### Load train,val,test sets
- subset the extraction methods and the actual aspect scores 

In [49]:
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/val.csv")
test = pd.read_csv("data/test.csv")

# replace all NULL values with empty string, (NULL values means that there is no part of text in the review mentioning that particular aspect, i.e the sentiment should be neutral)
train.loc[train.food_aspect_extraction_2.isna(), "food_aspect_extraction_2"] = ""
train.loc[train.service_aspect_extraction_2.isna(), "service_aspect_extraction_2"] = ""
train.loc[train.ambience_aspect_extraction_2.isna(), "ambience_aspect_extraction_2"] = ""

val.loc[val.food_aspect_extraction_2.isna(), "food_aspect_extraction_2"] = ""
val.loc[val.service_aspect_extraction_2.isna(), "service_aspect_extraction_2"] = ""
val.loc[val.ambience_aspect_extraction_2.isna(), "ambience_aspect_extraction_2"] = ""

test.loc[test.food_aspect_extraction_2.isna(), "food_aspect_extraction_2"] = ""
test.loc[test.service_aspect_extraction_2.isna(), "service_aspect_extraction_2"] = ""
test.loc[test.ambience_aspect_extraction_2.isna(), "ambience_aspect_extraction_2"] = ""



#extra features (other than text)
train_rating, val_rating, test_rating = train.rating, val.rating, test.rating

train.head(2)

Unnamed: 0.1,Unnamed: 0,id_review,caption,relative_date,retrieval_date,rating,username,n_review_user,n_photo_user,url_user,...,other_sentiment,food_aspect_extraction_1,service_aspect_extraction_1,ambience_aspect_extraction_1,food_aspect_extraction_2,service_aspect_extraction_2,ambience_aspect_extraction_2,food_aspect_extraction_3,service_aspect_extraction_3,ambience_aspect_extraction_3
0,0,ChZDSUhNMG9nS0VJQ0FnSUQ4bXFuSU53EAE,Staffs are rude. Food served cold. Clearly ove...,2 years ago,48:59.1,2,Vanessa Nee,0,0,https://www.google.com/maps/contrib/1081475767...,...,-1,food serve cold,staff rude,,,,,,,
1,1,ChZDSUhNMG9nS0VJQ0FnSUQybFBEUVVREAE,Overrated. Good for Instagram photos. Queue sy...,10 months ago,14:28.0,2,Gwen Goh,0,0,https://www.google.com/maps/contrib/1167846711...,...,0,queue system confusing i dine order,person ask i takeout queue staff say free sit ...,overrate good instagram photo,,,,,,


In [50]:
X_train_food_aspect = train.food_aspect_extraction_2
Y_train_food_aspect = train.food_sentiment
print("Food Aspect")
print(Y_train_food_aspect.value_counts() , "\n")

X_train_service_aspect = train.service_aspect_extraction_2
Y_train_service_aspect = train.service_sentiment
print("Service Aspect")
print(Y_train_service_aspect.value_counts() , "\n")

X_train_ambience_aspect = train.ambience_aspect_extraction_2
Y_train_ambience_aspect = train.ambience_sentiment
print("Ambience Aspect")
print(Y_train_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    1839
 0     700
-1     261
Name: food_sentiment, dtype: int64 

Service Aspect
 0    1960
 1     560
-1     280
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    1847
 1     860
-1      93
Name: ambience_sentiment, dtype: int64 



In [51]:
X_val_food_aspect = val.food_aspect_extraction_2
Y_val_food_aspect = val.food_sentiment
print("Food Aspect")
print(Y_val_food_aspect.value_counts() , "\n")

X_val_service_aspect = val.service_aspect_extraction_2
Y_val_service_aspect = val.service_sentiment
print("Service Aspect")
print(Y_val_service_aspect.value_counts() , "\n")

X_val_ambience_aspect = val.ambience_aspect_extraction_2
Y_val_ambience_aspect = val.ambience_sentiment
print("Ambience Aspect")
print(Y_val_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    315
 0    140
-1     45
Name: food_sentiment, dtype: int64 

Service Aspect
 0    353
 1     87
-1     60
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    334
 1    149
-1     17
Name: ambience_sentiment, dtype: int64 



In [52]:
X_test_food_aspect = test.food_aspect_extraction_2
Y_test_food_aspect = test.food_sentiment
print("Food Aspect")
print(Y_test_food_aspect.value_counts() , "\n")

X_test_service_aspect = test.service_aspect_extraction_2
Y_test_service_aspect = test.service_sentiment
print("Service Aspect")
print(Y_test_service_aspect.value_counts() , "\n")

X_test_ambience_aspect = test.ambience_aspect_extraction_2
Y_test_ambience_aspect = test.ambience_sentiment
print("Ambience Aspect")
print(Y_test_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    410
 0    141
-1     41
Name: food_sentiment, dtype: int64 

Service Aspect
 0    411
 1    131
-1     50
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    380
 1    190
-1     22
Name: ambience_sentiment, dtype: int64 



# Bag of Words
### Food Aspect
* BOW encoding + rating (out of 5 stars) as features

In [53]:
models = {} # for storing all the models
results_table = {
    "model_name":[],
    "train_accuracy":[],
    "test_accuracy":[],
    "train_weighted_avg_f1":[],
    "test_weghted_avg_f1":[],
    "train_NEG_precision":[],
    "test_NEG_precision":[],
    "train_NEG_recall":[],
    "test_NEG_recall":[],
    "train_NEU_precision":[],
    "test_NEU_precision":[],
    "train_NEU_recall":[],
    "test_NEU_recall":[],
    "train_POS_precision":[],
    "test_POS_precision":[],
    "train_POS_recall":[],
    "test_POS_recall":[]

}

In [54]:
# Food aspect
food_aspect_bow = CountVectorizer()
X_train_food_aspect_bow = food_aspect_bow.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_bow = np.hstack((X_train_food_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [55]:
X_val_food_aspect_bow = food_aspect_bow.transform(X_val_food_aspect).toarray()
X_val_food_aspect_bow = np.hstack((X_val_food_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_bow

array([[0, 0, 0, ..., 1, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [56]:
X_test_food_aspect_bow = food_aspect_bow.transform(X_test_food_aspect).toarray()
X_test_food_aspect_bow = np.hstack((X_test_food_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 1, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [57]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_logistic = LogisticRegression(**params)
bow_food_aspect_logistic.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the model and their encoders
model_name = "BOW_food_LgstcRegression"
models[model_name] = (bow_food_aspect_logistic, food_aspect_bow)



# store the results in a table
# performance on test set


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_logistic.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_logistic.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 44%|████▍     | 22/50 [00:23<00:29,  1.07s/trial, best loss: -0.7104075005789108]
Selected Params:
{'C': 0.025818073682339793, 'max_iter': 111}


In [58]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 100, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_rf = RandomForestClassifier(**params)
bow_food_aspect_rf.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the mode

model_name = "BOW_food_RF"
models[model_name] = (bow_food_aspect_rf, food_aspect_bow)


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_rf.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_rf.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])



 86%|████████▌ | 86/100 [01:08<00:11,  1.26trial/s, best loss: -0.6902769397961471]
Selected Params:
{'ccp_alpha': 0.0010689707853670741, 'max_depth': 199, 'n_estimators': 206}


In [59]:
#XBG Classifier class_weight = {1: 0.65, 0: 0.25, -1:10}


XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",500,1000),
    "eta" : hp.uniform("eta", 0.01,0.2),
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(10)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_xgb = XGBClassifier(**params)
bow_food_aspect_xgb.fit(X_train_food_aspect_bow,Y_train_food_aspect) 


model_name = "BOW_food_XGB"
models[model_name] = (bow_food_aspect_xgb, food_aspect_bow)

train_report = classification_report(Y_train_food_aspect, bow_food_aspect_xgb.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_xgb.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 62%|██████▏   | 31/50 [20:45<12:43, 40.17s/trial, best loss: -0.7202240672009143]
Selected Params:
{'eta': 0.02131351124530673, 'max_depth': 3, 'n_estimators': 935}


### Service aspect

In [60]:
service_aspect_bow = CountVectorizer()
X_train_service_aspect_bow = service_aspect_bow.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_bow = np.hstack((X_train_service_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [61]:
X_val_service_aspect_bow = service_aspect_bow.transform(X_val_service_aspect).toarray()
X_val_service_aspect_bow = np.hstack((X_val_service_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [62]:
X_test_service_aspect_bow = service_aspect_bow.transform(X_test_service_aspect).toarray()
X_test_service_aspect_bow = np.hstack((X_test_service_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [63]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
bow_service_aspect_logistic.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_LgstcRegression"
models[model_name] = (bow_service_aspect_logistic, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_logistic.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_logistic.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 34%|███▍      | 17/50 [00:06<00:13,  2.49trial/s, best loss: -0.5550041169736385]
Selected Params:
{'C': 0.19298183554213208, 'max_iter': 269}


In [64]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_rf = RandomForestClassifier(**params)
bow_service_aspect_rf.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_RF"
models[model_name] = (bow_service_aspect_rf, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_rf.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_rf.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:19<00:00,  2.52trial/s, best loss: -0.8189195962951777]
Selected Params:
{'ccp_alpha': 0.0010124039198759769, 'max_depth': 152, 'n_estimators': 193}


In [65]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_xgb = XGBClassifier(**params)
bow_service_aspect_xgb.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the model
model_name = "BOW_service_XGB"
models[model_name] = (bow_service_aspect_xgb, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_xgb.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_xgb.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 44%|████▍     | 22/50 [06:27<08:12, 17.60s/trial, best loss: -0.8277438388600629]
Selected Params:
{'eta': 0.08933452275128682, 'max_depth': 1, 'n_estimators': 419}


### Ambience Aspect

In [66]:
ambience_aspect_bow = CountVectorizer()
X_train_ambience_aspect_bow = ambience_aspect_bow.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_bow = np.hstack((X_train_ambience_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [67]:
X_val_ambience_aspect_bow = ambience_aspect_bow.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_bow = np.hstack((X_val_ambience_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [68]:
X_test_ambience_aspect_bow = ambience_aspect_bow.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_bow = np.hstack((X_test_ambience_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [69]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_logistic = LogisticRegression(**params)
bow_ambience_aspect_logistic.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the model
model_name = "BOW_ambience_LgstcRegression"
models[model_name] = (bow_ambience_aspect_logistic, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_logistic.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_logistic.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 34%|███▍      | 17/50 [00:05<00:11,  2.87trial/s, best loss: -0.729064057487185]
Selected Params:
{'C': 0.15148467846800184, 'max_iter': 241}


In [70]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_rf = RandomForestClassifier(**params)
bow_ambience_aspect_rf.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_RF"
models[model_name] = (bow_ambience_aspect_rf, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_rf.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_rf.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:20<00:00,  2.44trial/s, best loss: -0.7437442110383367]
Selected Params:
{'ccp_alpha': 0.0011409082646532465, 'max_depth': 191, 'n_estimators': 286}


In [71]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_xgb = XGBClassifier(**params)
bow_ambience_aspect_xgb.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_XGB"
models[model_name] = (bow_ambience_aspect_xgb, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_xgb.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_xgb.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 74%|███████▍  | 37/50 [12:22<04:20, 20.07s/trial, best loss: -0.7513904604782047]
Selected Params:
{'eta': 0.142068171783255, 'max_depth': 4, 'n_estimators': 958}


# Tf-idf model
### Food aspect

In [72]:
food_aspect_tfidf = TfidfVectorizer()
X_train_food_aspect_tfidf = food_aspect_tfidf.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_tfidf = np.hstack((X_train_food_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [73]:
X_val_food_aspect_tfidf = food_aspect_tfidf.transform(X_val_food_aspect).toarray()
X_val_food_aspect_tfidf = np.hstack((X_val_food_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_tfidf

array([[0.        , 0.        , 0.        , ..., 0.60483196, 0.        ,
        5.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        4.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        4.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ]])

In [74]:
X_test_food_aspect_tfidf = food_aspect_tfidf.transform(X_test_food_aspect).toarray()
X_test_food_aspect_tfidf = np.hstack((X_test_food_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       [0.        , 0.        , 0.        , ..., 0.48101005, 0.        ,
        3.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        4.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ]])

In [75]:
# Logistic model

LOGISTIC_search_space = {
    "C": hp.uniform("C",0.5,2),
    "max_iter" : hp.randint("max_iter",100,200),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
tfidf_food_aspect_logistic.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 



model_name = "TFIDF_food_LgstcRegression"
models[model_name] = (tfidf_food_aspect_logistic, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_logistic.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_logistic.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 30%|███       | 15/50 [00:30<01:10,  2.03s/trial, best loss: -0.58342804675286]
Selected Params:
{'C': 1.2006157106503257, 'max_iter': 125}


In [76]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_rf = RandomForestClassifier(**params)
tfidf_food_aspect_rf.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store the mode

model_name = "TFIDF_food_RF"
models[model_name] = (tfidf_food_aspect_rf, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_rf.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_rf.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:49<00:00,  1.00trial/s, best loss: -0.700783957866268] 
Selected Params:
{'ccp_alpha': 0.0010803152656957882, 'max_depth': 158, 'n_estimators': 110}


In [77]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_xgb = XGBClassifier(**params)
tfidf_food_aspect_xgb.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store themodel

model_name = "tfidf_food_XGB"
models[model_name] = (tfidf_food_aspect_xgb, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_xgb.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_xgb.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 80%|████████  | 40/50 [26:16<06:34, 39.42s/trial, best loss: -0.7214128071481757]
Selected Params:
{'eta': 0.11096634559815331, 'max_depth': 1, 'n_estimators': 837}


### Service aspect

In [78]:
service_aspect_tfidf = TfidfVectorizer()
X_train_service_aspect_tfidf = service_aspect_tfidf.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_tfidf = np.hstack((X_train_service_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [79]:
X_val_service_aspect_tfidf = service_aspect_tfidf.transform(X_val_service_aspect).toarray()
X_val_service_aspect_tfidf = np.hstack((X_val_service_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [80]:
X_test_service_aspect_tfidf = service_aspect_tfidf.transform(X_test_service_aspect).toarray()
X_test_service_aspect_tfidf = np.hstack((X_test_service_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [81]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_logistic = LogisticRegression(**params)
tfidf_service_aspect_logistic.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_LgstcRegression"
models[model_name] = (tfidf_service_aspect_logistic, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_logistic.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_logistic.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 34%|███▍      | 17/50 [00:05<00:10,  3.30trial/s, best loss: -0.8224990149169716]
Selected Params:
{'C': 0.19135746716814583, 'max_iter': 285}


In [82]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_rf = RandomForestClassifier(**params)
tfidf_service_aspect_rf.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_RF"
models[model_name] = (tfidf_service_aspect_rf, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_rf.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_rf.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:20<00:00,  2.44trial/s, best loss: -0.8083557339751585]
Selected Params:
{'ccp_alpha': 0.0010680664927856785, 'max_depth': 117, 'n_estimators': 109}


In [83]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_xgb = XGBClassifier(**params)
tfidf_service_aspect_xgb.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode
model_name = "tfidf_service_XGB"
models[model_name] = (tfidf_service_aspect_xgb, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_xgb.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_xgb.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 46%|████▌     | 23/50 [08:26<09:55, 22.04s/trial, best loss: -0.8272981894568371]
Selected Params:
{'eta': 0.07247922988230135, 'max_depth': 7, 'n_estimators': 810}


### Ambience

In [84]:
ambience_aspect_tfidf = TfidfVectorizer()
X_train_ambience_aspect_tfidf = ambience_aspect_tfidf.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_tfidf = np.hstack((X_train_ambience_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [85]:
X_val_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_tfidf = np.hstack((X_val_ambience_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [86]:
X_test_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_tfidf = np.hstack((X_test_ambience_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [87]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_logistic = LogisticRegression(**params)
tfidf_ambience_aspect_logistic.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_LgstcRegression"
models[model_name] = (tfidf_ambience_aspect_logistic, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 60%|██████    | 30/50 [00:21<00:14,  1.38trial/s, best loss: -0.7405475078424538]
Selected Params:
{'C': 0.19637674478413686, 'max_iter': 145}


In [88]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_rf = RandomForestClassifier(**params)
tfidf_ambience_aspect_rf.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_RF"
models[model_name] = (tfidf_ambience_aspect_rf, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:20<00:00,  2.42trial/s, best loss: -0.7455186168402423]
Selected Params:
{'ccp_alpha': 0.0033679429370978807, 'max_depth': 72, 'n_estimators': 119}


In [89]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_xgb = XGBClassifier(**params)
tfidf_ambience_aspect_xgb.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "tfidf_ambience_XGB"
models[model_name] = (tfidf_ambience_aspect_xgb, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 84%|████████▍ | 42/50 [10:50<02:03, 15.49s/trial, best loss: -0.7493146508655746]
Selected Params:
{'eta': 0.011580984356128116, 'max_depth': 5, 'n_estimators': 615}


# BERT

In [90]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [91]:
# empty string means no mention of that aspect in the review, therefore we think it shud get a neutral rating. we replace the empty string with "normal" to get a neutral rating

test.loc[test.food_aspect_extraction_3 == "", "food_aspect_extraction_3"] = "normal"
test.loc[test.service_aspect_extraction_3 == "", "service_aspect_extraction_3"] = "normal"
test.loc[test.ambience_aspect_extraction_3 == "", "ambience_aspect_extraction_3"] = "normal"
        

In [92]:
def get_sentiment(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    sentiment = int(torch.argmax(result.logits)+1)
    # the out put is range from 1~5 , we classifiy 1~2 as negative , 3 as neutral , 4 and 5 as positvive
    if sentiment >=4 :
        return 1
    elif sentiment == 3:
        return 0
    else:
        return -1

In [93]:
# Food aspect
preds = test.food_aspect_extraction_2.apply(lambda x : get_sentiment(x))

model_name = "food_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

In [94]:
# service aspect
preds = test.service_aspect_extraction_2.apply(lambda x : get_sentiment(x))

model_name = "service_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

In [95]:
# ambience aspect
preds = test.ambience_aspect_extraction_2.apply(lambda x : get_sentiment(x))

model_name = "ambience_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

# Results


In [96]:
res = pd.DataFrame(results_table)
res

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1,train_NEG_precision,test_NEG_precision,train_NEG_recall,test_NEG_recall,train_NEU_precision,test_NEU_precision,train_NEU_recall,test_NEU_recall,train_POS_precision,test_POS_precision,train_POS_recall,test_POS_recall
0,BOW_food_LgstcRegression,0.740714,0.75,0.711517,0.717715,0.697917,0.785714,0.256705,0.268293,0.53507,0.529412,0.381429,0.319149,0.789116,0.787018,0.946166,0.946341
1,BOW_food_RF,0.762143,0.746622,0.736443,0.710676,0.987805,0.857143,0.310345,0.146341,0.590457,0.544444,0.424286,0.347518,0.792777,0.781818,0.954867,0.943902
2,BOW_food_XGB,0.773571,0.744932,0.754274,0.720634,0.824818,0.666667,0.43295,0.390244,0.603143,0.522222,0.438571,0.333333,0.810585,0.790795,0.949429,0.921951
3,BOW_service_LgstcRegression,0.500357,0.472973,0.56439,0.542859,0.178191,0.147436,0.957143,0.92,0.845618,0.803571,0.433163,0.437956,0.972603,0.964286,0.507143,0.412214
4,BOW_service_RF,0.844643,0.819257,0.826014,0.790304,0.873874,0.916667,0.346429,0.22,0.835421,0.80315,0.986735,0.992701,0.893048,0.916667,0.596429,0.503817
5,BOW_service_XGB,0.846429,0.815878,0.828886,0.78643,0.852459,0.785714,0.371429,0.22,0.835132,0.801572,0.987245,0.992701,0.916898,0.927536,0.591071,0.48855
6,BOW_ambience_LgstcRegression,0.783929,0.771959,0.753242,0.739255,0.0,0.0,0.0,0.0,0.768303,0.753593,0.965891,0.965789,0.859833,0.857143,0.477907,0.473684
7,BOW_ambience_RF,0.785,0.766892,0.759473,0.739806,0.0,0.0,0.0,0.0,0.780914,0.763441,0.943692,0.934211,0.801056,0.779528,0.52907,0.521053
8,BOW_ambience_XGB,0.806429,0.773649,0.781345,0.745424,0.875,0.5,0.075269,0.045455,0.784876,0.759916,0.977802,0.957895,0.906314,0.837838,0.517442,0.489474
9,TFIDF_food_LgstcRegression,0.691071,0.682432,0.611718,0.614215,0.346099,0.274809,0.934866,0.878049,0.0,0.0,0.0,0.0,0.80716,0.798265,0.919521,0.897561


In [97]:
# test results

In [98]:
test_results = res.columns[res.columns.str.contains("test")]
test_results = ["model_name"] + test_results.to_list()
res[test_results]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall,test_POS_precision,test_POS_recall
0,BOW_food_LgstcRegression,0.75,0.717715,0.785714,0.268293,0.529412,0.319149,0.787018,0.946341
1,BOW_food_RF,0.746622,0.710676,0.857143,0.146341,0.544444,0.347518,0.781818,0.943902
2,BOW_food_XGB,0.744932,0.720634,0.666667,0.390244,0.522222,0.333333,0.790795,0.921951
3,BOW_service_LgstcRegression,0.472973,0.542859,0.147436,0.92,0.803571,0.437956,0.964286,0.412214
4,BOW_service_RF,0.819257,0.790304,0.916667,0.22,0.80315,0.992701,0.916667,0.503817
5,BOW_service_XGB,0.815878,0.78643,0.785714,0.22,0.801572,0.992701,0.927536,0.48855
6,BOW_ambience_LgstcRegression,0.771959,0.739255,0.0,0.0,0.753593,0.965789,0.857143,0.473684
7,BOW_ambience_RF,0.766892,0.739806,0.0,0.0,0.763441,0.934211,0.779528,0.521053
8,BOW_ambience_XGB,0.773649,0.745424,0.5,0.045455,0.759916,0.957895,0.837838,0.489474
9,TFIDF_food_LgstcRegression,0.682432,0.614215,0.274809,0.878049,0.0,0.0,0.798265,0.897561


In [99]:
# training rest
train_results = res.columns[res.columns.str.contains("train")]
train_results = ["model_name"] + train_results.to_list()
res[train_results]

Unnamed: 0,model_name,train_accuracy,train_weighted_avg_f1,train_NEG_precision,train_NEG_recall,train_NEU_precision,train_NEU_recall,train_POS_precision,train_POS_recall
0,BOW_food_LgstcRegression,0.740714,0.711517,0.697917,0.256705,0.53507,0.381429,0.789116,0.946166
1,BOW_food_RF,0.762143,0.736443,0.987805,0.310345,0.590457,0.424286,0.792777,0.954867
2,BOW_food_XGB,0.773571,0.754274,0.824818,0.43295,0.603143,0.438571,0.810585,0.949429
3,BOW_service_LgstcRegression,0.500357,0.56439,0.178191,0.957143,0.845618,0.433163,0.972603,0.507143
4,BOW_service_RF,0.844643,0.826014,0.873874,0.346429,0.835421,0.986735,0.893048,0.596429
5,BOW_service_XGB,0.846429,0.828886,0.852459,0.371429,0.835132,0.987245,0.916898,0.591071
6,BOW_ambience_LgstcRegression,0.783929,0.753242,0.0,0.0,0.768303,0.965891,0.859833,0.477907
7,BOW_ambience_RF,0.785,0.759473,0.0,0.0,0.780914,0.943692,0.801056,0.52907
8,BOW_ambience_XGB,0.806429,0.781345,0.875,0.075269,0.784876,0.977802,0.906314,0.517442
9,TFIDF_food_LgstcRegression,0.691071,0.611718,0.346099,0.934866,0.0,0.0,0.80716,0.919521


# Best models of aspect extraction 3, using weighted f1 score
- for food sentiment : BOW_FOOD_XGB
- for service sentiment : ROW_service_RF
- for ambience sentiment : TFIDF_ambience_RF

In [104]:
res[res.model_name.isin(["BOW_food_RF","TFIDF_service_LgstcRegression","tfidf_ambience_XGB"])][["model_name","train_accuracy","test_accuracy","train_weighted_avg_f1","test_weghted_avg_f1"]]

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1
1,BOW_food_RF,0.762143,0.746622,0.736443,0.710676
12,TFIDF_service_LgstcRegression,0.833929,0.826014,0.824238,0.807728
17,tfidf_ambience_XGB,0.798571,0.777027,0.771537,0.745041


In [106]:
res[res.model_name.isin(["BOW_food_RF","TFIDF_service_LgstcRegression","tfidf_ambience_XGB"])][["model_name","test_POS_precision","test_POS_recall","test_NEG_precision","test_NEG_recall","test_NEU_precision","test_NEU_recall"]]

Unnamed: 0,model_name,test_POS_precision,test_POS_recall,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall
1,BOW_food_RF,0.781818,0.943902,0.857143,0.146341,0.544444,0.347518
12,TFIDF_service_LgstcRegression,0.927536,0.48855,0.741935,0.46,0.817073,0.978102
17,tfidf_ambience_XGB,0.867925,0.484211,0.0,0.0,0.757202,0.968421


In [105]:
res[res.model_name.str.contains("BERT")][["model_name","test_accuracy","test_weghted_avg_f1"]]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1
18,food_BERT,0.344595,0.225662
19,service_BERT,0.320946,0.165303
20,ambience_BERT,0.315878,0.1651


In [110]:
# import os
# import pickle

# path = os.getcwd() + "/extraction_2_models"
# if "extraction_2_models" not in os.listdir():
#     os.mkdir(path)

# for model_name, model in models.items():
#     filename = path + "/" + model_name + ".pkg"
#     pickle.dump(model[0], open(filename, "wb"))

# encodings = {
#     "bow_food": food_aspect_bow,
#     "bow_service": service_aspect_bow,
#     "bow_ambience": ambience_aspect_bow,
#     "tfidf_food": food_aspect_tfidf,
#     "tfidf_service": service_aspect_tfidf,
#     "tfidf_ambience": ambience_aspect_tfidf
# }

# for name, encoder in  encodings.items():
#     filename = path + "/encoding_" + name + ".pkg"
#     pickle.dump(encoder, open(filename, "wb"))