In [107]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report , confusion_matrix , accuracy_score, f1_score,roc_auc_score, make_scorer

from hyperopt import fmin,hp,tpe,Trials,space_eval,STATUS_OK
from hyperopt.early_stop import no_progress_loss
import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Aspect extraction method 3
## Some important stuff (xgboost used here is version 1.5.1, seems that the newer versions cannot accept negative numbers as class labels)

### Load train,val,test sets
- subset the extraction methods and the actual aspect scores 

In [108]:
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/val.csv")
test = pd.read_csv("data/test.csv")

# replace all NULL values with empty string, (NULL values means that there is no part of text in the review mentioning that particular aspect, i.e the sentiment should be neutral)
train.loc[train.food_aspect_extraction_3.isna(), "food_aspect_extraction_3"] = ""
train.loc[train.service_aspect_extraction_3.isna(), "service_aspect_extraction_3"] = ""
train.loc[train.ambience_aspect_extraction_3.isna(), "ambience_aspect_extraction_3"] = ""

val.loc[val.food_aspect_extraction_3.isna(), "food_aspect_extraction_3"] = ""
val.loc[val.service_aspect_extraction_3.isna(), "service_aspect_extraction_3"] = ""
val.loc[val.ambience_aspect_extraction_3.isna(), "ambience_aspect_extraction_3"] = ""

test.loc[test.food_aspect_extraction_3.isna(), "food_aspect_extraction_3"] = ""
test.loc[test.service_aspect_extraction_3.isna(), "service_aspect_extraction_3"] = ""
test.loc[test.ambience_aspect_extraction_3.isna(), "ambience_aspect_extraction_3"] = ""



#extra features (other than text)
train_rating, val_rating, test_rating = train.rating, val.rating, test.rating

train.head(2)

Unnamed: 0.1,Unnamed: 0,id_review,caption,relative_date,retrieval_date,rating,username,n_review_user,n_photo_user,url_user,...,other_sentiment,food_aspect_extraction_1,service_aspect_extraction_1,ambience_aspect_extraction_1,food_aspect_extraction_2,service_aspect_extraction_2,ambience_aspect_extraction_2,food_aspect_extraction_3,service_aspect_extraction_3,ambience_aspect_extraction_3
0,0,ChZDSUhNMG9nS0VJQ0FnSUQ4bXFuSU53EAE,Staffs are rude. Food served cold. Clearly ove...,2 years ago,48:59.1,2,Vanessa Nee,0,0,https://www.google.com/maps/contrib/1081475767...,...,-1,food serve cold,staff rude,,,,,,,
1,1,ChZDSUhNMG9nS0VJQ0FnSUQybFBEUVVREAE,Overrated. Good for Instagram photos. Queue sy...,10 months ago,14:28.0,2,Gwen Goh,0,0,https://www.google.com/maps/contrib/1167846711...,...,0,queue system confusing i dine order,person ask i takeout queue staff say free sit ...,overrate good instagram photo,,,,,,


In [109]:
X_train_food_aspect = train.food_aspect_extraction_3
Y_train_food_aspect = train.food_sentiment
print("Food Aspect")
print(Y_train_food_aspect.value_counts() , "\n")

X_train_service_aspect = train.service_aspect_extraction_3
Y_train_service_aspect = train.service_sentiment
print("Service Aspect")
print(Y_train_service_aspect.value_counts() , "\n")

X_train_ambience_aspect = train.ambience_aspect_extraction_3
Y_train_ambience_aspect = train.ambience_sentiment
print("Ambience Aspect")
print(Y_train_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    1839
 0     700
-1     261
Name: food_sentiment, dtype: int64 

Service Aspect
 0    1960
 1     560
-1     280
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    1847
 1     860
-1      93
Name: ambience_sentiment, dtype: int64 



In [110]:
X_val_food_aspect = val.food_aspect_extraction_3
Y_val_food_aspect = val.food_sentiment
print("Food Aspect")
print(Y_val_food_aspect.value_counts() , "\n")

X_val_service_aspect = val.service_aspect_extraction_3
Y_val_service_aspect = val.service_sentiment
print("Service Aspect")
print(Y_val_service_aspect.value_counts() , "\n")

X_val_ambience_aspect = val.ambience_aspect_extraction_3
Y_val_ambience_aspect = val.ambience_sentiment
print("Ambience Aspect")
print(Y_val_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    315
 0    140
-1     45
Name: food_sentiment, dtype: int64 

Service Aspect
 0    353
 1     87
-1     60
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    334
 1    149
-1     17
Name: ambience_sentiment, dtype: int64 



In [111]:
X_test_food_aspect = test.food_aspect_extraction_3
Y_test_food_aspect = test.food_sentiment
print("Food Aspect")
print(Y_test_food_aspect.value_counts() , "\n")

X_test_service_aspect = test.service_aspect_extraction_3
Y_test_service_aspect = test.service_sentiment
print("Service Aspect")
print(Y_test_service_aspect.value_counts() , "\n")

X_test_ambience_aspect = test.ambience_aspect_extraction_3
Y_test_ambience_aspect = test.ambience_sentiment
print("Ambience Aspect")
print(Y_test_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    410
 0    141
-1     41
Name: food_sentiment, dtype: int64 

Service Aspect
 0    411
 1    131
-1     50
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    380
 1    190
-1     22
Name: ambience_sentiment, dtype: int64 



# Bag of Words
### Food Aspect
* BOW encoding + rating (out of 5 stars) as features

In [112]:
models = {} # for storing all the models
results_table = {
    "model_name":[],
    "train_accuracy":[],
    "test_accuracy":[],
    "train_weighted_avg_f1":[],
    "test_weghted_avg_f1":[],
    "train_NEG_precision":[],
    "test_NEG_precision":[],
    "train_NEG_recall":[],
    "test_NEG_recall":[],
    "train_NEU_precision":[],
    "test_NEU_precision":[],
    "train_NEU_recall":[],
    "test_NEU_recall":[],
    "train_POS_precision":[],
    "test_POS_precision":[],
    "train_POS_recall":[],
    "test_POS_recall":[]

}

In [113]:
# Food aspect
food_aspect_bow = CountVectorizer()
X_train_food_aspect_bow = food_aspect_bow.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_bow = np.hstack((X_train_food_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [114]:
X_val_food_aspect_bow = food_aspect_bow.transform(X_val_food_aspect).toarray()
X_val_food_aspect_bow = np.hstack((X_val_food_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [115]:
X_test_food_aspect_bow = food_aspect_bow.transform(X_test_food_aspect).toarray()
X_test_food_aspect_bow = np.hstack((X_test_food_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [116]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_logistic = LogisticRegression(**params)
bow_food_aspect_logistic.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the model and their encoders
model_name = "BOW_food_LgstcRegression"
models[model_name] = (bow_food_aspect_logistic, food_aspect_bow)



# store the results in a table
# performance on test set


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_logistic.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_logistic.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 38%|███▊      | 19/50 [00:10<00:16,  1.90trial/s, best loss: -0.6996409695644249]
Selected Params:
{'C': 0.1924383664094962, 'max_iter': 190}


In [117]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 100, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_rf = RandomForestClassifier(**params)
bow_food_aspect_rf.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the mode

model_name = "BOW_food_RF"
models[model_name] = (bow_food_aspect_rf, food_aspect_bow)


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_rf.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_rf.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])



 72%|███████▏  | 72/100 [01:37<00:37,  1.36s/trial, best loss: -0.6623075842919296]
Selected Params:
{'ccp_alpha': 0.001204075045261382, 'max_depth': 178, 'n_estimators': 298}


In [118]:
#XBG Classifier class_weight = {1: 0.65, 0: 0.25, -1:10}


XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",500,1000),
    "eta" : hp.uniform("eta", 0.01,0.2),
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(10)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_xgb = XGBClassifier(**params)
bow_food_aspect_xgb.fit(X_train_food_aspect_bow,Y_train_food_aspect) 


model_name = "BOW_food_XGB"
models[model_name] = (bow_food_aspect_xgb, food_aspect_bow)

train_report = classification_report(Y_train_food_aspect, bow_food_aspect_xgb.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_xgb.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 28%|██▊       | 14/50 [10:03<25:51, 43.09s/trial, best loss: -0.7011353092049016]
Selected Params:
{'eta': 0.03378133242868611, 'max_depth': 2, 'n_estimators': 820}


### Service aspect

In [119]:
service_aspect_bow = CountVectorizer()
X_train_service_aspect_bow = service_aspect_bow.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_bow = np.hstack((X_train_service_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [120]:
X_val_service_aspect_bow = service_aspect_bow.transform(X_val_service_aspect).toarray()
X_val_service_aspect_bow = np.hstack((X_val_service_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [121]:
X_test_service_aspect_bow = service_aspect_bow.transform(X_test_service_aspect).toarray()
X_test_service_aspect_bow = np.hstack((X_test_service_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [122]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
bow_service_aspect_logistic.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_LgstcRegression"
models[model_name] = (bow_service_aspect_logistic, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_logistic.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_logistic.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 36%|███▌      | 18/50 [00:11<00:20,  1.60trial/s, best loss: -0.5265202437986021]
Selected Params:
{'C': 0.18729430332693883, 'max_iter': 175}


In [123]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_rf = RandomForestClassifier(**params)
bow_service_aspect_rf.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_RF"
models[model_name] = (bow_service_aspect_rf, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_rf.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_rf.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:27<00:00,  1.79trial/s, best loss: -0.7904205774640559]
Selected Params:
{'ccp_alpha': 0.0011388319247146847, 'max_depth': 92, 'n_estimators': 115}


In [124]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_xgb = XGBClassifier(**params)
bow_service_aspect_xgb.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the model
model_name = "BOW_service_XGB"
models[model_name] = (bow_service_aspect_xgb, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_xgb.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_xgb.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 78%|███████▊  | 39/50 [13:15<03:44, 20.40s/trial, best loss: -0.781341823253239] 
Selected Params:
{'eta': 0.034770946285323165, 'max_depth': 1, 'n_estimators': 553}


### Ambience Aspect

In [125]:
ambience_aspect_bow = CountVectorizer()
X_train_ambience_aspect_bow = ambience_aspect_bow.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_bow = np.hstack((X_train_ambience_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [126]:
X_val_ambience_aspect_bow = ambience_aspect_bow.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_bow = np.hstack((X_val_ambience_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [127]:
X_test_ambience_aspect_bow = ambience_aspect_bow.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_bow = np.hstack((X_test_ambience_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [128]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_logistic = LogisticRegression(**params)
bow_ambience_aspect_logistic.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the model
model_name = "BOW_ambience_LgstcRegression"
models[model_name] = (bow_ambience_aspect_logistic, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_logistic.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_logistic.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 30%|███       | 15/50 [00:04<00:11,  3.12trial/s, best loss: -0.700457653608194]
Selected Params:
{'C': 0.10748989592549925, 'max_iter': 260}


In [129]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_rf = RandomForestClassifier(**params)
bow_ambience_aspect_rf.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_RF"
models[model_name] = (bow_ambience_aspect_rf, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_rf.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_rf.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:19<00:00,  2.56trial/s, best loss: -0.7034990413691051]
Selected Params:
{'ccp_alpha': 0.004419058564591858, 'max_depth': 48, 'n_estimators': 147}


In [130]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_xgb = XGBClassifier(**params)
bow_ambience_aspect_xgb.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_XGB"
models[model_name] = (bow_ambience_aspect_xgb, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_xgb.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_xgb.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 40%|████      | 20/50 [07:48<11:42, 23.42s/trial, best loss: -0.7034990413691051]
Selected Params:
{'eta': 0.17501779320529404, 'max_depth': 7, 'n_estimators': 828}


# Tf-idf model
### Food aspect

In [131]:
food_aspect_tfidf = TfidfVectorizer()
X_train_food_aspect_tfidf = food_aspect_tfidf.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_tfidf = np.hstack((X_train_food_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [132]:
X_val_food_aspect_tfidf = food_aspect_tfidf.transform(X_val_food_aspect).toarray()
X_val_food_aspect_tfidf = np.hstack((X_val_food_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [133]:
X_test_food_aspect_tfidf = food_aspect_tfidf.transform(X_test_food_aspect).toarray()
X_test_food_aspect_tfidf = np.hstack((X_test_food_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [134]:
# Logistic model

LOGISTIC_search_space = {
    "C": hp.uniform("C",0.5,2),
    "max_iter" : hp.randint("max_iter",100,200),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
tfidf_food_aspect_logistic.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 



model_name = "TFIDF_food_LgstcRegression"
models[model_name] = (tfidf_food_aspect_logistic, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_logistic.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_logistic.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 46%|████▌     | 23/50 [00:28<00:33,  1.23s/trial, best loss: -0.5839665227849352]
Selected Params:
{'C': 1.8140831060312956, 'max_iter': 148}


In [135]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_rf = RandomForestClassifier(**params)
tfidf_food_aspect_rf.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store the mode

model_name = "TFIDF_food_RF"
models[model_name] = (tfidf_food_aspect_rf, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_rf.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_rf.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:01<00:00,  1.23s/trial, best loss: -0.673020050948054] 
Selected Params:
{'ccp_alpha': 0.0010458666255669683, 'max_depth': 158, 'n_estimators': 150}


In [136]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_xgb = XGBClassifier(**params)
tfidf_food_aspect_xgb.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store themodel

model_name = "tfidf_food_XGB"
models[model_name] = (tfidf_food_aspect_xgb, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_xgb.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_xgb.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 98%|█████████▊| 49/50 [33:08<00:40, 40.58s/trial, best loss: -0.7184160649076083]
Selected Params:
{'eta': 0.17788748645600275, 'max_depth': 6, 'n_estimators': 323}


### Service aspect

In [137]:
service_aspect_tfidf = TfidfVectorizer()
X_train_service_aspect_tfidf = service_aspect_tfidf.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_tfidf = np.hstack((X_train_service_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [138]:
X_val_service_aspect_tfidf = service_aspect_tfidf.transform(X_val_service_aspect).toarray()
X_val_service_aspect_tfidf = np.hstack((X_val_service_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [139]:
X_test_service_aspect_tfidf = service_aspect_tfidf.transform(X_test_service_aspect).toarray()
X_test_service_aspect_tfidf = np.hstack((X_test_service_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [140]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_logistic = LogisticRegression(**params)
tfidf_service_aspect_logistic.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_LgstcRegression"
models[model_name] = (tfidf_service_aspect_logistic, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_logistic.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_logistic.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 34%|███▍      | 17/50 [00:07<00:14,  2.28trial/s, best loss: -0.7710513956435558]
Selected Params:
{'C': 0.1891211932744063, 'max_iter': 176}


In [141]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_rf = RandomForestClassifier(**params)
tfidf_service_aspect_rf.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_RF"
models[model_name] = (tfidf_service_aspect_rf, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_rf.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_rf.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:25<00:00,  1.94trial/s, best loss: -0.7859048147377667]
Selected Params:
{'ccp_alpha': 0.0013439345581009183, 'max_depth': 183, 'n_estimators': 170}


In [142]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_xgb = XGBClassifier(**params)
tfidf_service_aspect_xgb.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode
model_name = "tfidf_service_XGB"
models[model_name] = (tfidf_service_aspect_xgb, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_xgb.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_xgb.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 68%|██████▊   | 34/50 [12:29<05:52, 22.05s/trial, best loss: -0.781341823253239] 
Selected Params:
{'eta': 0.02014430631728941, 'max_depth': 3, 'n_estimators': 354}


### Ambience

In [143]:
ambience_aspect_tfidf = TfidfVectorizer()
X_train_ambience_aspect_tfidf = ambience_aspect_tfidf.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_tfidf = np.hstack((X_train_ambience_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [144]:
X_val_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_tfidf = np.hstack((X_val_ambience_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [145]:
X_test_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_tfidf = np.hstack((X_test_ambience_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [146]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_logistic = LogisticRegression(**params)
tfidf_ambience_aspect_logistic.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_LgstcRegression"
models[model_name] = (tfidf_ambience_aspect_logistic, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 30%|███       | 15/50 [00:05<00:12,  2.71trial/s, best loss: -0.695936931855447]
Selected Params:
{'C': 0.15763719938126808, 'max_iter': 269}


In [147]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_rf = RandomForestClassifier(**params)
tfidf_ambience_aspect_rf.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_RF"
models[model_name] = (tfidf_ambience_aspect_rf, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [00:19<00:00,  2.60trial/s, best loss: -0.7034990413691051]
Selected Params:
{'ccp_alpha': 0.0016278841314499104, 'max_depth': 164, 'n_estimators': 243}


In [148]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,8),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_xgb = XGBClassifier(**params)
tfidf_ambience_aspect_xgb.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "tfidf_ambience_XGB"
models[model_name] = (tfidf_ambience_aspect_xgb, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 40%|████      | 20/50 [06:42<10:03, 20.10s/trial, best loss: -0.7034990413691051]
Selected Params:
{'eta': 0.07268846695572269, 'max_depth': 4, 'n_estimators': 687}


# Bert

In [154]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [155]:
# empty string means no mention of that aspect in the review, therefore we think it shud get a neutral rating. we replace the empty string with "normal" to get a neutral rating

test.loc[test.food_aspect_extraction_3 == "" , "food_aspect_extraction_3"] = "normal"
test.loc[test.service_aspect_extraction_3 == "", "service_aspect_extraction_3"] = "normal"
test.loc[test.ambience_aspect_extraction_3 == "", "ambience_aspect_extraction_3"] = "normal"
        

In [156]:
def get_sentiment(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    sentiment = int(torch.argmax(result.logits)+1)
    # the out put is range from 1~5 , we classifiy 1~2 as negative , 3 as neutral , 4 and 5 as positvive
    if sentiment >=4 :
        return 1
    elif sentiment == 3:
        return 0
    else:
        return -1

### Food

In [158]:
# Food aspect
preds = test.food_aspect_extraction_3.apply(lambda x : get_sentiment(x))

model_name = "food_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

### Service

In [159]:
# service aspect
preds = test.service_aspect_extraction_3.apply(lambda x : get_sentiment(x))

model_name = "service_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

### Ambience

In [160]:
# ambience aspect
preds = test.ambience_aspect_extraction_3.apply(lambda x : get_sentiment(x))

model_name = "ambience_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

# Results


In [161]:
res = pd.DataFrame(results_table)

In [173]:
res

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1,train_NEG_precision,test_NEG_precision,train_NEG_recall,test_NEG_recall,train_NEU_precision,test_NEU_precision,train_NEU_recall,test_NEU_recall,train_POS_precision,test_POS_precision,train_POS_recall,test_POS_recall
0,BOW_food_LgstcRegression,0.749643,0.733108,0.722666,0.703417,0.744898,0.846154,0.279693,0.268293,0.563492,0.478261,0.405714,0.312057,0.792539,0.778234,0.947254,0.92439
1,BOW_food_RF,0.7425,0.714527,0.712134,0.672794,0.982143,1.0,0.210728,0.02439,0.549242,0.45098,0.414286,0.326241,0.782491,0.768916,0.942904,0.917073
2,BOW_food_XGB,0.752143,0.724662,0.728624,0.698434,0.783019,0.647059,0.318008,0.268293,0.553232,0.459184,0.415714,0.319149,0.798893,0.781971,0.941816,0.909756
3,BOW_service_LgstcRegression,0.478571,0.45777,0.537239,0.523084,0.176664,0.146032,0.957143,0.92,0.808978,0.782609,0.432143,0.437956,0.95339,0.957447,0.401786,0.343511
4,BOW_service_RF,0.8125,0.819257,0.798917,0.80001,0.652174,0.756098,0.589286,0.62,0.829816,0.815574,0.942857,0.96837,0.81875,0.888889,0.467857,0.427481
5,BOW_service_XGB,0.815,0.807432,0.790033,0.781302,0.809524,0.769231,0.364286,0.4,0.807451,0.796844,0.984184,0.982968,0.880702,0.915254,0.448214,0.412214
6,BOW_ambience_LgstcRegression,0.756071,0.733108,0.70625,0.674684,0.0,0.0,0.0,0.0,0.732827,0.708411,0.993503,0.997368,0.952703,0.964912,0.327907,0.289474
7,BOW_ambience_RF,0.754286,0.72973,0.703594,0.669452,0.0,0.0,0.0,0.0,0.731367,0.705773,0.993503,0.997368,0.95189,0.963636,0.322093,0.278947
8,BOW_ambience_XGB,0.760714,0.733108,0.712168,0.674684,1.0,0.0,0.021505,0.0,0.735036,0.708411,0.997293,0.997368,0.979452,0.964912,0.332558,0.289474
9,TFIDF_food_LgstcRegression,0.696786,0.672297,0.616213,0.606518,0.353276,0.253846,0.950192,0.804878,0.0,0.0,0.0,0.0,0.811725,0.790043,0.926047,0.890244


# test results

In [162]:
test_results = res.columns[res.columns.str.contains("test")]
test_results = ["model_name"] + test_results.to_list()
res[test_results]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall,test_POS_precision,test_POS_recall
0,BOW_food_LgstcRegression,0.733108,0.703417,0.846154,0.268293,0.478261,0.312057,0.778234,0.92439
1,BOW_food_RF,0.714527,0.672794,1.0,0.02439,0.45098,0.326241,0.768916,0.917073
2,BOW_food_XGB,0.724662,0.698434,0.647059,0.268293,0.459184,0.319149,0.781971,0.909756
3,BOW_service_LgstcRegression,0.45777,0.523084,0.146032,0.92,0.782609,0.437956,0.957447,0.343511
4,BOW_service_RF,0.819257,0.80001,0.756098,0.62,0.815574,0.96837,0.888889,0.427481
5,BOW_service_XGB,0.807432,0.781302,0.769231,0.4,0.796844,0.982968,0.915254,0.412214
6,BOW_ambience_LgstcRegression,0.733108,0.674684,0.0,0.0,0.708411,0.997368,0.964912,0.289474
7,BOW_ambience_RF,0.72973,0.669452,0.0,0.0,0.705773,0.997368,0.963636,0.278947
8,BOW_ambience_XGB,0.733108,0.674684,0.0,0.0,0.708411,0.997368,0.964912,0.289474
9,TFIDF_food_LgstcRegression,0.672297,0.606518,0.253846,0.804878,0.0,0.0,0.790043,0.890244


# training results

In [163]:
# training results
train_results = res.columns[res.columns.str.contains("train")]
train_results = ["model_name"] + train_results.to_list()
res[train_results]

Unnamed: 0,model_name,train_accuracy,train_weighted_avg_f1,train_NEG_precision,train_NEG_recall,train_NEU_precision,train_NEU_recall,train_POS_precision,train_POS_recall
0,BOW_food_LgstcRegression,0.749643,0.722666,0.744898,0.279693,0.563492,0.405714,0.792539,0.947254
1,BOW_food_RF,0.7425,0.712134,0.982143,0.210728,0.549242,0.414286,0.782491,0.942904
2,BOW_food_XGB,0.752143,0.728624,0.783019,0.318008,0.553232,0.415714,0.798893,0.941816
3,BOW_service_LgstcRegression,0.478571,0.537239,0.176664,0.957143,0.808978,0.432143,0.95339,0.401786
4,BOW_service_RF,0.8125,0.798917,0.652174,0.589286,0.829816,0.942857,0.81875,0.467857
5,BOW_service_XGB,0.815,0.790033,0.809524,0.364286,0.807451,0.984184,0.880702,0.448214
6,BOW_ambience_LgstcRegression,0.756071,0.70625,0.0,0.0,0.732827,0.993503,0.952703,0.327907
7,BOW_ambience_RF,0.754286,0.703594,0.0,0.0,0.731367,0.993503,0.95189,0.322093
8,BOW_ambience_XGB,0.760714,0.712168,1.0,0.021505,0.735036,0.997293,0.979452,0.332558
9,TFIDF_food_LgstcRegression,0.696786,0.616213,0.353276,0.950192,0.0,0.0,0.811725,0.926047


# Best models of aspect extraction 3, using weighted f1 score
- for food sentiment : BOW_FOOD_XGB
- for service sentiment : ROW_service_RF
- for ambience sentiment : TFIDF_ambience_RF

In [180]:
res[res.model_name.isin(["BOW_food_XGB","BOW_service_RF","TFIDF_ambience_RF"])][["model_name","train_accuracy","test_accuracy","train_weighted_avg_f1","test_weghted_avg_f1"]]

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1
2,BOW_food_XGB,0.752143,0.724662,0.728624,0.698434
4,BOW_service_RF,0.8125,0.819257,0.798917,0.80001
16,TFIDF_ambience_RF,0.757857,0.736486,0.708667,0.679857


In [182]:
res[res.model_name.isin(["BOW_food_XGB","BOW_service_RF","TFIDF_ambience_RF"])][["model_name","test_POS_precision","test_POS_recall","test_NEG_precision","test_NEG_recall","test_NEU_precision","test_NEU_recall"]]

Unnamed: 0,model_name,test_POS_precision,test_POS_recall,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall
2,BOW_food_XGB,0.781971,0.909756,0.647059,0.268293,0.459184,0.319149
4,BOW_service_RF,0.888889,0.427481,0.756098,0.62,0.815574,0.96837
16,TFIDF_ambience_RF,0.966102,0.3,0.0,0.0,0.711069,0.997368


In [181]:
res[res.model_name.str.contains("BERT")][["model_name","test_accuracy","test_weghted_avg_f1"]]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1
18,food_BERT,0.530405,0.529024
19,service_BERT,0.611486,0.545932
20,ambience_BERT,0.722973,0.659342


In [187]:
# import os
# import pickle

In [185]:
# path = os.getcwd() + "/extraction_3_models"
# if "extraction_3_models" not in os.listdir():
#     os.mkdir(path)

# for model_name, model in models.items():
#     filename = path + "/" + model_name + ".pkg"
#     pickle.dump(model[0], open(filename, "wb"))
    

In [186]:
# encodings = {
#     "bow_food": food_aspect_bow,
#     "bow_service": service_aspect_bow,
#     "bow_ambience": ambience_aspect_bow,
#     "tfidf_food": food_aspect_tfidf,
#     "tfidf_service": service_aspect_tfidf,
#     "tfidf_ambience": ambience_aspect_tfidf
# }

# for name, encoder in  encodings.items():
#     filename = path + "/encoding_" + name + ".pkg"
#     pickle.dump(encoder, open(filename, "wb"))