In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report , confusion_matrix , accuracy_score, f1_score,roc_auc_score, make_scorer

from hyperopt import fmin,hp,tpe,Trials,space_eval,STATUS_OK
from hyperopt.early_stop import no_progress_loss
import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

  from pandas import MultiIndex, Int64Index


# Aspect extraction method 1
## Some important stuff (xgboost used here is version 1.5.1, seems that the newer versions cannot accept negative numbers as class labels)

### Load train,val,test sets
- subset the extraction methods and the actual aspect scores 

In [2]:
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/val.csv")
test = pd.read_csv("data/test.csv")

# replace all NULL values with empty string, (NULL values means that there is no part of text in the review mentioning that particular aspect, i.e the sentiment should be neutral)
train.loc[train.food_aspect_extraction_1.isna(), "food_aspect_extraction_1"] = ""
train.loc[train.service_aspect_extraction_1.isna(), "service_aspect_extraction_1"] = ""
train.loc[train.ambience_aspect_extraction_1.isna(), "ambience_aspect_extraction_1"] = ""

val.loc[val.food_aspect_extraction_1.isna(), "food_aspect_extraction_1"] = ""
val.loc[val.service_aspect_extraction_1.isna(), "service_aspect_extraction_1"] = ""
val.loc[val.ambience_aspect_extraction_1.isna(), "ambience_aspect_extraction_1"] = ""

test.loc[test.food_aspect_extraction_1.isna(), "food_aspect_extraction_1"] = ""
test.loc[test.service_aspect_extraction_1.isna(), "service_aspect_extraction_1"] = ""
test.loc[test.ambience_aspect_extraction_1.isna(), "ambience_aspect_extraction_1"] = ""



#extra features (other than text)
train_rating, val_rating, test_rating = train.rating, val.rating, test.rating

train.head(2)

Unnamed: 0.1,Unnamed: 0,id_review,caption,relative_date,retrieval_date,rating,username,n_review_user,n_photo_user,url_user,...,other_sentiment,food_aspect_extraction_1,service_aspect_extraction_1,ambience_aspect_extraction_1,food_aspect_extraction_2,service_aspect_extraction_2,ambience_aspect_extraction_2,food_aspect_extraction_3,service_aspect_extraction_3,ambience_aspect_extraction_3
0,0,ChZDSUhNMG9nS0VJQ0FnSUQ4bXFuSU53EAE,Staffs are rude. Food served cold. Clearly ove...,2 years ago,48:59.1,2,Vanessa Nee,0,0,https://www.google.com/maps/contrib/1081475767...,...,-1,food serve cold,staff rude,,,,,,,
1,1,ChZDSUhNMG9nS0VJQ0FnSUQybFBEUVVREAE,Overrated. Good for Instagram photos. Queue sy...,10 months ago,14:28.0,2,Gwen Goh,0,0,https://www.google.com/maps/contrib/1167846711...,...,0,queue system confusing i dine order,person ask i takeout queue staff say free sit ...,overrate good instagram photo,,,,,,


In [3]:
X_train_food_aspect = train.food_aspect_extraction_1
Y_train_food_aspect = train.food_sentiment
print("Food Aspect")
print(Y_train_food_aspect.value_counts() , "\n")

X_train_service_aspect = train.service_aspect_extraction_1
Y_train_service_aspect = train.service_sentiment
print("Service Aspect")
print(Y_train_service_aspect.value_counts() , "\n")

X_train_ambience_aspect = train.ambience_aspect_extraction_1
Y_train_ambience_aspect = train.ambience_sentiment
print("Ambience Aspect")
print(Y_train_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    1839
 0     700
-1     261
Name: food_sentiment, dtype: int64 

Service Aspect
 0    1960
 1     560
-1     280
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    1847
 1     860
-1      93
Name: ambience_sentiment, dtype: int64 



In [4]:
X_val_food_aspect = val.food_aspect_extraction_1
Y_val_food_aspect = val.food_sentiment
print("Food Aspect")
print(Y_val_food_aspect.value_counts() , "\n")

X_val_service_aspect = val.service_aspect_extraction_1
Y_val_service_aspect = val.service_sentiment
print("Service Aspect")
print(Y_val_service_aspect.value_counts() , "\n")

X_val_ambience_aspect = val.ambience_aspect_extraction_1
Y_val_ambience_aspect = val.ambience_sentiment
print("Ambience Aspect")
print(Y_val_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    315
 0    140
-1     45
Name: food_sentiment, dtype: int64 

Service Aspect
 0    353
 1     87
-1     60
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    334
 1    149
-1     17
Name: ambience_sentiment, dtype: int64 



In [5]:
X_test_food_aspect = test.food_aspect_extraction_1
Y_test_food_aspect = test.food_sentiment
print("Food Aspect")
print(Y_test_food_aspect.value_counts() , "\n")

X_test_service_aspect = test.service_aspect_extraction_1
Y_test_service_aspect = test.service_sentiment
print("Service Aspect")
print(Y_test_service_aspect.value_counts() , "\n")

X_test_ambience_aspect = test.ambience_aspect_extraction_1
Y_test_ambience_aspect = test.ambience_sentiment
print("Ambience Aspect")
print(Y_test_ambience_aspect.value_counts() , "\n")

Food Aspect
 1    410
 0    141
-1     41
Name: food_sentiment, dtype: int64 

Service Aspect
 0    411
 1    131
-1     50
Name: service_sentiment, dtype: int64 

Ambience Aspect
 0    380
 1    190
-1     22
Name: ambience_sentiment, dtype: int64 



# Bag of Words
### Food Aspect
* BOW encoding + rating (out of 5 stars) as features

In [6]:
models = {} # for storing all the models
results_table = {
    "model_name":[],
    "train_accuracy":[],
    "test_accuracy":[],
    "train_weighted_avg_f1":[],
    "test_weghted_avg_f1":[],
    "train_NEG_precision":[],
    "test_NEG_precision":[],
    "train_NEG_recall":[],
    "test_NEG_recall":[],
    "train_NEU_precision":[],
    "test_NEU_precision":[],
    "train_NEU_recall":[],
    "test_NEU_recall":[],
    "train_POS_precision":[],
    "test_POS_precision":[],
    "train_POS_recall":[],
    "test_POS_recall":[]

}

In [7]:
# Food aspect
food_aspect_bow = CountVectorizer()
X_train_food_aspect_bow = food_aspect_bow.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_bow = np.hstack((X_train_food_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [8]:
X_val_food_aspect_bow = food_aspect_bow.transform(X_val_food_aspect).toarray()
X_val_food_aspect_bow = np.hstack((X_val_food_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [9]:
X_test_food_aspect_bow = food_aspect_bow.transform(X_test_food_aspect).toarray()
X_test_food_aspect_bow = np.hstack((X_test_food_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [10]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_logistic = LogisticRegression(**params)
bow_food_aspect_logistic.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the model and their encoders
model_name = "BOW_food_LgstcRegression"
models[model_name] = (bow_food_aspect_logistic, food_aspect_bow)



# store the results in a table
# performance on test set


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_logistic.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_logistic.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 60%|██████    | 30/50 [00:16<00:11,  1.79trial/s, best loss: -0.7903606118123243]
Selected Params:
{'C': 0.15497318332066756, 'max_iter': 292}


In [11]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 100, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_rf = RandomForestClassifier(**params)
bow_food_aspect_rf.fit(X_train_food_aspect_bow,Y_train_food_aspect) 

# store the mode

model_name = "BOW_food_RF"
models[model_name] = (bow_food_aspect_rf, food_aspect_bow)


train_report = classification_report(Y_train_food_aspect, bow_food_aspect_rf.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_rf.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])



100%|██████████| 100/100 [03:11<00:00,  1.91s/trial, best loss: -0.8069551289647273]
Selected Params:
{'ccp_alpha': 0.0010237648480806087, 'max_depth': 154, 'n_estimators': 159}


In [12]:
#XBG Classifier class_weight = {1: 0.65, 0: 0.25, -1:10}


XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",500,1000),
    "eta" : hp.uniform("eta", 0.01,0.2),
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_bow, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_bow)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(10)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_food_aspect_xgb = XGBClassifier(**params)
bow_food_aspect_xgb.fit(X_train_food_aspect_bow,Y_train_food_aspect) 


model_name = "BOW_food_XGB"
models[model_name] = (bow_food_aspect_xgb, food_aspect_bow)

train_report = classification_report(Y_train_food_aspect, bow_food_aspect_xgb.predict(X_train_food_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, bow_food_aspect_xgb.predict(X_test_food_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])


 64%|██████▍   | 32/50 [11:46<06:37, 22.07s/trial, best loss: -0.8410974113324508]
Selected Params:
{'eta': 0.10489343162342882, 'max_depth': 3, 'n_estimators': 764}


### Service aspect

In [13]:
service_aspect_bow = CountVectorizer()
X_train_service_aspect_bow = service_aspect_bow.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_bow = np.hstack((X_train_service_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [14]:
X_val_service_aspect_bow = service_aspect_bow.transform(X_val_service_aspect).toarray()
X_val_service_aspect_bow = np.hstack((X_val_service_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [15]:
X_test_service_aspect_bow = service_aspect_bow.transform(X_test_service_aspect).toarray()
X_test_service_aspect_bow = np.hstack((X_test_service_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [16]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
bow_service_aspect_logistic.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_LgstcRegression"
models[model_name] = (bow_service_aspect_logistic, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_logistic.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_logistic.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 74%|███████▍  | 37/50 [00:18<00:06,  1.96trial/s, best loss: -0.793409396340296]  
Selected Params:
{'C': 0.19949169430052688, 'max_iter': 240}


In [17]:
# RandomForest Classifier

RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_rf = RandomForestClassifier(**params)
bow_service_aspect_rf.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the mode

model_name = "BOW_service_RF"
models[model_name] = (bow_service_aspect_rf, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_rf.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_rf.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:19<00:00,  1.59s/trial, best loss: -0.8365707977056638]
Selected Params:
{'ccp_alpha': 0.00110426795739672, 'max_depth': 77, 'n_estimators': 230}


In [18]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_bow, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_bow)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_service_aspect_xgb = XGBClassifier(**params)
bow_service_aspect_xgb.fit(X_train_service_aspect_bow,Y_train_service_aspect) 

# store the model
model_name = "BOW_service_XGB"
models[model_name] = (bow_service_aspect_xgb, service_aspect_bow)

train_report = classification_report(Y_train_service_aspect, bow_service_aspect_xgb.predict(X_train_service_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, bow_service_aspect_xgb.predict(X_test_service_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 44%|████▍     | 22/50 [04:20<05:31, 11.85s/trial, best loss: -0.8598966686177406]
Selected Params:
{'eta': 0.1588258633116282, 'max_depth': 3, 'n_estimators': 674}


### Ambience Aspect

In [19]:
ambience_aspect_bow = CountVectorizer()
X_train_ambience_aspect_bow = ambience_aspect_bow.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_bow = np.hstack((X_train_ambience_aspect_bow,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 2]])

In [20]:
X_val_ambience_aspect_bow = ambience_aspect_bow.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_bow = np.hstack((X_val_ambience_aspect_bow,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5]])

In [21]:
X_test_ambience_aspect_bow = ambience_aspect_bow.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_bow = np.hstack((X_test_ambience_aspect_bow,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_bow

array([[0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 4],
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 5]])

In [22]:
# Logistic model
scorer = make_scorer(f1_score)

LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_logistic = LogisticRegression(**params)
bow_ambience_aspect_logistic.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the model
model_name = "BOW_ambience_LgstcRegression"
models[model_name] = (bow_ambience_aspect_logistic, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_logistic.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_logistic.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 54%|█████▍    | 27/50 [00:11<00:09,  2.37trial/s, best loss: -0.7923819434297046]
Selected Params:
{'C': 0.08398031352700266, 'max_iter': 217}


In [23]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_rf = RandomForestClassifier(**params)
bow_ambience_aspect_rf.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_RF"
models[model_name] = (bow_ambience_aspect_rf, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_rf.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_rf.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:09<00:00,  1.39s/trial, best loss: -0.8144892035188531]
Selected Params:
{'ccp_alpha': 0.0011930761831981142, 'max_depth': 58, 'n_estimators': 156}


In [24]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_bow, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_bow)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

bow_ambience_aspect_xgb = XGBClassifier(**params)
bow_ambience_aspect_xgb.fit(X_train_ambience_aspect_bow,Y_train_ambience_aspect) 

# store the mode

model_name = "BOW_ambience_XGB"
models[model_name] = (bow_ambience_aspect_xgb, ambience_aspect_bow)

train_report = classification_report(Y_train_ambience_aspect, bow_ambience_aspect_xgb.predict(X_train_ambience_aspect_bow), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, bow_ambience_aspect_xgb.predict(X_test_ambience_aspect_bow), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 48%|████▊     | 24/50 [04:18<04:40, 10.79s/trial, best loss: -0.8332610503380452]
Selected Params:
{'eta': 0.19332546548754542, 'max_depth': 3, 'n_estimators': 995}


# Tf-idf model
### Food aspect

In [25]:
food_aspect_tfidf = TfidfVectorizer()
X_train_food_aspect_tfidf = food_aspect_tfidf.fit_transform(X_train_food_aspect).toarray()
X_train_food_aspect_tfidf = np.hstack((X_train_food_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [26]:
X_val_food_aspect_tfidf = food_aspect_tfidf.transform(X_val_food_aspect).toarray()
X_val_food_aspect_tfidf = np.hstack((X_val_food_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [27]:
X_test_food_aspect_tfidf = food_aspect_tfidf.transform(X_test_food_aspect).toarray()
X_test_food_aspect_tfidf = np.hstack((X_test_food_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_food_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [28]:
# Logistic model

LOGISTIC_search_space = {
    "C": hp.uniform("C",0.5,2),
    "max_iter" : hp.randint("max_iter",100,200),
}

def obj(search_space):
    model = LogisticRegression(**search_space, class_weight = {1: 0.65, 0: 0.25, -1:10})
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_logistic = LogisticRegression(**params, class_weight = {1: 0.65, 0: 0.25, -1:10})
tfidf_food_aspect_logistic.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 



model_name = "TFIDF_food_LgstcRegression"
models[model_name] = (tfidf_food_aspect_logistic, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_logistic.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_logistic.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 40%|████      | 20/50 [00:27<00:40,  1.36s/trial, best loss: -0.6025216137246211]
Selected Params:
{'C': 1.3237318643649247, 'max_iter': 103}


In [29]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_rf = RandomForestClassifier(**params)
tfidf_food_aspect_rf.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store the mode

model_name = "TFIDF_food_RF"
models[model_name] = (tfidf_food_aspect_rf, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_rf.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_rf.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:25<00:00,  1.72s/trial, best loss: -0.7682637987901146]
Selected Params:
{'ccp_alpha': 0.0010379575369503157, 'max_depth': 128, 'n_estimators': 147}


In [30]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_food_aspect_tfidf, Y_train_food_aspect)
    preds = model.predict(X_val_food_aspect_tfidf)
    score = f1_score(Y_val_food_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_food_aspect_xgb = XGBClassifier(**params)
tfidf_food_aspect_xgb.fit(X_train_food_aspect_tfidf,Y_train_food_aspect) 

# store themodel

model_name = "tfidf_food_XGB"
models[model_name] = (tfidf_food_aspect_xgb, food_aspect_tfidf)

train_report = classification_report(Y_train_food_aspect, tfidf_food_aspect_xgb.predict(X_train_food_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_food_aspect, tfidf_food_aspect_xgb.predict(X_test_food_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [15:41<00:00, 18.84s/trial, best loss: -0.8366534102207198]
Selected Params:
{'eta': 0.1785365275832882, 'max_depth': 2, 'n_estimators': 840}


### Service aspect

In [31]:
service_aspect_tfidf = TfidfVectorizer()
X_train_service_aspect_tfidf = service_aspect_tfidf.fit_transform(X_train_service_aspect).toarray()
X_train_service_aspect_tfidf = np.hstack((X_train_service_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [32]:
X_val_service_aspect_tfidf = service_aspect_tfidf.transform(X_val_service_aspect).toarray()
X_val_service_aspect_tfidf = np.hstack((X_val_service_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [33]:
X_test_service_aspect_tfidf = service_aspect_tfidf.transform(X_test_service_aspect).toarray()
X_test_service_aspect_tfidf = np.hstack((X_test_service_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_service_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [34]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_logistic = LogisticRegression(**params)
tfidf_service_aspect_logistic.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_LgstcRegression"
models[model_name] = (tfidf_service_aspect_logistic, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_logistic.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_logistic.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 84%|████████▍ | 42/50 [00:29<00:05,  1.43trial/s, best loss: -0.8043159768244185]
Selected Params:
{'C': 0.18348429820251555, 'max_iter': 276}


In [35]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_rf = RandomForestClassifier(**params)
tfidf_service_aspect_rf.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode

model_name = "TFIDF_service_RF"
models[model_name] = (tfidf_service_aspect_rf, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_rf.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_rf.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:16<00:00,  1.53s/trial, best loss: -0.8302663716705986]
Selected Params:
{'ccp_alpha': 0.0012833823753987958, 'max_depth': 122, 'n_estimators': 229}


In [36]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_service_aspect_tfidf, Y_train_service_aspect)
    preds = model.predict(X_val_service_aspect_tfidf)
    score = f1_score(Y_val_service_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_service_aspect_xgb = XGBClassifier(**params)
tfidf_service_aspect_xgb.fit(X_train_service_aspect_tfidf,Y_train_service_aspect) 

# store the mode
model_name = "tfidf_service_XGB"
models[model_name] = (tfidf_service_aspect_xgb, service_aspect_tfidf)

train_report = classification_report(Y_train_service_aspect, tfidf_service_aspect_xgb.predict(X_train_service_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_service_aspect, tfidf_service_aspect_xgb.predict(X_test_service_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 94%|█████████▍| 47/50 [12:41<00:48, 16.20s/trial, best loss: -0.8608636067402328]
Selected Params:
{'eta': 0.057166113396695854, 'max_depth': 3, 'n_estimators': 789}


### Ambience

In [37]:
ambience_aspect_tfidf = TfidfVectorizer()
X_train_ambience_aspect_tfidf = ambience_aspect_tfidf.fit_transform(X_train_ambience_aspect).toarray()
X_train_ambience_aspect_tfidf = np.hstack((X_train_ambience_aspect_tfidf,train_rating.to_numpy().reshape(-1,1)))
X_train_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 2.]])

In [38]:
X_val_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_val_ambience_aspect).toarray()
X_val_ambience_aspect_tfidf = np.hstack((X_val_ambience_aspect_tfidf,val_rating.to_numpy().reshape(-1,1)))
X_val_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [39]:
X_test_ambience_aspect_tfidf = ambience_aspect_tfidf.transform(X_test_ambience_aspect).toarray()
X_test_ambience_aspect_tfidf = np.hstack((X_test_ambience_aspect_tfidf,test_rating.to_numpy().reshape(-1,1)))
X_test_ambience_aspect_tfidf

array([[0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 5.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 5.]])

In [40]:
LOGISTIC_search_space = {
    "C": hp.uniform("C",0,0.2),
    "max_iter" : hp.randint("max_iter",100,300),
}

def obj(search_space):
    model = LogisticRegression(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = LOGISTIC_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(15)) 
params = space_eval(LOGISTIC_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_logistic = LogisticRegression(**params)
tfidf_ambience_aspect_logistic.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_LgstcRegression"
models[model_name] = (tfidf_ambience_aspect_logistic, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_logistic.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 40%|████      | 20/50 [00:08<00:13,  2.29trial/s, best loss: -0.7970991180310123]
Selected Params:
{'C': 0.17559978149890698, 'max_iter': 140}


In [41]:
RF_search_space = {
    "n_estimators" : hp.randint("n_estimators", 100, 300),
    "max_depth" : hp.randint("max_depth", 8,200),
    "ccp_alpha" : hp.uniform("ccp_alpha", 0.001,0.02)
}


def obj(search_space):
    model = RandomForestClassifier(**search_space)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = RF_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(50)) 
params = space_eval(RF_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_rf = RandomForestClassifier(**params)
tfidf_ambience_aspect_rf.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "TFIDF_ambience_RF"
models[model_name] = (tfidf_ambience_aspect_rf, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_rf.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

100%|██████████| 50/50 [01:06<00:00,  1.34s/trial, best loss: -0.8240814006838939]
Selected Params:
{'ccp_alpha': 0.001041402428746569, 'max_depth': 68, 'n_estimators': 104}


In [42]:
XGB_search_space = {
    "max_depth" : hp.randint("max_depth",1,4),
    "n_estimators" : hp.randint("n_estimators",300,1000),
    "eta" : hp.uniform("eta", 0.01,0.2)
}


def obj(search_space):
    model = XGBClassifier(**search_space, verbosity = 0)
    model.fit(X_train_ambience_aspect_tfidf, Y_train_ambience_aspect)
    preds = model.predict(X_val_ambience_aspect_tfidf)
    score = f1_score(Y_val_ambience_aspect,preds, average="weighted")
    return {"loss" : -score ,"status" : STATUS_OK}

best_params = fmin(fn = obj, space = XGB_search_space, algo = tpe.suggest, max_evals= 50, early_stop_fn= no_progress_loss(20)) 
params = space_eval(XGB_search_space,best_params)
print("Selected Params:")
print(params)

tfidf_ambience_aspect_xgb = XGBClassifier(**params)
tfidf_ambience_aspect_xgb.fit(X_train_ambience_aspect_tfidf,Y_train_ambience_aspect) 

# store the mode

model_name = "tfidf_ambience_XGB"
models[model_name] = (tfidf_ambience_aspect_xgb, ambience_aspect_tfidf)

train_report = classification_report(Y_train_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_train_ambience_aspect_tfidf), output_dict = True) 
test_report = classification_report(Y_test_ambience_aspect, tfidf_ambience_aspect_xgb.predict(X_test_ambience_aspect_tfidf), output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(train_report["accuracy"])
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(train_report["weighted avg"]["f1-score"])
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(train_report["-1"]["precision"])
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(train_report["-1"]["recall"])
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(train_report["0"]["precision"])
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(train_report["0"]["recall"])
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(train_report["1"]["precision"])
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(train_report["1"]["recall"])
results_table["test_POS_recall"].append(test_report["1"]["recall"])

 40%|████      | 20/50 [04:00<06:00, 12.01s/trial, best loss: -0.8272334401220441]
Selected Params:
{'eta': 0.19023183787781658, 'max_depth': 3, 'n_estimators': 498}


# BERT

In [43]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [44]:
# empty string means no mention of that aspect in the review, therefore we think it shud get a neutral rating. we replace the empty string with "normal" to get a neutral rating

test.loc[test.food_aspect_extraction_1 == "", "food_aspect_extraction_1"] = "normal"
test.loc[test.service_aspect_extraction_1 == "", "service_aspect_extraction_1"] = "normal"
test.loc[test.ambience_aspect_extraction_1 == "", "ambience_aspect_extraction_1"] = "normal"

In [45]:
def get_sentiment(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    sentiment = int(torch.argmax(result.logits)+1)
    # the out put is range from 1~5 , we classifiy 1~2 as negative , 3 as neutral , 4 and 5 as positvive
    if sentiment >=4 :
        return 1
    elif sentiment == 3:
        return 0
    else:
        return -1

In [46]:
# Food aspect
preds = test.food_aspect_extraction_1.apply(lambda x : get_sentiment(x))

model_name = "food_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])


In [47]:
# service aspect
preds = test.service_aspect_extraction_1.apply(lambda x : get_sentiment(x))

model_name = "service_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

In [48]:
# ambience aspect
preds = test.ambience_aspect_extraction_1.apply(lambda x : get_sentiment(x))

model_name = "ambience_BERT"

test_report = classification_report(Y_test_ambience_aspect, preds, output_dict = True)

results_table["model_name"].append(model_name)

results_table["train_accuracy"].append(np.nan)
results_table["test_accuracy"].append(test_report["accuracy"])

results_table["train_weighted_avg_f1"].append(np.nan)
results_table["test_weghted_avg_f1"].append(test_report["weighted avg"]["f1-score"])

results_table["train_NEG_precision"].append(np.nan)
results_table["test_NEG_precision"].append(test_report["-1"]["precision"])

results_table["train_NEG_recall"].append(np.nan)
results_table["test_NEG_recall"].append(test_report["-1"]["recall"])

results_table["train_NEU_precision"].append(np.nan)
results_table["test_NEU_precision"].append(test_report["0"]["precision"])

results_table["train_NEU_recall"].append(np.nan)
results_table["test_NEU_recall"].append(test_report["0"]["recall"])

results_table["train_POS_precision"].append(np.nan)
results_table["test_POS_precision"].append(test_report["1"]["precision"])

results_table["train_POS_recall"].append(np.nan)
results_table["test_POS_recall"].append(test_report["1"]["recall"])

# Results


In [49]:
res = pd.DataFrame(results_table)
res

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1,train_NEG_precision,test_NEG_precision,train_NEG_recall,test_NEG_recall,train_NEU_precision,test_NEU_precision,train_NEU_recall,test_NEU_recall,train_POS_precision,test_POS_precision,train_POS_recall,test_POS_recall
0,BOW_food_LgstcRegression,0.828929,0.790541,0.819172,0.779532,0.822222,0.708333,0.56705,0.414634,0.734767,0.627119,0.585714,0.524823,0.854995,0.837778,0.958673,0.919512
1,BOW_food_RF,0.7925,0.758446,0.766755,0.717191,0.943089,0.7,0.444444,0.170732,0.756477,0.633803,0.417143,0.319149,0.790485,0.776908,0.984774,0.968293
2,BOW_food_XGB,0.8925,0.810811,0.892516,0.811737,0.895928,0.5625,0.758621,0.439024,0.789688,0.631902,0.831429,0.730496,0.933225,0.904282,0.934747,0.87561
3,BOW_service_LgstcRegression,0.785,0.798986,0.799705,0.814815,0.436508,0.4,0.982143,0.96,0.931985,0.927978,0.77602,0.815085,0.747212,0.810811,0.717857,0.687023
4,BOW_service_RF,0.880714,0.842905,0.876162,0.829559,0.92973,0.857143,0.614286,0.36,0.889678,0.851293,0.958673,0.961071,0.82505,0.803738,0.741071,0.656489
5,BOW_service_XGB,0.918571,0.853041,0.917216,0.847672,0.946281,0.789474,0.817857,0.6,0.925672,0.871622,0.965816,0.941606,0.877193,0.8,0.803571,0.671756
6,BOW_ambience_LgstcRegression,0.820357,0.790541,0.799483,0.764644,1.0,0.0,0.010753,0.0,0.809633,0.776824,0.955604,0.952632,0.857835,0.84127,0.617442,0.557895
7,BOW_ambience_RF,0.846429,0.829392,0.833233,0.811337,1.0,0.0,0.053763,0.0,0.861041,0.838095,0.922577,0.926316,0.810049,0.80814,0.768605,0.731579
8,BOW_ambience_XGB,0.905714,0.824324,0.902426,0.810299,0.960784,0.0,0.526882,0.0,0.895657,0.832941,0.971305,0.931579,0.928954,0.8375,0.805814,0.705263
9,TFIDF_food_LgstcRegression,0.730357,0.702703,0.655686,0.635918,0.409449,0.300885,0.996169,0.829268,1.0,0.8,0.045714,0.028369,0.821847,0.797468,0.953235,0.921951


# test results

In [50]:
test_results = res.columns[res.columns.str.contains("test")]
test_results = ["model_name"] + test_results.to_list()
res[test_results]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall,test_POS_precision,test_POS_recall
0,BOW_food_LgstcRegression,0.790541,0.779532,0.708333,0.414634,0.627119,0.524823,0.837778,0.919512
1,BOW_food_RF,0.758446,0.717191,0.7,0.170732,0.633803,0.319149,0.776908,0.968293
2,BOW_food_XGB,0.810811,0.811737,0.5625,0.439024,0.631902,0.730496,0.904282,0.87561
3,BOW_service_LgstcRegression,0.798986,0.814815,0.4,0.96,0.927978,0.815085,0.810811,0.687023
4,BOW_service_RF,0.842905,0.829559,0.857143,0.36,0.851293,0.961071,0.803738,0.656489
5,BOW_service_XGB,0.853041,0.847672,0.789474,0.6,0.871622,0.941606,0.8,0.671756
6,BOW_ambience_LgstcRegression,0.790541,0.764644,0.0,0.0,0.776824,0.952632,0.84127,0.557895
7,BOW_ambience_RF,0.829392,0.811337,0.0,0.0,0.838095,0.926316,0.80814,0.731579
8,BOW_ambience_XGB,0.824324,0.810299,0.0,0.0,0.832941,0.931579,0.8375,0.705263
9,TFIDF_food_LgstcRegression,0.702703,0.635918,0.300885,0.829268,0.8,0.028369,0.797468,0.921951


# training results

In [51]:
# training rest
train_results = res.columns[res.columns.str.contains("train")]
train_results = ["model_name"] + train_results.to_list()
res[train_results]

Unnamed: 0,model_name,train_accuracy,train_weighted_avg_f1,train_NEG_precision,train_NEG_recall,train_NEU_precision,train_NEU_recall,train_POS_precision,train_POS_recall
0,BOW_food_LgstcRegression,0.828929,0.819172,0.822222,0.56705,0.734767,0.585714,0.854995,0.958673
1,BOW_food_RF,0.7925,0.766755,0.943089,0.444444,0.756477,0.417143,0.790485,0.984774
2,BOW_food_XGB,0.8925,0.892516,0.895928,0.758621,0.789688,0.831429,0.933225,0.934747
3,BOW_service_LgstcRegression,0.785,0.799705,0.436508,0.982143,0.931985,0.77602,0.747212,0.717857
4,BOW_service_RF,0.880714,0.876162,0.92973,0.614286,0.889678,0.958673,0.82505,0.741071
5,BOW_service_XGB,0.918571,0.917216,0.946281,0.817857,0.925672,0.965816,0.877193,0.803571
6,BOW_ambience_LgstcRegression,0.820357,0.799483,1.0,0.010753,0.809633,0.955604,0.857835,0.617442
7,BOW_ambience_RF,0.846429,0.833233,1.0,0.053763,0.861041,0.922577,0.810049,0.768605
8,BOW_ambience_XGB,0.905714,0.902426,0.960784,0.526882,0.895657,0.971305,0.928954,0.805814
9,TFIDF_food_LgstcRegression,0.730357,0.655686,0.409449,0.996169,1.0,0.045714,0.821847,0.953235


# Best model on weighted f1 on test set

In [61]:
res[res.model_name.isin(["tfidf_food_XGB","BOW_service_XGB","TFIDF_ambience_RF"])][["model_name","train_accuracy","test_accuracy","train_weighted_avg_f1","test_weghted_avg_f1"]]

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_weighted_avg_f1,test_weghted_avg_f1
5,BOW_service_XGB,0.918571,0.853041,0.917216,0.847672
11,tfidf_food_XGB,0.914643,0.822635,0.914695,0.823495
16,TFIDF_ambience_RF,0.864286,0.834459,0.853256,0.816743


In [62]:
res[res.model_name.isin(["tfidf_food_XGB","BOW_service_XGB","TFIDF_ambience_RF"])][["model_name","test_POS_precision","test_POS_recall","test_NEG_precision","test_NEG_recall","test_NEU_precision","test_NEU_recall"]]

Unnamed: 0,model_name,test_POS_precision,test_POS_recall,test_NEG_precision,test_NEG_recall,test_NEU_precision,test_NEU_recall
5,BOW_service_XGB,0.8,0.671756,0.789474,0.6,0.871622,0.941606
11,tfidf_food_XGB,0.910448,0.892683,0.540541,0.487805,0.660131,0.716312
16,TFIDF_ambience_RF,0.806818,0.747368,0.0,0.0,0.846154,0.926316


In [60]:
res[res.model_name.str.contains("BERT")][["model_name","test_accuracy","test_weghted_avg_f1"]]

Unnamed: 0,model_name,test_accuracy,test_weghted_avg_f1
18,food_BERT,0.410473,0.389114
19,service_BERT,0.508446,0.525009
20,ambience_BERT,0.773649,0.77989


In [65]:
# import os
# import pickle


# # make sure dun have any folder already named "models" with stuff in it 
# path = os.getcwd() + "/extraction_1_models"
# if "extraction_1_models" not in os.listdir():
#     os.mkdir(path)

# for model_name, model in models.items():
#     filename = path + "/" +  model_name + ".pkg"
#     pickle.dump(model[0], open(filename, "wb"))
    

In [66]:
# encodings = {
#     "bow_food": food_aspect_bow,
#     "bow_service": service_aspect_bow,
#     "bow_ambience": ambience_aspect_bow,
#     "tfidf_food": food_aspect_tfidf,
#     "tfidf_service": service_aspect_tfidf,
#     "tfidf_ambience": ambience_aspect_tfidf
# }

# for name, encoder in  encodings.items():
#     filename = path + "/encoding_" + name + ".pkg"
#     pickle.dump(encoder, open(filename, "wb"))