In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from bert_embedding import BertEmbedding

## Load preprocessed data

In [2]:
train = pd.read_pickle("data/df_train.pickle")
test = pd.read_pickle("data/df_test.pickle")

In [3]:
train

Unnamed: 0,review,rating
0,great speaker,3
1,great little,4
2,awesome,5
3,love,5
4,great device,5
...,...,...
6850,fun love,5
6851,lot fun,5
6852,buy gift husband problem set want return past ...,3
6853,set control light home thermostat love able se...,5


# 1. Logistic Regression + TF-IDF features

## Transform reviews into features (TF-IDF encoding)

In [4]:
vectoriser = TfidfVectorizer()

In [5]:
# Transfrom training data
X = vectoriser.fit_transform(train['review'])
y = train['rating']

In [6]:
X.shape

(6765, 3625)

In [8]:
# Transform test data
X_test = vectoriser.transform(test['review'])
y_test = test['rating']

In [9]:
X_test.shape

(3039, 3625)

## Hyperparameter tuning (Logistic Regression)

In [11]:
# Make validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.15, random_state = 100)

In [12]:
# Create a parameter grid
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': np.arange(0.1, 5, 0.2),
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced']   
}

In [13]:
# Create grid search object
clf = GridSearchCV(LogisticRegression(max_iter = 500), param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data
best_clf = clf.fit(X_train, y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  1.8min finished


In [14]:
best_clf.best_params_

{'C': 0.30000000000000004,
 'class_weight': 'balanced',
 'penalty': 'l2',
 'solver': 'liblinear'}

In [15]:
model = best_clf.best_estimator_

In [16]:
preds_val = model.predict(X_val)
preds_train = model.predict(X_train)

In [17]:
print("Training accuracy score: ", np.round(accuracy_score(y_train, preds_train), 4))
print("Validation accuracy score: ", np.round(accuracy_score(y_val, preds_val), 4))

Training accuracy score:  0.7689
Validation accuracy score:  0.7133


In [18]:
print("Training F1 score: ", np.round(f1_score(y_train, preds_train, average='weighted'), 4))
print("Validation F1 score: ", np.round(f1_score(y_val, preds_val, average='weighted'), 4))

Training F1 score:  0.7566
Validation F1 score:  0.7001


## Refit the best model and predict on test dataset

In [19]:
model = best_clf.best_estimator_

In [20]:
# Refit the model on the full training set
model.fit(X, y)

LogisticRegression(C=0.30000000000000004, class_weight='balanced', max_iter=500,
                   solver='liblinear')

In [21]:
preds_test = model.predict(X_test)

In [22]:
print("Accuracy score on the test set: ", np.round(accuracy_score(y_test, preds_test), 4))

Accuracy score on the test set:  0.6779


In [23]:
print("F1 score on the test set: ", np.round(f1_score(y_test, preds_test, average='weighted'), 4))

F1 score on the test set:  0.6834


# 2. Logistic Regression + BERT embeddings

## Transform reviews into features (embeddings)

In [24]:
train

Unnamed: 0,review,rating
0,great speaker,3
1,great little,4
2,awesome,5
3,love,5
4,great device,5
...,...,...
6850,fun love,5
6851,lot fun,5
6852,buy gift husband problem set want return past ...,3
6853,set control light home thermostat love able se...,5


In [25]:
bert_embedding = BertEmbedding()

In [26]:
def get_embedding(review):
    """Return mean of word embeddings for a reivew"""
    row_embeddings = bert_embedding(review.split('/n'))[0][1:]
    avg_embedding = np.mean(row_embeddings, axis=1)[0].tolist()
    return avg_embedding

In [27]:
%%time
# Get embeddings for training set
X_emb = np.array(list(train['review'].apply(get_embedding)))
y_emb = train['rating']

CPU times: user 1h 7min 16s, sys: 3min 29s, total: 1h 10min 46s
Wall time: 16min 34s


In [28]:
X_emb.shape

(6765, 768)

In [29]:
# Save train features to pickle file
train_emb = pd.DataFrame(X_emb)
train_emb['y'] = y_emb
train_emb.to_pickle("data/df_train_emb.pickle")

In [30]:
%%time
# Get embeddings for test set
X_emb_test = np.array(list(test['review'].apply(get_embedding)))
y_emb_test = test['rating']

CPU times: user 28min 19s, sys: 1min 19s, total: 29min 39s
Wall time: 6min 58s


In [31]:
X_emb_test.shape

(3039, 768)

In [32]:
# Save test features to pickle file
test_emb = pd.DataFrame(X_emb_test)
test_emb['y'] = y_emb_test
test_emb.to_pickle("data/df_test_emb.pickle")

## Hyperparameter tuning (Logistic Regression)

In [108]:
# Make validation split
X_train_emb, X_val_emb, y_train_emb, y_val_emb = train_test_split(X_emb, y_emb, test_size = 0.15, random_state = 100)

In [109]:
X_train_emb.shape

(5750, 768)

In [110]:
# Create a parameter grid
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': np.arange(0.1, 5, 0.5),
    'solver': ['saga'],
    'class_weight': ['balanced']   
}

In [111]:
# Create grid search object
clf = GridSearchCV(LogisticRegression(max_iter = 100), param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data
best_clf = clf.fit(X_train_emb, y_train_emb)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  9.4min finished


In [112]:
best_clf.best_params_

{'C': 3.6, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}

In [113]:
model = best_clf.best_estimator_

In [114]:
preds_val = model.predict(X_val_emb)
preds_train = model.predict(X_train_emb)

In [115]:
print("Training accuracy score: ", np.round(accuracy_score(y_train_emb, preds_train), 4))
print("Validation accuracy score: ", np.round(accuracy_score(y_val_emb, preds_val), 4))

Training accuracy score:  0.7294
Validation accuracy score:  0.5586


In [116]:
print("Training F1 score: ", np.round(f1_score(y_train_emb, preds_train, average='weighted'), 4))
print("Validation F1 score: ", np.round(f1_score(y_val_emb, preds_val, average='weighted'), 4))

Training F1 score:  0.7435
Validation F1 score:  0.6029


## Refit the best model and predict on test dataset

In [117]:
model = best_clf.best_estimator_

In [1]:
# Refit the model on the full training set
model.fit(X_emb, y_emb)

NameError: name 'model' is not defined

In [119]:
preds_test = model.predict(X_emb_test)

In [120]:
print("Accuracy score on the test set: ", np.round(accuracy_score(y_emb_test, preds_test), 4))

Accuracy score on the test set:  0.4877


In [121]:
print("F1 score on the test set: ", np.round(f1_score(y_emb_test, preds_test, average='weighted'), 4))

F1 score on the test set:  0.5556
