# Baselines
This notebook loads the preprocessed Steam reviews data, fits the baseline models on the training split, and evaluates them on the dev split.

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import math

In [2]:
# Load the preprocessed train / dev / test splits, one DataFrame per CSV file,
# then preview the first rows of the training split.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ("steam_sample_train.csv", "steam_sample_dev.csv", "steam_sample_test.csv"),
)
train_df.head()

Unnamed: 0,Review,Sentiment
0,ruined my life,1.0
1,this game saved my virginity,1.0
2,do you like original games do you like games t...,1.0
3,easy to learn hard to master,1.0
4,nor revolver will play again,1.0


In [3]:
# Missing reviews are read in as NaN; swap them for empty strings so every
# entry in the "Review" column is a string.
train_df["Review"] = train_df["Review"].fillna("")
dev_df["Review"] = dev_df["Review"].fillna("")
test_df["Review"] = test_df["Review"].fillna("")

In [4]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
# `embed` is the callable later used to turn review text into embeddings.
embed = hub.load(handle="https://tfhub.dev/google/universal-sentence-encoder/4")

In [5]:
# create function to generate embeddings
def generate_embeddings(text_list, model, chunk_size=1000):
    """Embed ``text_list`` with ``model``, one chunk at a time.

    Embedding everything in a single call may take too much memory, so the
    input is split into consecutive slices of at most ``chunk_size`` items.
    Each slice is passed to ``model`` and the per-slice results are joined
    along axis 0, preserving input order.

    Args:
        text_list: Sequence of texts; must support ``len`` and slicing.
        model: Callable that receives one slice of ``text_list`` and returns
            its embeddings (presumably a tensor with one row per text --
            confirm against the model in use).
        chunk_size: Maximum number of texts per ``model`` call. Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        The ``model`` output itself when the input fits in a single chunk,
        otherwise the ``tf.concat`` of all chunk outputs along axis 0.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or ``text_list`` is
            empty (there is nothing to embed and no output to return).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if len(text_list) == 0:
        raise ValueError("text_list must contain at least one text")

    # Collect the per-chunk outputs first and join them once at the end;
    # concatenating inside the loop would re-copy the accumulated result on
    # every iteration.
    chunk_embeddings = [
        model(text_list[start:start + chunk_size])
        for start in range(0, len(text_list), chunk_size)
    ]

    # A single chunk needs no concatenation: hand back the model output as-is.
    if len(chunk_embeddings) == 1:
        return chunk_embeddings[0]
    return tf.concat(chunk_embeddings, 0)

In [6]:
# Embed the review text of each split (train, dev, test -- in that order) and
# convert each result to a NumPy array for the scikit-learn models.
train_embeddings, dev_embeddings, test_embeddings = (
    generate_embeddings(split_df["Review"].tolist(), embed).numpy()
    for split_df in (train_df, dev_df, test_df)
)

## Gaussian Naive Bayes
Fit the Gaussian Naive Bayes model to the training embeddings and evaluate it on the dev split.

In [7]:
# Train Gaussian Naive Bayes on the training embeddings (fit returns the
# fitted estimator), predict the dev split, and report the four metrics
# against the dev labels, one per line.
gnb = GaussianNB().fit(train_embeddings, train_df["Sentiment"])
dev_predictions = gnb.predict(dev_embeddings)
print(
    "Accuracy: {}".format(accuracy_score(dev_df["Sentiment"], dev_predictions)),
    "Precision: {}".format(precision_score(dev_df["Sentiment"], dev_predictions)),
    "Recall: {}".format(recall_score(dev_df["Sentiment"], dev_predictions)),
    "F1: {}".format(f1_score(dev_df["Sentiment"], dev_predictions)),
    sep="\n",
)

Accuracy: 0.8168964652795186
Precision: 0.9615829984333493
Recall: 0.8205172914850334
F1: 0.8854669761023646


## Support Vector Machine
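Fit the support vector machine model to the training embeddings and evaluate it on the dev split.

Note: in the recorded run below, recall is 1.0 and precision equals accuracy. That combination means the SVM predicted the positive class for every dev review, so these scores are those of a majority-class predictor.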

In [8]:
# Train a support vector classifier with default settings on the training
# embeddings (fit returns the fitted estimator), predict the dev split, and
# report the four metrics against the dev labels, one per line.
svm = SVC().fit(train_embeddings, train_df["Sentiment"])
dev_predictions = svm.predict(dev_embeddings)
print(
    "Accuracy: {}".format(accuracy_score(dev_df["Sentiment"], dev_predictions)),
    "Precision: {}".format(precision_score(dev_df["Sentiment"], dev_predictions)),
    "Recall: {}".format(recall_score(dev_df["Sentiment"], dev_predictions)),
    "F1: {}".format(f1_score(dev_df["Sentiment"], dev_predictions)),
    sep="\n",
)



Accuracy: 0.8626222110804713
Precision: 0.8626222110804713
Recall: 1.0
F1: 0.9262449528936743
