# Baselines
This notebook loads the preprocessed Steam reviews data, fits the baseline models (Gaussian Naive Bayes and a support vector machine) on the training split, and evaluates them on the dev split.

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import math

In [2]:
# Load the three preprocessed, pre-split review datasets (train / dev / test).
train_df, dev_df, test_df = map(
    pd.read_csv,
    (
        "data/steam_processed_with_spelling_train.csv",
        "data/steam_processed_with_spelling_dev.csv",
        "data/steam_processed_with_spelling_test.csv",
    ),
)

# Show the first training rows as the cell output.
train_df.head()

Unnamed: 0,Review,Sentiment
0,an amazing game but your probably better off b...,1.0
1,great game in which you can also shoot some co...,1.0
2,because this is not ripping off agaric,0.0
3,if you are looking for a rift racer this is a ...,1.0
4,played on and fax and rob ram mhz default ultr...,1.0


In [3]:
# Replace null reviews with empty strings in every split so that each
# entry of the "Review" column is a string.
for split_df in (train_df, dev_df, test_df):
    review_is_null = split_df["Review"].isnull()
    split_df.loc[review_is_null, "Review"] = ""

In [4]:
# Download (or load from the local TF Hub cache) the Universal Sentence
# Encoder (USE) model, version 4.
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(use_model_url)

In [5]:
# create function to generate embeddings
def generate_embeddings(text_list, model, chunk_size=1000):
    """Embed a sequence of texts by running ``model`` over fixed-size chunks.

    Embedding every text in a single call may take too much memory, so the
    input is split into consecutive chunks of at most ``chunk_size`` items.
    ``model`` is called once per chunk, in order, and the per-chunk outputs
    are joined along axis 0.

    Args:
        text_list: Sized, sliceable sequence of texts (e.g. a list of str).
        model: Callable that maps a chunk of texts to its embeddings.
        chunk_size: Maximum number of texts passed to ``model`` per call.
            Defaults to 1000.

    Returns:
        The output of ``model`` when everything fits in one chunk; otherwise
        the result of ``tf.concat`` over all chunk outputs along axis 0.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or ``text_list`` is
            empty (there would be nothing to embed).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if len(text_list) == 0:
        raise ValueError("text_list must contain at least one text")

    # Collect the per-chunk outputs first and concatenate once at the end;
    # concatenating inside the loop would re-copy the growing result on
    # every iteration.
    chunk_embeddings = [
        model(text_list[start:start + chunk_size])
        for start in range(0, len(text_list), chunk_size)
    ]

    # A single chunk needs no concatenation: return the model output as-is.
    if len(chunk_embeddings) == 1:
        return chunk_embeddings[0]

    return tf.concat(chunk_embeddings, 0)

In [None]:
# Embed the review text of each split (train, dev, test, in that order)
# and convert each result to a NumPy array via ``.numpy()``.
train_embeddings, dev_embeddings, test_embeddings = (
    generate_embeddings(split_df["Review"].tolist(), embed).numpy()
    for split_df in (train_df, dev_df, test_df)
)

## Gaussian Naive Bayes
Fit a Gaussian Naive Bayes model to the training embeddings and evaluate it on the dev set.

In [None]:
# Train Gaussian Naive Bayes on the training embeddings, predict the dev
# split, and report the classification metrics on the dev labels.
gnb = GaussianNB().fit(train_embeddings, train_df["Sentiment"])
dev_predictions = gnb.predict(dev_embeddings)

dev_labels = dev_df["Sentiment"]
for template, metric in (
    ("Accuracy: {}", accuracy_score),
    ("Precision: {}", precision_score),
    ("Recall: {}", recall_score),
    ("F1: {}", f1_score),
):
    print(template.format(metric(dev_labels, dev_predictions)))

## Support Vector Machine
Fit a support vector machine to the training embeddings and evaluate it on the dev set.

In [None]:
# Train a support vector classifier (default settings) on the training
# embeddings, predict the dev split, and report the classification metrics
# on the dev labels.
svm = SVC().fit(train_embeddings, train_df["Sentiment"])
dev_predictions = svm.predict(dev_embeddings)

dev_labels = dev_df["Sentiment"]
for template, metric in (
    ("Accuracy: {}", accuracy_score),
    ("Precision: {}", precision_score),
    ("Recall: {}", recall_score),
    ("F1: {}", f1_score),
):
    print(template.format(metric(dev_labels, dev_predictions)))