# Text Preprocessing

In [1]:
import pandas as pd
import math
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.metrics import mean_squared_error

[nltk_data] Downloading package stopwords to /home/felix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/felix/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# read data into pandas dataframe
path = "data/video_games.tsv.gz"
video_games = pd.read_csv(path, sep="\t", verbose=True, parse_dates=[14], on_bad_lines="skip")

Tokenization took: 381.15 ms
Type conversion took: 204.73 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 301.18 ms
Type conversion took: 210.35 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 288.56 ms
Type conversion took: 203.96 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 267.04 ms
Type conversion took: 195.00 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 254.21 ms
Type conversion took: 192.30 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 285.15 ms
Type conversion took: 206.40 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 318.69 ms
Type conversion took: 228.17 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 299.66 ms
Type conversion took: 215.99 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 338.64 ms
Type conversion took: 249.24 ms
Parser memory cleanup took: 0.01 ms
Tokenization took: 476.82 ms
Type conversion took: 303.05 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 382.67 ms
T

In [3]:
video_games = video_games[["review_headline", "review_body", "star_rating"]]

video_games.head()

Unnamed: 0,review_headline,review_body,star_rating
0,an amazing joystick. I especially love that yo...,"Used this for Elite Dangerous on my mac, an am...",5
1,Definitely a silent mouse... Not a single clic...,"Loved it, I didn't even realise it was a gami...",5
2,One Star,poor quality work and not as it is advertised.,1
3,"good, but could be bettee","nice, but tend to slip away from stick in inte...",3
4,Great but flawed.,"Great amiibo, great for collecting. Quality ma...",4


In [4]:
# binary sentiment values
# star ratings of 1-3 -> 0 (negative)
# star ratings of 4-5 -> 1 (positive)

video_games["sentiment"] = video_games["star_rating"].map({1: 0, 2: 0, 3: 0, 4: 1, 5: 1})

video_games.head()

Unnamed: 0,review_headline,review_body,star_rating,sentiment
0,an amazing joystick. I especially love that yo...,"Used this for Elite Dangerous on my mac, an am...",5,1
1,Definitely a silent mouse... Not a single clic...,"Loved it, I didn't even realise it was a gami...",5,1
2,One Star,poor quality work and not as it is advertised.,1,0
3,"good, but could be bettee","nice, but tend to slip away from stick in inte...",3,0
4,Great but flawed.,"Great amiibo, great for collecting. Quality ma...",4,1


## Text Preprocessing

In [5]:
# functions used for preprocessing

def tokenize_words(text):
    tokenized_text = word_tokenize(text)
    return tokenized_text

def remove_punctuation(text):
    text = str(text)
    punctiations = string.punctuation
    return text.translate(str.maketrans('', '', punctiations))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(plain_text):
    return ' '.join([word for word in plain_text.split() if word not in STOPWORDS])

def remove_spec_char(text):
    text = str(text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

#lemmatizer = WordNetLemmatizer()
#wordnet_map = {'N':wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}
#def lemmatize_word(plain_text):
#    # Finind pos tags
#    pos_text = pos_tag(plain_text.split())
#    return ' '.join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

ps = PorterStemmer()
def stem_words(text):
    return ' '.join([ps.stem(word) for word in text.split()])


In [6]:
# lowercase words
video_games["review_headline"] = video_games["review_headline"].str.lower()
# remove punctuation,...
video_games["review_headline"] = video_games["review_headline"].apply(remove_punctuation)
# ...special characters,...
video_games["review_headline"] = video_games["review_headline"].apply(remove_spec_char)
# ...and stopwords
video_games["review_headline"] = video_games["review_headline"].apply(remove_stopwords)
# apply stemming
video_games["review_headline"] = video_games["review_headline"].apply(stem_words)

video_games.head()

Unnamed: 0,review_headline,review_body,star_rating,sentiment
0,amaz joystick especi love twist,"Used this for Elite Dangerous on my mac, an am...",5,1
1,definit silent mous singl click heard,"Loved it, I didn't even realise it was a gami...",5,1
2,one star,poor quality work and not as it is advertised.,1,0
3,good could bette,"nice, but tend to slip away from stick in inte...",3,0
4,great flaw,"Great amiibo, great for collecting. Quality ma...",4,1


## Creating train, validation and test sets

In [7]:
# create train, validation and test splits
train_df, test_df = train_test_split(video_games, test_size=0.3)
train_df, val_df = train_test_split(train_df, test_size=0.3)

print(len(train_df))
print(len(val_df))
print(len(test_df))

872330
373857
534081


In [8]:
# bindary sentiment values
y_train_bin = train_df["sentiment"].values
y_val_bin = val_df["sentiment"].values
y_test_bin = test_df["sentiment"].values

# star ratings
y_train_stars = train_df["star_rating"].values
y_val_stars = val_df["star_rating"].values
y_test_stars = test_df["star_rating"].values

## Feature generation - binary word occurence vectors (only review headlines)

In [9]:
# binary word occurence vectors
vectorizer = CountVectorizer(binary=True)

X_train = vectorizer.fit_transform(train_df["review_headline"])
X_val = vectorizer.transform(val_df["review_headline"])
X_test = vectorizer.transform(test_df["review_headline"])

print(f"number of terms: {len(vectorizer.get_feature_names_out())}")

number of terms: 54062


## Stupid Baseline Model

**Predicting the most common class...**

### Predicting Sentiment (star ratings 1, 2 and 3: negative; star ratings 4 and 5: positive)

In [13]:
print("train set:")
print(f"acc: {accuracy_score(y_train_bin, [1 for _ in range(len(y_train_bin))])}")
print(f"pre: {precision_score(y_train_bin, [1 for _ in range(len(y_train_bin))])}")
print(f"rec: {recall_score(y_train_bin, [1 for _ in range(len(y_train_bin))])}")
print("validation set:")
print(f"acc: {accuracy_score(y_val_bin, [1 for _ in range(len(y_val_bin))])}")
print(f"pre: {precision_score(y_val_bin, [1 for _ in range(len(y_val_bin))])}")
print(f"rec: {recall_score(y_val_bin, [1 for _ in range(len(y_val_bin))])}")
print("test set:")
print(f"acc: {accuracy_score(y_test_bin, [1 for _ in range(len(y_test_bin))])}")
print(f"pre: {precision_score(y_test_bin, [1 for _ in range(len(y_test_bin))])}")
print(f"rec: {recall_score(y_test_bin, [1 for _ in range(len(y_test_bin))])}")

train set:
acc: 0.7531736842708608
pre: 0.7531736842708608
rec: 1.0
validation set:
acc: 0.7529456449926041
pre: 0.7529456449926041
rec: 1.0
test set:
acc: 0.7529138838490791
pre: 0.7529138838490791
rec: 1.0


### Predicting Star Ratings

In [18]:
print("train set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_train_stars, [5 for _ in range(len(y_train_stars))]))}")
print("validation set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_val_stars, [5 for _ in range(len(y_val_stars))]))}")
print("test set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_test_stars, [5 for _ in range(len(y_test_stars))]))}")

train set:
rmse: 1.6503482381086434
validation set:
rmse: 1.6501169726009062
test set:
rmse: 1.650271459302246


## Naive Bayes Model

### Predicting Sentiment (star ratings 1, 2 and 3: negative; star ratings 4 and 5: positive)

In [10]:
naive_bayes = BernoulliNB()

naive_bayes.fit(X_train, y_train_bin)

y_pred_bin_train = naive_bayes.predict(X_train)
y_pred_bin_val = naive_bayes.predict(X_val)
y_pred_bin_test = naive_bayes.predict(X_test)

print("train set:")
print(f"acc: {accuracy_score(y_train_bin, y_pred_bin_train)}")
print(f"pre: {precision_score(y_train_bin, y_pred_bin_train)}")
print(f"rec: {recall_score(y_train_bin, y_pred_bin_train)}")
print(classification_report(y_train_bin, y_pred_bin_train))
print("validation set:")
print(f"acc: {accuracy_score(y_val_bin, y_pred_bin_val)}")
print(f"pre: {precision_score(y_val_bin, y_pred_bin_val)}")
print(f"rec: {recall_score(y_val_bin, y_pred_bin_val)}")
print(classification_report(y_val_bin, y_pred_bin_val))
print("test set:")
print(f"acc: {accuracy_score(y_test_bin, y_pred_bin_test)}")
print(f"pre: {precision_score(y_test_bin, y_pred_bin_test)}")
print(f"rec: {recall_score(y_test_bin, y_pred_bin_test)}")
print(classification_report(y_test_bin, y_pred_bin_test))

train set:
acc: 0.8474854699482994
pre: 0.874927908524325
rec: 0.9305237619783993
              precision    recall  f1-score   support

           0       0.74      0.59      0.66    215314
           1       0.87      0.93      0.90    657016

    accuracy                           0.85    872330
   macro avg       0.81      0.76      0.78    872330
weighted avg       0.84      0.85      0.84    872330

validation set:
acc: 0.838539869522304
pre: 0.8670528674578803
rec: 0.9278279465992171
              precision    recall  f1-score   support

           0       0.72      0.57      0.63     92363
           1       0.87      0.93      0.90    281494

    accuracy                           0.84    373857
   macro avg       0.79      0.75      0.77    373857
weighted avg       0.83      0.84      0.83    373857

test set:
acc: 0.8379290781735355
pre: 0.8666243760979282
rec: 0.9274837920306777
              precision    recall  f1-score   support

           0       0.72      0.57      0

### Predicting Star Ratings

In [16]:
naive_bayes_stars = BernoulliNB()

naive_bayes_stars.fit(X_train, y_train_stars)

y_pred_stars_train= naive_bayes_stars.predict(X_train)
y_pred_stars_val = naive_bayes_stars.predict(X_val)
y_pred_stars_test = naive_bayes_stars.predict(X_test)

print("train set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_train_stars, y_pred_stars_train))}")
print(classification_report(y_train_stars, y_pred_stars_train))
print("validation set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_val_stars, y_pred_stars_val))}")
print(classification_report(y_val_stars, y_pred_stars_val))
print("test set:")
print(f"rmse: {math.sqrt(mean_squared_error(y_test_stars, y_pred_stars_test))}")
print(classification_report(y_test_stars, y_pred_stars_test))

train set:
rmse: 1.2458439155874828
              precision    recall  f1-score   support

           1       0.56      0.57      0.57     94008
           2       0.40      0.08      0.13     46149
           3       0.56      0.30      0.39     75157
           4       0.52      0.32      0.40    155824
           5       0.73      0.92      0.81    501192

    accuracy                           0.68    872330
   macro avg       0.55      0.44      0.46    872330
weighted avg       0.64      0.68      0.64    872330

validation set:
rmse: 1.2732793121766508
              precision    recall  f1-score   support

           1       0.54      0.54      0.54     40184
           2       0.25      0.04      0.07     19936
           3       0.50      0.26      0.34     32243
           4       0.47      0.28      0.35     66630
           5       0.72      0.91      0.80    214864

    accuracy                           0.66    373857
   macro avg       0.50      0.41      0.42    373857
