# Pre-Processing

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Load and create dataframe to pre-process

In [2]:
# Read the cleaned Metacritic review data; the first CSV column becomes the row index.
# Printing the column names makes it easy to spot identifier-like columns.
df = pd.read_csv('MetacriticGameReviewsClean.csv', index_col=0)
print(df.columns)

# Rename the genre label 'Shoot' to 'Shooter'
df['genre'] = df['genre'].replace({'Shoot': 'Shooter'})

Index(['title', 'platform', 'metascore', 'metasentiment', 'average_userscore',
       'average_usersentiment', 'developer', 'genre', 'number_of_players',
       'esrb_rating', 'release_date', 'username', 'userscore', 'usersentiment',
       'review', 'review_date', 'clean_text', 'language', 'word_count',
       'sentence_count', 'exclamation_count', 'question_count', 'period_count',
       'reading_level', 'syllable_count'],
      dtype='object')


In [3]:
# Build the modelling frame: drop identifier / free-text columns and the
# `usersentiment` label so that only candidate features remain.
# NOTE(review): `userscore` is kept here while `usersentiment` is dropped — if the
# sentiment label is derived from the score this leaks the target; confirm.
data = df.drop(
    [
        'title',
        'developer',
        'release_date',
        'username',
        'usersentiment',
        'review',
        'review_date',
        'clean_text',
    ],
    axis='columns',
)

### Standardize and encode categorical features

In [4]:
# Select all non-object (numeric) columns and standardize their values.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any train/test
# split — confirm that is acceptable if `predictors` is later used for evaluation.
data_num = data.select_dtypes(exclude='object')
SS = StandardScaler()
scaled = SS.fit_transform(data_num)
# Keep the original row index. Without `index=` the new frame gets a fresh 0..n-1
# index, and the index-aligned concat below would mismatch rows (and pad with NaN)
# whenever the index loaded from the CSV is not exactly 0..n-1.
scaled_df = pd.DataFrame(data=scaled, columns=data_num.columns, index=data_num.index)

# Create dummy variables for the object (categorical) columns;
# drop_first=True omits the first level of each column.
data_obj = data.select_dtypes(include='object')
dummies = pd.get_dummies(data_obj, drop_first=True)

# Concatenate standardized and dummy features column-wise (rows align on the index)
predictors = pd.concat([scaled_df, dummies], axis=1)

# Show every column when displaying wide frames
pd.set_option('display.max_columns', None)
predictors.head()

Unnamed: 0,metascore,average_userscore,userscore,word_count,sentence_count,exclamation_count,question_count,period_count,reading_level,syllable_count,platform_Switch,platform_Xbox One,metasentiment_negative,metasentiment_positive,average_usersentiment_negative,average_usersentiment_positive,genre_Other,genre_RPG,genre_Shooter,genre_Sports,number_of_players_singleplayer,esrb_rating_E10+,esrb_rating_M,esrb_rating_T,language_ar,language_bg,language_ca,language_cs,language_cy,language_da,language_de,language_el,language_en,language_es,language_et,language_fi,language_fr,language_id,language_it,language_ja,language_ko,language_lt,language_nl,language_no,language_pl,language_pt,language_ro,language_ru,language_sk,language_so,language_sq,language_sv,language_sw,language_tl,language_tr,language_uk,language_vi,language_zh-cn
0,1.922673,0.637296,0.057479,0.233446,0.159792,-0.174746,-0.218959,0.072435,-0.047344,0.287327,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.922673,0.637296,0.057479,-0.362197,-0.194813,-0.174746,-0.218959,-0.281347,-0.044224,-0.313695,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.922673,0.637296,0.057479,1.274317,1.460011,0.129021,0.665319,2.372019,-0.039767,1.352374,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.922673,0.637296,-0.530577,1.280333,1.105406,-0.174746,2.433876,0.691554,0.300311,1.19328,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.922673,0.637296,-0.236549,2.85668,2.878432,0.736557,3.318154,3.079583,0.107318,2.788641,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find best parameters for count and TF-IDF vectorizers

In [5]:
# Model input: the cleaned review text
X = df.clean_text

# Encode the sentiment label as an ordinal integer.
# An explicit mapping is used instead of `Series.replace` because replacing strings
# with ints relies on implicit object->int downcasting, which newer pandas deprecates.
sentiment_codes = {'negative': 0, 'mixed': 1, 'positive': 2}
y = df.usersentiment.map(sentiment_codes)
if y.isna().any():
    # Fail early and clearly instead of passing missing labels to the classifiers
    raise ValueError('usersentiment contains values other than negative/mixed/positive')
y = y.astype('int64')

# Hold out 25% of the reviews for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Shared text-feature steps reused by the pipelines below:
# raw token counts followed by TF-IDF re-weighting
cv = CountVectorizer()
tfidf = TfidfTransformer()

In [7]:
# Multinomial naive Bayes on count -> TF-IDF features
multiNB = MultinomialNB()
steps = [('cv', cv), ('tfidf', tfidf), ('multiNB', multiNB)]
pipeline = Pipeline(steps)

# Only the CountVectorizer document-frequency cut-offs are searched
param_grid = [{'cv__min_df': [0.025, 0.05, 0.1],
               'cv__max_df': [0.95, 0.9, 0.85]}]

# 5-fold cross-validated grid search on the training split
clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

# Evaluate the refit best pipeline on the held-out split
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(clf.best_params_)

0.7040140296180826
{'cv__max_df': 0.95, 'cv__min_df': 0.025}


In [8]:
# k-nearest neighbours (k=3) on the same count -> TF-IDF features
knn = KNeighborsClassifier(n_neighbors=3)
steps = [
    ('cv', cv),
    ('tfidf', tfidf),
    ('knn', knn),
]
pipeline = Pipeline(steps)

# Search only the CountVectorizer document-frequency bounds
param_grid = [{
    'cv__min_df': [0.025, 0.05, 0.1],
    'cv__max_df': [0.95, 0.9, 0.85],
}]

# 5-fold cross-validated grid search on the training split
clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

# Score the refit best pipeline on the held-out split
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(clf.best_params_)

0.5600155884645363
{'cv__max_df': 0.95, 'cv__min_df': 0.1}


In [9]:
# Support vector classifier (default settings) on count -> TF-IDF features
svc = SVC()
steps = [('cv', cv), ('tfidf', tfidf), ('svc', svc)]
pipeline = Pipeline(steps)

# Candidate document-frequency thresholds for the CountVectorizer
min_df_options = [0.025, 0.05, 0.1]
max_df_options = [0.95, 0.9, 0.85]
param_grid = [{'cv__min_df': min_df_options, 'cv__max_df': max_df_options}]

# 5-fold cross-validated grid search on the training split
clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
clf.fit(X_train, y_train)

# Accuracy of the refit best pipeline on the held-out split, plus the winning settings
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(clf.best_params_)

0.7498051441932969
{'cv__max_df': 0.95, 'cv__min_df': 0.025}


In [10]:
# Random forest on count -> TF-IDF features.
# The forest is seeded so that the cross-validation scores, the selected
# vectorizer parameters and the test accuracy are reproducible on re-run
# (same seed value as the train/test split).
rf = RandomForestClassifier(random_state=1)

steps = [('cv', cv),
         ('tfidf', tfidf),
         ('rf', rf)]

# Only the CountVectorizer document-frequency cut-offs are searched
param_grid = [{'cv__min_df': [0.025,0.05,0.1],
               'cv__max_df': [0.95,0.9,0.85]}]

# 5-fold cross-validated grid search on the training split
pipeline = Pipeline(steps)
clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5)

clf.fit(X_train, y_train)
# Evaluate the refit best pipeline on the held-out split
y_pred = clf.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(clf.best_params_)

0.7264224473889321
{'cv__max_df': 0.95, 'cv__min_df': 0.025}
