In [11]:
import pandas as pd
import numpy as np
import pickle

import pprint as pp
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix 
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import zscore 
from sklearn.model_selection import KFold

In [2]:
# Load the pickled game table.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open('gamewithbins.pkl', mode='rb') as pkl_file:
    game_df = pickle.load(pkl_file)

In [3]:
# Load the pickled topic table (presumably NMF topic weights, going by the file name).
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open('nmfdf.pkl', mode='rb') as pkl_file:
    topic_df = pickle.load(pkl_file)

In [4]:
# Give the eight topic columns readable names, all sharing the "top_" prefix.
# The assignment only succeeds if topic_df has exactly eight columns.
topic_df.columns = [
    "top_" + suffix
    for suffix in (
        "Game_Att", "Cards", "War", "Dice",
        "Party", "AbstractSt", "TilePl", "Family",
    )
]
topic_df

Unnamed: 0,top_Game_Att,top_Cards,top_War,top_Dice,top_Party,top_AbstractSt,top_TilePl,top_Family
1,0.089252,0.020393,0.013614,0.008835,0.042686,0.000000,0.000000,0.002569
2,0.021168,0.084448,0.001249,0.000000,0.000000,0.000000,0.000000,0.032647
3,0.035323,0.002290,0.010180,0.000000,0.001677,0.111365,0.102896,0.011952
4,0.064767,0.003299,0.007060,0.001130,0.000000,0.042180,0.008445,0.012634
5,0.047224,0.000000,0.003576,0.000000,0.009358,0.013468,0.080075,0.054948
...,...,...,...,...,...,...,...,...
271693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.121162,0.000000
271785,0.002131,0.002344,0.015125,0.000000,0.000000,0.001420,0.004556,0.045559
271869,0.000000,0.023892,0.000000,0.170552,0.023393,0.000000,0.000000,0.000000
271896,0.032694,0.005555,0.024477,0.010438,0.000000,0.000000,0.000000,0.000000


In [5]:
# Left join: every row of game_df is kept, and its 'id' value is looked up in
# topic_df's index. Games without a matching topic row get NaN in the topic columns.
df = game_df.merge(topic_df, how='left', left_on='id', right_index=True)

In [6]:
# Remove the columns that are not used as model inputs from here on.
df = df.drop(
    [
        'primary', 'median', 'playerage', 'podcast', 'label',
        'boardgamedesigner', 'boardgameartist', 'boardgamepublisher',
        'yearpublished', 'boardgamecategory', 'boardgamemechanic',
        'maxplayers', 'minplaytime', 'maxplaytime',
        'Expansion for Base-game', 'Pirates', 'Collectible Components', 'Storytelling',
        'Dice', 'Card Game', 'Spies/Secret Agents',
        'Party Game', 'Political', 'Animals',
        'rating_cat', 'siteviews', 'conv_rate', 'numgeeklists',
    ],
    axis=1,
)

In [7]:
# Discard every row that still has at least one missing value
# (row-wise dropping is the dropna default).
df = df.dropna()

In [8]:
# Target: the 'owned_cat' column.
y = df['owned_cat']
# Features: everything except the target and the 'id', 'bayesaverage' and 'Rank' columns.
X = df.drop(['owned_cat', 'id', 'bayesaverage', 'Rank'], axis=1)

In [9]:
# Two-stage split: hold out 20% as the test set, then take 25% of the remaining
# 80% as the validation set -> roughly 60% train / 20% validation / 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, random_state=43, test_size=0.25
)

In [13]:
# Train a shallow DecisionTreeClassifier (depth 3) on the training split.
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed because scikit-learn permutes the features at every split,
# so ties between equally good splits could otherwise be broken differently per run.
dtree_model = DecisionTreeClassifier(max_depth=3, random_state=42)
dtree_model.fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_val)

# Confusion matrix on the validation split (rows: true class, columns: predicted class).
cm = confusion_matrix(y_val, dtree_predictions)

In [14]:
# Display the confusion matrix most recently stored in `cm`
# (rows: true class, columns: predicted class).
cm

array([[554, 106,   0,  56,   2],
       [361, 139,   0, 129,  19],
       [215, 137,   0, 272,  55],
       [ 87,  56,   0, 347, 183],
       [  7,  10,   0, 133, 539]])

In [15]:
# k-nearest-neighbours classifier (k = 7), fitted on the training split
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# confusion matrix of the validation-split predictions
knn_predictions = knn.predict(X_val)
cm = confusion_matrix(y_val, knn_predictions)

# mean accuracy on the validation split (X_val / y_val), not the test split
accuracy = knn.score(X_val, y_val)
print(accuracy)

0.4479013795127678


In [16]:
# Display the confusion matrix most recently stored in `cm`
# (rows: true class, columns: predicted class).
cm

array([[431, 175,  80,  26,   6],
       [244, 204, 115,  74,  11],
       [146, 158, 175, 144,  56],
       [ 55,  91, 162, 246, 119],
       [  4,  18,  48, 149, 470]])

In [17]:
# Gaussian naive Bayes classifier, fitted on the training split
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# confusion matrix of the validation-split predictions
gnb_predictions = gnb.predict(X_val)
cm = confusion_matrix(y_val, gnb_predictions)

# mean accuracy on the validation split (X_val / y_val), not the test split
accuracy = gnb.score(X_val, y_val)
print(accuracy)

0.37217493395949514


In [18]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


Using TensorFlow backend.


In [None]:
# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

In [23]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)

In [24]:
# convert integers to dummy variables (i.e. one hot encoded)
# One-hot encode the integer class codes: one row per training sample,
# one column per class, as required by the categorical_crossentropy loss.
dummy_y_train = np_utils.to_categorical(encoded_Y)

In [27]:
# define baseline model
def baseline_model(input_dim=None, n_classes=None):
    """Build and compile the baseline network: one 8-unit ReLU hidden layer
    followed by a softmax output layer.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the column count of ``X_train``
        so the model keeps working when the feature set changes.
    n_classes : int, optional
        Number of output classes. Defaults to the column count of the
        one-hot encoded target ``dummy_y_train``.

    Returns
    -------
    A compiled keras ``Sequential`` model.
    """
    # Resolve the sizes lazily from the current data instead of hard-coding them.
    if input_dim is None:
        input_dim = X_train.shape[1]
    if n_classes is None:
        n_classes = dummy_y_train.shape[1]

    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. With
# batch_size=5, 5 epochs and 10 folds the fit performs a very large number of
# small gradient steps; consider a larger batch size if runtime is a problem.
estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=0)
# Fixed random_state so the shuffled folds are identical from run to run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
results = cross_val_score(estimator, X_train, dummy_y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

KeyboardInterrupt: 