In [19]:
# Built-in/Generic Imports
import os
import sys
#

# Libraries
#import xgboost as xgb
import numpy as np
import pandas as pd
import scipy as sp
#

# Modules
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
#from sklearn.metrics import classification_report,confusion_matrix
#from sklearn.cross_validation import cross_val_score
#from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV

#Suppress scientific notation
np.set_printoptions(suppress=True)




In [20]:
#import file for dataset
vgsales = pd.read_csv('R:/Project Files/videogamesales/vgsales.csv')
vgsales

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [21]:
# separate the data from the target attributes
X = vgsales.drop(['Publisher','Genre','Year'],axis=1)
y = vgsales.Genre


In [22]:
#Generate feature name array
X_featurenames = np.asarray(X.columns.values)

X_featurenames



array(['Rank', 'Name', 'Platform', 'NA_Sales', 'EU_Sales', 'JP_Sales',
       'Other_Sales', 'Global_Sales'], dtype=object)

In [23]:
#Generate class names array
classnames = y.unique()

classnames


array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [24]:
le = LabelEncoder()

#Encoding Rank
#le.fit(X.Rank)
#X.Rank = le.transform(X.Rank)

#Encoding Name
le.fit(X.Name)
X.Name = le.transform(X.Name)

#Encoding Platform
le.fit(X.Platform)
X.Platform = le.transform(X.Platform)

#Encoding Genre - y
le.fit(y)
y = le.transform(y)



In [25]:
# standardize the data attributes
X = preprocessing.scale(X)

X


array([[-1.73207602,  1.5690442 ,  1.21567658, ..., 11.93805759,
        44.60608534, 52.86402505],
       [-1.73186733,  1.06354125, -0.57173047, ..., 21.76729621,
         3.82822442, 25.53250298],
       [-1.73165863, -0.06601715,  1.21567658, ..., 12.00272364,
        17.29711476, 22.69002469],
       ...,
       [ 1.7316147 ,  0.70758292,  0.02407188, ..., -0.25149161,
        -0.25486439, -0.33919426],
       [ 1.73182339, -0.23421724, -1.40585376, ..., -0.25149161,
        -0.25486439, -0.33919426],
       [ 1.73203209,  0.95521919, -1.16753282, ..., -0.25149161,
        -0.25486439, -0.33919426]])

In [26]:
# The relevant dataset is imported and split into its training and test sets in this cell.
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, test_size=0.20, random_state=42)

In [34]:
#tree = DecisionTreeClassifier(max_depth=25, random_state=42)

parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=5, n_jobs=1)
clf.fit(X=X_train, y=y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

tree_model.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree_model.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree_model.score(X_test, y_test)))

0.3453828591200962 {'max_depth': 19}
Accuracy on training set: 0.865
Accuracy on test set: 0.351


In [35]:
import graphviz

In [36]:
#In this cell, the data is exported out to a dot file in preparation of visualizing the generated decision tree.

tree.export_graphviz(tree_model, out_file="tree.dot", class_names=classnames,feature_names=X_featurenames, impurity=False, filled=True)

In [None]:
#In this cell, the previously imported dot file is processed using graphviz

#import graphviz
with open("tree.dot") as f:
 dot_graph = f.read()
display(graphviz.Source(dot_graph))

In [None]:
# read in data
#dtrain = xgb.DMatrix(X_train, label=y_train)
#dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# specify parameters

#param = {
    #'max_depth': 100,  # the maximum depth of each tree
    #'eta': 0.3,  # the training step for each iteration
    #'silent': 1,  # logging mode - quiet
    #'objective': 'multi:softprob',  # error evaluation for multiclass training
    #'num_class': 12}  # the number of classes that exist in this datset

In [None]:
#num_round = 100  # the number of training iterations

#bst = xgb.train(param, dtrain, num_round)

In [None]:
#preds = bst.predict(dtest)

#type(preds)

In [None]:
#print(preds)

In [None]:
#best_preds = np.asarray([np.argmax(line) for line in preds])
#best_preds

In [None]:
#print(precision_score(y_test, best_preds, average='macro'))

In [None]:
#print(confusion_matrix(y_test,best_preds))