In [1]:
#COS30019 - Assignment 2
#Video Game Sales Prediction
#Leslie Ling (4331990)
#Hwang Chae Hyun (100070579)


#scikit-learn 

import numpy as np
import pandas as pd

#graph plotting
import matplotlib.pyplot as plt
import matplotlib

#training & test, cross validation
from sklearn.model_selection import train_test_split, cross_val_score, KFold 

#clasification
from sklearn.metrics 		 import accuracy_score, classification_report

#algorithms
from sklearn.tree 			 import DecisionTreeClassifier
from sklearn.neighbors 		 import KNeighborsClassifier
from sklearn.naive_bayes 	 import GaussianNB
from sklearn.svm 			 import SVC
from sklearn.preprocessing 	 import LabelEncoder
from sklearn.linear_model 	 import LogisticRegression
from sklearn.ensemble 		 import RandomForestClassifier




In [2]:
#read CSV
vg_data = pd.read_csv('vgsales.csv',low_memory=False)


In [3]:
#list CSV data
vg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16332 entries, 0 to 16331
Data columns (total 11 columns):
Rank            16332 non-null int64
Name            16332 non-null object
Platform        16332 non-null object
Year            16327 non-null object
Genre           16332 non-null object
Publisher       16295 non-null object
NA_Sales        16332 non-null float64
EU_Sales        16332 non-null float64
JP_Sales        16332 non-null float64
Other_Sales     16332 non-null float64
Global_Sales    16330 non-null float64
dtypes: float64(5), int64(1), object(5)
memory usage: 1.4+ MB


In [4]:
#Drop unneeded columns, only show global sales
vg_data = vg_data.drop(['Rank','Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], axis=1)

In [5]:
# Change data columns
def order(frame,var):
    varlist =[col for col in frame.columns if col not in var]
    frame = frame[var + varlist]
    return frame

#order the columns
vg_data = order(vg_data,['Year','Platform','Publisher', 'Genre','Global_Sales'])

In [6]:
# Filter only sports genre
sports_genre = (vg_data['Genre'] == "Sports")
vg_data = vg_data[sports_genre]
print (vg_data.Genre.value_counts())

Sports    2306
Name: Genre, dtype: int64


In [7]:
print ("Sports data (rows, columns):", vg_data.shape)
print ("Null values:\n")
print (vg_data.isnull().sum())
print ("______________\n")
print (vg_data.info())
print ("______________\n")

Sports data (rows, columns): (2306, 5)
Null values:

Year            2
Platform        0
Publisher       0
Genre           0
Global_Sales    0
dtype: int64
______________

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2306 entries, 5 to 16328
Data columns (total 5 columns):
Year            2304 non-null object
Platform        2306 non-null object
Publisher       2306 non-null object
Genre           2306 non-null object
Global_Sales    2306 non-null float64
dtypes: float64(1), object(4)
memory usage: 108.1+ KB
None
______________



In [8]:
# Table-lizing the raw data
# Previously there are 2306 rows of data but after grouping
# data that have the same year into one there will be 1124 rows
vg_sale = pd.pivot_table(vg_data,
                                index=['Publisher','Platform','Year'],
                                columns=['Genre'],
                                values=['Global_Sales'], aggfunc=sum)

vg_sale = vg_sale['Global_Sales'].reset_index()
print(vg_sale)
print (vg_sale.info())
print (vg_sale.shape)

Genre               Publisher Platform  Year  Sports
0                         3DO       PS  2000    0.58
1                         3DO       PS  2001    0.04
2                         3DO      PS2  2001    0.14
3                         3DO      PS2  2002    0.37
4                         3DO      PS2  2003    0.36
5                         3DO       XB  2003    0.08
6                     49Games     X360  2009    0.04
7                   505 Games       DS  2007    0.17
8                   505 Games       DS  2008    0.05
9                   505 Games       DS  2009    0.06
10                  505 Games       PC  2016    0.02
11                  505 Games      PS2  2005    0.12
12                  505 Games      PS2  2007    0.40
13                  505 Games      PS3  2010    0.78
14                  505 Games      PS3  2011    0.33
15                  505 Games      PS4  2016    0.23
16                  505 Games      PSP  2007    0.21
17                  505 Games      Wii  2007  

In [9]:
# Making report date the index
vg_sale.sort_values("Year", inplace=True)
vg_sale.set_index("Year", inplace=True)
vg_sale.index = pd.to_datetime(vg_sale.index)
vg_sale['year']  = vg_sale.index.year

In [10]:
#sports colum actaully refers to global_sales for sports, it will store global sales
vg_sale['Sports'].fillna(0, inplace=True)
vg_sale['Sports'] = vg_sale.Sports.astype(float)

In [11]:
print (vg_sale.info())
print ("______________\n")
print (vg_sale.shape)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1124 entries, 1980-01-01 to 2016-01-01
Data columns (total 4 columns):
Publisher    1124 non-null object
Platform     1124 non-null object
Sports       1124 non-null float64
year         1124 non-null int32
dtypes: float64(1), int32(1), object(2)
memory usage: 39.5+ KB
None
______________

(1124, 4)


In [12]:
# Creating data frame graphs by publisher, year
vg_sale.index.names = ['date']
vg_sale = vg_sale.reset_index()
sports_globalsales_groupby = vg_sale.groupby(['Publisher','date']).sum()


In [13]:
sports_globalsales_groupby = sports_globalsales_groupby.reset_index()

In [14]:
print (sports_globalsales_groupby.columns)
print (sports_globalsales_groupby.head(3))



Index(['Publisher', 'date', 'Sports', 'year'], dtype='object', name='Genre')
Genre Publisher       date  Sports  year
0           3DO 2000-01-01    0.58  2000
1           3DO 2001-01-01    0.18  4002
2           3DO 2002-01-01    0.37  2002


In [15]:
# Plotting the graph (graph style is ggplot)
matplotlib.style.use('ggplot')
def graph_draw(data, column, seq):
    row = data[column].unique()
    fig = plt.figure(figsize=(12,8))
    ax = fig.gca()
    for i in row:
        dataf = data[data[column] == i]
        dataf = dataf.set_index('date')
        
        ax.plot(dataf['Sports'], lw=1, linestyle='dashed', marker='o',markerfacecolor='red', markersize=5, label="Global Sales for %s"%i)

    plt.legend(loc='best', fontsize=10, bbox_to_anchor=(0., 1.02, 1., .102), ncol=3)
    plt.xlabel('\nTime progression in %s'%seq)
    plt.ylabel("Total global Sales\n")
    plt.show()

In [16]:
graph_draw(sports_globalsales_groupby, 'Publisher', 'years')

In [17]:

# We want to predict and train for global sales value, from vg_sale table we need to transform string data type of "publisher" and "platform" to digits
def convert(data):
    number = LabelEncoder()
    data['Publisher'] = number.fit_transform(data.Publisher)
    data['Platform'] = number.fit_transform(data.Platform)
    data=data.fillna(-999)
    return data

#We want to copy and convert all the dataset except unnecerssary datetime64 of "date" column.
encoded = convert(vg_sale.drop('date', axis=1))

#print out encode parameters
print("This is the dataset parameter from vg_sale table")
print(encoded.info())

vg_sale_data = encoded.values

# Training the machine
X = vg_sale_data[:,:3]
y = vg_sale_data[:, 1]
test_size = 0.4
rand_seed = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rand_seed)

This is the dataset parameter from vg_sale table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124 entries, 0 to 1123
Data columns (total 4 columns):
Publisher    1124 non-null int64
Platform     1124 non-null int64
Sports       1124 non-null float64
year         1124 non-null int32
dtypes: float64(1), int32(1), int64(2)
memory usage: 30.8 KB
None


In [18]:
# Algorithms
algorithms = []
algorithms.append(('GauNB', GaussianNB()))
algorithms.append(('Rfors', RandomForestClassifier()))
algorithms.append(('Dtree', DecisionTreeClassifier()))
algorithms.append(('SVMac', SVC()))
algorithms.append(('KNbor', KNeighborsClassifier()))
algorithms.append(('LogRg', LogisticRegression()))

# Cross validation with k-fold
k_folds = 20
scoring = 'accuracy'

# Run and print output for every algorithm
results = []
names   = []
for name, algorithm in algorithms:
    kfold = KFold(n_splits=k_folds, random_state=rand_seed, shuffle=False)
    cv_results = cross_val_score(algorithm, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    output = "%s:\t%f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(output)
    
# The box plot for algorithms used
fig = plt.figure(figsize=(12,10))
plt.xlabel('\nAlgorithms Used')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

GauNB:	0.992602 (0.012814)
Rfors:	0.796435 (0.082822)
Dtree:	0.998529 (0.006410)
SVMac:	0.669251 (0.091834)
KNbor:	0.508824 (0.095132)
LogRg:	0.310339 (0.076971)


In [19]:
# Display predictions from highest accuracy to lowest
print("\nPredictions from highest accuracy to lowest:")
print("_____________________________________________________\n")

# Decision Tree Classifier
dtree =  DecisionTreeClassifier().fit(X_train, y_train)
dtree_predictions = dtree.predict(X_test)
dtree_report = classification_report(y_test, dtree_predictions)
print("Decision Tree Classification Accuracy: %s" % accuracy_score(y_test, dtree_predictions))
print("Decision Tree Classification Report:\n\n%s\n" % dtree_report)
print("_____________________________________________________\n")

# Gaussian Naive Bayes
bayes = GaussianNB().fit(X_train, y_train)
bayes_predictions = bayes.predict(X_test)
bayes_report = classification_report(y_test, bayes_predictions)
print("Gaussian Naive Bayes Accuracy: %s" % accuracy_score(y_test, bayes_predictions))
print("Gaussian Naive Bayes Report:\n\n%s" % bayes_report)
print("_____________________________________________________\n")

#Random Forest Classifier
rfors = RandomForestClassifier().fit(X_train, y_train)
rfors_predictions = rfors.predict(X_test)
rfors_report = classification_report(y_test, rfors_predictions)
print("Random Forest Classification Accuracy: %s" % accuracy_score(y_test, rfors_predictions))
print("Random Forest Classification Report:\n\n%s" % rfors_report)
print("_____________________________________________________\n")

# Support Vector Classification
svm = SVC().fit(X_train, y_train)
svm_predictions = svm.predict(X_test)
svm_report = classification_report(y_test, svm_predictions)
print("Support Vector Classification Accuracy: %s" % accuracy_score(y_test, svm_predictions))
print("Support Vector Classification Report:\n\n%s" % svm_report)
print("_____________________________________________________\n")

# K Neighbors Classifier
kneighbor = KNeighborsClassifier().fit(X_train, y_train)
kneighbor_predictions = kneighbor.predict(X_test)
kneighbor_report = classification_report(y_test, kneighbor_predictions)
print("K Neighbors Classifier Accuracy: %s" % accuracy_score(y_test, kneighbor_predictions))
print("K Neighbors Classifier Report:\n\n%s" % kneighbor_report)
print("_____________________________________________________\n")


# Logistic Regression
logrg = LogisticRegression().fit(X_train, y_train)
logrg_predictions = logrg.predict(X_test)
logrg_report = classification_report(y_test, logrg_predictions)
print("Logistic Regression Classification Accuracy: %s" % accuracy_score(y_test, logrg_predictions))
print("Logistic Regression Classification Report:\n\n%s" % logrg_report)
print("_____________________________________________________\n")


Predictions from highest accuracy to lowest:
_____________________________________________________

Decision Tree Classification Accuracy: 0.993333333333
Decision Tree Classification Report:

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00         2
        1.0       0.62      1.00      0.77         5
        2.0       0.00      0.00      0.00         3
        3.0       1.00      1.00      1.00        31
        4.0       1.00      1.00      1.00         4
        5.0       1.00      1.00      1.00        17
        6.0       1.00      1.00      1.00        13
        7.0       1.00      1.00      1.00         1
        8.0       1.00      1.00      1.00        14
        9.0       1.00      1.00      1.00         4
       11.0       1.00      1.00      1.00        18
       12.0       1.00      1.00      1.00        46
       13.0       1.00      1.00      1.00        62
       14.0       1.00      1.00      1.00        40
       15.0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Support Vector Classification Accuracy: 0.651111111111
Support Vector Classification Report:

             precision    recall  f1-score   support

        0.0       1.00      0.50      0.67         2
        1.0       0.67      0.80      0.73         5
        2.0       0.00      0.00      0.00         3
        3.0       0.96      0.74      0.84        31
        4.0       0.00      0.00      0.00         4
        5.0       0.56      0.59      0.57        17
        6.0       0.80      0.62      0.70        13
        7.0       0.00      0.00      0.00         1
        8.0       0.91      0.71      0.80        14
        9.0       0.60      0.75      0.67         4
       11.0       1.00      0.50      0.67        18
       12.0       0.55      0.61      0.58        46
       13.0       0.64      0.66      0.65        62
       14.0       0.72      0.70      0.71        40
       15.0       1.00      0.09      0.17        11
       16.0       0.65      1.00      0.79        17
    