In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.formula.api import ols
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from math import sqrt

import scipy, matplotlib
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings

# Read the raw video-game sales table from the CSV in the working directory.
Data = pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')

# Apply Seaborn's default styling to subsequent graphics.
sb.set()

# Text summaries: the dtype of each column, then describe() for the Platform
# column, terminated by one empty line.
print(Data.dtypes)
print(Data["Platform"].describe(), end="\n\n")

Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score         float64
User_Count         float64
Developer           object
Rating              object
dtype: object
count     16719
unique       31
top         PS2
freq       2161
Name: Platform, dtype: object



### As there are too many publishers, we decided to narrow the analysis down to the 20 most frequent ones

In [2]:
# Names of the 20 publishers with the most rows in Data, most frequent first.
# NOTE(review): the printed list includes the placeholder publisher 'Unknown'.
publisher_counts = Data["Publisher"].value_counts()
Publisher_list = publisher_counts.head(20).index.tolist()
print(Publisher_list)

# Keep only the rows whose publisher is in that list, then summarise the
# numeric columns of the reduced frame.
mainData = Data.loc[Data["Publisher"].isin(Publisher_list)]
mainData.describe()

['Electronic Arts', 'Activision', 'Namco Bandai Games', 'Ubisoft', 'Konami Digital Entertainment', 'THQ', 'Nintendo', 'Sony Computer Entertainment', 'Sega', 'Take-Two Interactive', 'Capcom', 'Atari', 'Tecmo Koei', 'Warner Bros. Interactive Entertainment', 'Square Enix', 'Disney Interactive Studios', 'Unknown', 'Midway Games', 'Eidos Interactive', 'Microsoft Game Studios']


Unnamed: 0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count
count,10588.0,10791.0,10791.0,10791.0,10791.0,10791.0,5929.0,5929.0,5645.0,5645.0
mean,2006.554212,0.34675,0.192842,0.094904,0.064007,0.698692,70.648001,28.576826,7.187458,174.446767
std,5.705269,0.980995,0.606857,0.356509,0.227078,1.868017,13.176693,19.764714,1.44047,557.8346
min,1980.0,0.0,0.0,0.0,0.0,0.01,21.0,3.0,0.0,4.0
25%,2003.0,0.02,0.0,0.0,0.0,0.1,62.0,13.0,6.5,11.0
50%,2007.0,0.12,0.04,0.0,0.01,0.25,72.0,24.0,7.5,27.0
75%,2010.0,0.33,0.16,0.05,0.05,0.635,80.0,39.0,8.2,94.0
max,2020.0,41.36,28.96,10.22,10.57,82.53,98.0,113.0,9.7,10665.0


The first issue with the dataset was the presence of NaN values in different columns of data. 

We decided to clean the dataset by removing the rows of data that contained these NaN values. 

Another issue that we found was the presence of a very big outlier, as shown in the boxplot above, which belonged to the data for ‘Wii Sports’. 

We felt that this data point would greatly distort the model relating the variables to Global Sales, so we removed this particular outlier. 
Below is the code snippet that we used to achieve this.

In [3]:
# Clean the publisher-filtered frame, then one-hot encode its categorical columns.

# Remove the 'Wii Sports' outlier by name. The earlier `mainData.drop(0)`
# relied on that game sitting at index label 0; because the index is reset
# below, running this cell a second time would have dropped whichever game
# had moved into position 0 instead.
mainData = mainData[mainData["Name"] != "Wii Sports"]

# dropna() discards every row that has a NaN in *any* column (not only the
# critic scores). The index is then renumbered from 0 without keeping the old
# labels as an extra column.
mainData = mainData.dropna().reset_index(drop=True)

# One-hot encode the categorical predictors: one indicator column per distinct
# value. The data_dummy* names are kept at module level in case later cells
# use them.
data_dummy1 = pd.get_dummies(mainData["Platform"])
data_dummy2 = pd.get_dummies(mainData["Genre"])
data_dummy3 = pd.get_dummies(mainData["Rating"])
data_dummy4 = pd.get_dummies(mainData["Developer"])
# Year_of_Release is stored as float, so these indicator columns get float
# labels such as 2008.0 rather than strings.
data_dummy5 = pd.get_dummies(mainData["Year_of_Release"])

# NOTE(review): Genre includes a 'Platform' category, so data_dummy2 adds a
# second column labelled 'Platform' next to the original text column; after
# this concat, mainData["Platform"] selects both. Passing prefix= to
# get_dummies would avoid the clash but renames the indicator columns, so it
# is deliberately not changed here -- confirm against the later cells.
mainData = pd.concat(
    [mainData, data_dummy1, data_dummy2, data_dummy3, data_dummy4, data_dummy5],
    axis=1,
)
mainData.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,2007.0,2008.0,2009.0,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0
0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,...,0,1,0,0,0,0,0,0,0,0
1,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,...,0,0,1,0,0,0,0,0,0,0
2,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.5,2.88,29.8,...,0,0,0,0,0,0,0,0,0,0
3,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,...,0,0,0,0,0,0,0,0,0,0
4,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.44,6.94,4.7,2.24,28.32,...,0,0,1,0,0,0,0,0,0,0
