Sourcing and Loading Data 

In [1]:
import numpy as np
import pandas as pd 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns 
import statsmodels.api as sm 
from statsmodels.graphics.api import abline_plot 
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model, preprocessing 
import warnings # For handling error messages 
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd") 

In [2]:
data = pd.read_csv('VideoGamesData.csv') 

In [3]:
data.head() 

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [4]:
data.tail() 

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.0,0.0,0.01,0.0,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.0,0.01,0.0,0.0,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.0,0.0,0.01,0.0,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.0,0.0,0.0,0.01,,,,,,
16718,Winning Post 8 2016,PSV,2016.0,Simulation,Tecmo Koei,0.0,0.0,0.01,0.0,0.01,,,,,,


In [5]:
# remove unneeded columns
df = pd.DataFrame(data, columns= ['Name', 'Platform', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']) 

In [6]:
df.head() 

Unnamed: 0,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Wii Sports,Wii,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53
1,Super Mario Bros.,NES,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52
3,Wii Sports Resort,Wii,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


Cleaning, transforming, and visualizing Dataframe 

In [7]:
df.dtypes 

Name             object
Platform         object
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

Check for/remove nulls

In [8]:
df.isnull().values.any() 

True

In [9]:
df = df.dropna() 
df.isnull().values.any() 

False

Check for/remove duplicates 

In [10]:
df.duplicated().sum() 

0

Summary 

In [11]:
df.describe() 

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16663.0,16663.0,16663.0,16663.0,16663.0
mean,0.263598,0.145202,0.077805,0.047446,0.534302
std,0.814619,0.504016,0.309306,0.186985,1.550158
min,0.0,0.0,0.0,0.0,0.01
25%,0.0,0.0,0.0,0.0,0.06
50%,0.08,0.02,0.0,0.01,0.17
75%,0.24,0.11,0.04,0.03,0.47
max,41.36,28.96,10.22,10.57,82.53


Pre-Processing 

In [12]:
df = pd.DataFrame(df, columns= ['Platform', 'Genre', 'Publisher', 'NA_Sales']) 
df 

Unnamed: 0,Platform,Genre,Publisher,NA_Sales
0,Wii,Sports,Nintendo,41.36
1,NES,Platform,Nintendo,29.08
2,Wii,Racing,Nintendo,15.68
3,Wii,Sports,Nintendo,15.61
4,GB,Role-Playing,Nintendo,11.27
...,...,...,...,...
16714,PS3,Action,Tecmo Koei,0.00
16715,X360,Sports,Codemasters,0.00
16716,PSV,Adventure,Idea Factory,0.00
16717,GBA,Platform,Wanadoo,0.01


In [13]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler() 

# save the features we don't want to scale
var1 = df['Genre']  
var2 = df['Platform']
var3 = df['Publisher'] 

# fit scale to data 
scaler.fit(df.drop(['Platform', 'Genre', 'Publisher'], axis = 1)) 
scaled_values = scaler.transform(df.drop(['Platform', 'Genre', 'Publisher'], axis = 1)) 
df = pd.DataFrame(scaled_values) 

# merge saved features to scaled features 
df['Genre'] = var1
df['Platform'] = var2 
df['Publisher'] = var3

print(df) 

               0         Genre Platform           Publisher
0      50.450162        Sports      Wii            Nintendo
1      35.375169      Platform      NES            Nintendo
2      18.925257        Racing      Wii            Nintendo
3      18.839325        Sports      Wii            Nintendo
4      13.511518  Role-Playing       GB            Nintendo
...          ...           ...      ...                 ...
16658  -0.323594        Action     X360  Namco Bandai Games
16659  -0.323594        Sports       PC   DTP Entertainment
16660  -0.323594        Sports       DS             Ubisoft
16661  -0.311318     Adventure      PSV           Nitroplus
16662  -0.323594           NaN      NaN                 NaN

[16663 rows x 4 columns]


In [15]:
df = df.rename(columns={0: 'NA_Sales'}) 
df

Unnamed: 0,NA_Sales,Genre,Platform,Publisher
0,50.450162,Sports,Wii,Nintendo
1,35.375169,Platform,NES,Nintendo
2,18.925257,Racing,Wii,Nintendo
3,18.839325,Sports,Wii,Nintendo
4,13.511518,Role-Playing,GB,Nintendo
...,...,...,...,...
16658,-0.323594,Action,X360,Namco Bandai Games
16659,-0.323594,Sports,PC,DTP Entertainment
16660,-0.323594,Sports,DS,Ubisoft
16661,-0.311318,Adventure,PSV,Nitroplus


In [16]:
# create dummy features 

In [18]:
X = df[['Genre', 'Platform', 'Publisher']] 
Y = df['NA_Sales'] 
X = pd.get_dummies(data=X, drop_first=True) 

In [19]:
# split data into training and testing sets 
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state = 42) 