In [1]:
# Standard Data Science Toolkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Statistics
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Machine Learning
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned sales table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='vgsales_clean.csv')
df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


Before jumping into advanced techniques, let's establish a baseline model using statsmodels OLS regression.

Because OLS only accepts numeric inputs, let's:
* Drop unnecessary columns like Rank and Name
* Encode Platform, Genre, and Publisher

We'll predict Global_Sales and drop the regional sales columns (NA, EU, JP, Other), since they add up to the global figure and would leak the target.

In [3]:
# Keep only the predictors and the target. Selecting the columns to keep
# (instead of dropping the rest) makes this cell safe to re-run: a second
# `df.drop(columns=[...])` would raise KeyError because the columns are
# already gone. It also guarantees the regional sales columns, which sum to
# Global_Sales, can never reach the model.
df = df[['Platform', 'Year', 'Genre', 'Publisher', 'Global_Sales']]
df.head()

Unnamed: 0,Platform,Year,Genre,Publisher,Global_Sales
0,Wii,2006,Sports,Nintendo,82.74
1,NES,1985,Platform,Nintendo,40.24
2,Wii,2008,Racing,Nintendo,35.82
3,Wii,2009,Sports,Nintendo,33.0
4,GB,1996,Role-Playing,Nintendo,31.37


In [4]:
# Column groups used by the preprocessing cells below.
cat_cols = [
    'Platform',
    'Genre',
    'Publisher',
]  # categorical predictors
num_cols = ['Year']  # numeric predictors

In [5]:
# One-hot encode the categorical predictors as 0/1 integers.
# `drop_first=True` omits the first level of each variable.
cat_df = df.loc[:, cat_cols]
dummies = pd.get_dummies(data=cat_df, drop_first=True, dtype=int)
dummies

Unnamed: 0,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,Platform_GEN,Platform_GG,Platform_N64,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16296,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16297,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16299,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Standardize the numeric predictors for the baseline model.
numeric = df[num_cols]
scaler = StandardScaler()

scaled_numeric = scaler.fit_transform(numeric)
# fit_transform returns a bare NumPy array, so re-attach both the column names
# and the original row index. Without `index=` the new frame would get a fresh
# 0..n-1 index, and an index-based merge with `dummies` would misalign or drop
# rows whenever `df` does not carry a default RangeIndex (e.g. after filtering).
scaled_numeric_df = pd.DataFrame(
    scaled_numeric,
    columns=numeric.columns,
    index=numeric.index,
)

In [7]:
# Design matrix: encoded categoricals combined with the standardized numeric
# column(s), matched on row index. The target is global sales.
X = dummies.merge(scaled_numeric_df, left_index=True, right_index=True)
y = df.loc[:, 'Global_Sales']

In [8]:
# Baseline: ordinary least squares. sm.OLS does not add an intercept on its
# own, so a constant column is appended to the design matrix first.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Global_Sales,R-squared:,0.14
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,4.139
Date:,"Mon, 17 Feb 2025",Prob (F-statistic):,6.360000000000001e-210
Time:,15:36:53,Log-Likelihood:,-29227.0
No. Observations:,16301,AIC:,59690.0
Df Residuals:,15683,BIC:,64450.0
Df Model:,617,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.4360,0.886,-0.492,0.623,-2.172,1.300
Platform_3DO,0.1585,1.166,0.136,0.892,-2.128,2.445
Platform_3DS,0.7381,0.259,2.855,0.004,0.231,1.245
Platform_DC,0.3208,0.300,1.070,0.284,-0.267,0.908
Platform_DS,0.6741,0.234,2.880,0.004,0.215,1.133
Platform_GB,1.1845,0.252,4.697,0.000,0.690,1.679
Platform_GBA,0.2562,0.222,1.154,0.249,-0.179,0.691
Platform_GC,0.2034,0.226,0.900,0.368,-0.240,0.646
Platform_GEN,0.6197,0.361,1.716,0.086,-0.088,1.327

0,1,2,3
Omnibus:,34424.685,Durbin-Watson:,0.288
Prob(Omnibus):,0.0,Jarque-Bera (JB):,320825575.247
Skew:,18.047,Prob(JB):,0.0
Kurtosis:,689.33,Cond. No.,1920.0


Together, the features (platform, genre, publisher, and year) account for only about 14% of the variance in Global Sales. Let's see if we can increase that using more advanced techniques.

In [9]:
# Machine Learning Preprocessing and Scoring Metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import  mean_squared_error, r2_score

# Machine Learning Algorithms
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

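For machine learning it's important to fit the scaler on the training data only and then apply that same fitted scaler to the test data, so that no information from the test set leaks into preprocessing. With that in mind, let's redefine our X data, split, then scale.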

In [10]:
# Rebuild the feature matrix from the *unscaled* numeric column(s), then hold
# out 20% of the rows for testing (fixed seed so the split is repeatable).
X = dummies.merge(df[num_cols], left_index=True, right_index=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=42,
)

In [11]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Now let's choose some models to determine how they compare to our baseline.

In [12]:
# Linear models: plain, L1-regularised and L2-regularised
linreg, lasso, ridge = LinearRegression(), Lasso(), Ridge()

# Tree-based models, seeded so repeated runs give the same fit
dtr = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
xgbr = XGBRegressor(random_state=42)

# Support vector regression with default settings
svr = SVR()

# Evaluation order used by the comparison loop
models = [linreg, lasso, ridge, dtr, rf, gbr, xgbr, svr]

In [13]:
def test_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    
    y_test_pred = model.predict(X_test)
    r2_test = r2_score(y_test, y_test_pred)

    return f'Train: {r2_train:.2f} / Test: {r2_test:.2f}'

In [14]:
# Fit and score every candidate model. Each line is labelled with the
# estimator's class name rather than its full repr: the XGBRegressor repr
# lists every constructor parameter across many lines, which floods the cell
# output and buries the scores of the models that follow it.
for model in models:
    print(f"{type(model).__name__} {test_model(model, X_train, y_train, X_test, y_test)}")

LinearRegression() Train: 0.16 / Test: -1154396084008847681781760.00
Lasso() Train: 0.00 / Test: -0.00
Ridge() Train: 0.16 / Test: 0.09
DecisionTreeRegressor(random_state=42) Train: 0.71 / Test: -0.32
RandomForestRegressor(random_state=42) Train: 0.65 / Test: -0.07
GradientBoostingRegressor(random_state=42) Train: 0.25 / Test: 0.08
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators

None of these models performs particularly well given these inputs. This suggests that more informative features are needed for accurate prediction. Still, let's see if we can improve the SVR model a bit, since it currently has the smallest gap between training and testing performance. We can target model improvement through hyperparameter tuning!

In [15]:
# Search space for SVR: kernel type, regularisation strength C and epsilon
# (2 x 2 x 2 = 8 parameter combinations).
param_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 10],
    epsilon=[0.1, 1],
)

In [16]:
# Exhaustive search over `param_grid` using 3-fold cross-validation on the
# training split.
gs = GridSearchCV(estimator=svr, param_grid=param_grid, cv=3)
gs = gs.fit(X_train, y_train)

In [17]:
# Mean cross-validated score of the best parameter combination found by the
# search (not a score on the full training set or on the test set).
gs.best_score_

0.09701340687978195

In [18]:
# Predict on the held-out test split with the tuned search object and
# report R^2 on that split.
y_pred = gs.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.04007790302293435

Unfortunately, the grid search did not meaningfully improve the R² score: the best cross-validated score on the training data was about 0.10, and the score on the test set was about 0.04. This suggests that global sales depend on factors that are not included in this dataset. Additional information such as amount spent on production, amount spent on marketing, marketing strategy, ESRB rating, release month, and whether or not the game is a sequel would be helpful in predicting global sales.