In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso

In [2]:
df = pd.read_csv('../data/cleaned_video_game_sales.csv', parse_dates=['release_year'])

In [3]:
df.head()

Unnamed: 0,name,platform,release_year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count,developer,rating
0,Wii Sports,Wii,2006-01-01,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985-01-01,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008-01-01,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009-01-01,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996-01-01,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [4]:
df.rating.value_counts()

E       3991
T       2961
M       1563
E10+    1420
EC         8
K-A        3
RP         3
AO         1
Name: rating, dtype: int64

In [5]:
df.publisher.value_counts().describe()

count     581.000000
mean       28.683305
std       116.813038
min         1.000000
25%         1.000000
50%         3.000000
75%        10.000000
max      1356.000000
Name: publisher, dtype: float64

In [6]:
# No choice but to drop nulls
df.dropna(inplace=True)

Create squared and interaction features

In [8]:
# Adjust user_score so it is in the same units as critic score
df.user_score = df.user_score * 10

In [9]:
poly_features = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales', \
            'critic_score', 'critic_count', 'user_score', 'user_count']

categorical_features = ['platform', 'genre', 'rating']

get_poly_data = FunctionTransformer(lambda x: x[poly_features], validate=False)
get_cate_data = FunctionTransformer(lambda x: x[categorical_features], validate=False)

In [13]:
# Convert categorical feature columns into categorical data type
df.platform = df.platform.astype('category')
df.genre = df.genre.astype('category')
df.rating = df.rating.astype('category')

In [78]:
pipe_poly = Pipeline([
    ('features', FeatureUnion([
        ('poly_features', Pipeline([
            ('selector', get_poly_data),
            ('poly', PolynomialFeatures())
        ])),
        ('cate_features', Pipeline([
            ('selector', get_cate_data),
            ('cate', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
    ('lasso', Lasso())
])

params = {
    'features__poly_features__poly__interaction_only': [False, True],
    'lasso__alpha': [1, 5, 10],
    'lasso__max_iter': [4000, 5000]
}

In [21]:
X = df[poly_features + categorical_features]
y = df['global_sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [79]:
gs = GridSearchCV(pipe_poly, params, cv=KFold(5, shuffle=True))
gs.fit(X_train, y_train)
gs.score(X_train, y_train)

0.9918427925482408

In [80]:
gs.best_params_

{'features__poly_features__poly__interaction_only': False,
 'lasso__alpha': 1,
 'lasso__max_iter': 4000}

Use LASSO to narrow out which of these new features are noise

In [104]:
coeffs = gs.best_estimator_.named_steps['lasso'].coef_

In [88]:
gs.best_estimator_.named_steps['features'].transformer_list[0][1].named_steps['poly'].get_feature_names()

['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x0 x5',
 'x0 x6',
 'x0 x7',
 'x0 x8',
 'x1^2',
 'x1 x2',
 'x1 x3',
 'x1 x4',
 'x1 x5',
 'x1 x6',
 'x1 x7',
 'x1 x8',
 'x2^2',
 'x2 x3',
 'x2 x4',
 'x2 x5',
 'x2 x6',
 'x2 x7',
 'x2 x8',
 'x3^2',
 'x3 x4',
 'x3 x5',
 'x3 x6',
 'x3 x7',
 'x3 x8',
 'x4^2',
 'x4 x5',
 'x4 x6',
 'x4 x7',
 'x4 x8',
 'x5^2',
 'x5 x6',
 'x5 x7',
 'x5 x8',
 'x6^2',
 'x6 x7',
 'x6 x8',
 'x7^2',
 'x7 x8',
 'x8^2']

In [87]:
len(poly_features)

55

In [85]:
features = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales', \
            'critic_score', 'critic_count', 'user_score', 'user_count']

df_poly = df[features]

poly = PolynomialFeatures(include_bias=False)

df_poly = poly.fit_transform(df_poly)

pd.DataFrame(df_poly, columns=poly.get_feature_names(features)).head(1)

Unnamed: 0,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count,na_sales^2,...,critic_score^2,critic_score critic_count,critic_score user_score,critic_score user_count,critic_count^2,critic_count user_score,critic_count user_count,user_score^2,user_score user_count,user_count^2
0,41.36,28.96,3.77,8.45,82.53,76.0,51.0,80.0,322.0,1710.6496,...,5776.0,3876.0,6080.0,24472.0,2601.0,4080.0,16422.0,6400.0,25760.0,103684.0


In [114]:
poly.get_feature_names(features)

['x0_3DS',
 'x0_DC',
 'x0_DS',
 'x0_GBA',
 'x0_GC',
 'x0_PC',
 'x0_PS',
 'x0_PS2',
 'x0_PS3',
 'x0_3DS^2',
 'x0_3DS x0_DC',
 'x0_3DS x0_DS',
 'x0_3DS x0_GBA',
 'x0_3DS x0_GC',
 'x0_3DS x0_PC',
 'x0_3DS x0_PS',
 'x0_3DS x0_PS2',
 'x0_3DS x0_PS3',
 'x0_DC^2',
 'x0_DC x0_DS',
 'x0_DC x0_GBA',
 'x0_DC x0_GC',
 'x0_DC x0_PC',
 'x0_DC x0_PS',
 'x0_DC x0_PS2',
 'x0_DC x0_PS3',
 'x0_DS^2',
 'x0_DS x0_GBA',
 'x0_DS x0_GC',
 'x0_DS x0_PC',
 'x0_DS x0_PS',
 'x0_DS x0_PS2',
 'x0_DS x0_PS3',
 'x0_GBA^2',
 'x0_GBA x0_GC',
 'x0_GBA x0_PC',
 'x0_GBA x0_PS',
 'x0_GBA x0_PS2',
 'x0_GBA x0_PS3',
 'x0_GC^2',
 'x0_GC x0_PC',
 'x0_GC x0_PS',
 'x0_GC x0_PS2',
 'x0_GC x0_PS3',
 'x0_PC^2',
 'x0_PC x0_PS',
 'x0_PC x0_PS2',
 'x0_PC x0_PS3',
 'x0_PS^2',
 'x0_PS x0_PS2',
 'x0_PS x0_PS3',
 'x0_PS2^2',
 'x0_PS2 x0_PS3',
 'x0_PS3^2']

In [94]:
cate_features = gs.best_estimator_.named_steps['features'].transformer_list[1][1].named_steps['cate'].get_feature_names()

In [113]:
len(poly_features)

55

In [101]:
features = list(cate_features) + list(poly_features)

In [105]:
coef_df = pd.DataFrame({'features': features,
                       'coef': coeffs})

In [107]:
pd.set_option('display.max_rows', None)
coef_df

Unnamed: 0,features,coef
0,x0_3DS,0.0
1,x0_DC,0.0
2,x0_DS,0.0
3,x0_GBA,0.0
4,x0_GC,0.0
5,x0_PC,0.0
6,x0_PS,0.0
7,x0_PS2,0.0
8,x0_PS3,0.0
9,x0_PS4,0.0003390521


## Sources
1. https://medium.com/bigdatarepublic/integrating-pandas-and-scikit-learn-with-pipelines-f70eb6183696
2. https://blog.usejournal.com/featureunion-a-time-saver-when-building-a-machine-learning-model-d0ad7a90f215