In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso

In [2]:
df = pd.read_csv('../data/cleaned_video_game_sales.csv', parse_dates=['release_year'])

In [3]:
df.head()

Unnamed: 0,name,platform,release_year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count,developer,rating
0,Wii Sports,Wii,2006-01-01,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985-01-01,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008-01-01,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009-01-01,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996-01-01,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [4]:
df.rating.value_counts()

E       3991
T       2961
M       1563
E10+    1420
EC         8
K-A        3
RP         3
AO         1
Name: rating, dtype: int64

In [5]:
df.publisher.value_counts().describe()

count     581.000000
mean       28.683305
std       116.813038
min         1.000000
25%         1.000000
50%         3.000000
75%        10.000000
max      1356.000000
Name: publisher, dtype: float64

In [6]:
# No choice but to drop nulls
df.dropna(inplace=True)

Create squared and interaction features

In [8]:
# Adjust user_score so it is in the same units as critic score
df.user_score = df.user_score * 10

In [9]:
poly_features = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales', \
            'critic_score', 'critic_count', 'user_score', 'user_count']

categorical_features = ['platform', 'genre', 'rating']

get_poly_data = FunctionTransformer(lambda x: x[poly_features], validate=False)
get_cate_data = FunctionTransformer(lambda x: x[categorical_features], validate=False)

In [13]:
# Convert categorical feature columns into categorical data type
df.platform = df.platform.astype('category')
df.genre = df.genre.astype('category')
df.rating = df.rating.astype('category')

In [42]:
pipe_poly = Pipeline([
    ('features', FeatureUnion([
        ('poly_features', Pipeline([
            ('selector', get_poly_data),
            ('poly', PolynomialFeatures(include_bias=False))
        ])),
        ('cate_features', Pipeline([
            ('selector', get_cate_data),
            ('cate', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
    ('lasso', Lasso())
])

params = {
    'lasso__alpha': [1, 5, 10],
    'lasso__max_iter': [4000, 5000]
}

In [21]:
X = df[poly_features + categorical_features]
y = df['global_sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

In [44]:
gs.best_params_

{'lasso__alpha': 1, 'lasso__max_iter': 4000}

In [43]:
gs = GridSearchCV(pipe_poly, params, cv=KFold(5, shuffle=True))
gs.fit(X_train, y_train)
gs.score(X_train, y_train)

0.9918427925482408

In [None]:
features = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales', \
            'critic_score', 'critic_count', 'user_score', 'user_count']

df_poly = df[features]

poly = PolynomialFeatures(include_bias=False)

df_poly = poly.fit_transform(df_poly)

poly_data = pd.DataFrame(df_poly, columns=poly.get_feature_names(features))

In [None]:
poly_data.shape

Use LASSO to narrow out which of these new features are noise

In [None]:
lasso = LogisticRegression()

Dummify categorical variables

Use ridge/lasso regression to find which of the features are most useful

Use advanced pipeline to use both categorical and numeric features in a model

## Sources
1. https://medium.com/bigdatarepublic/integrating-pandas-and-scikit-learn-with-pipelines-f70eb6183696