In [174]:
import pandas as pd
import sklearn.linear_model as skl
import sklearn.neural_network as nn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
import numpy as np

# Clean train

In [175]:
df_train = pd.read_csv("data/train.csv")

In [176]:
df_train.dropna(axis=0, inplace=True)

In [177]:
df_train['purchase_date'] = pd.to_datetime(df_train['purchase_date']).astype(int)/ 10**9
df_train['release_date'] = pd.to_datetime(df_train['release_date']).astype(int)/ 10**9

In [178]:
categories = df_train['categories'].str.get_dummies(',')
genres = df_train['genres'].str.get_dummies(',')
categories.columns = ['cat_' + str(col) for col in categories.columns]
genres.columns = ['genre_' + str(col) for col in genres.columns]

In [179]:
df_train = df_train.drop(['tags', 'is_free', 'categories', 'genres', 'purchase_date'], axis=1)

In [180]:
df_train = df_train.fillna(0)

In [181]:
df_train = df_train.join(categories).join(genres)

In [182]:
df_train.to_csv('./new_train.csv')

# Clean test

In [183]:
df_test = pd.read_csv("data/test.csv")

In [184]:
df_test['purchase_date'] = pd.to_datetime(df_test['purchase_date']).astype(int)/ 10**9
df_test['release_date'] = pd.to_datetime(df_test['release_date']).astype(int)/ 10**9

In [185]:
categories = df_test['categories'].str.get_dummies(',')
genres = df_test['genres'].str.get_dummies(',')
categories.columns = ['cat_' + str(col) for col in categories.columns]
genres.columns = ['genre_' + str(col) for col in genres.columns]

In [186]:
df_test = df_test.drop(['tags', 'is_free', 'categories', 'genres', 'purchase_date'], axis=1)

In [187]:
df_test = df_test.fillna(0)

In [188]:
df_test = df_test.join(categories).join(genres)

# Sync train and test

In [189]:
cols_to_add_to_test = []
cols_to_add_to_train = []

In [190]:
train_cat = [col for col in df_train.columns if col.startswith('cat_')]
test_cat = [col for col in df_test.columns if col.startswith('cat_')]

In [191]:
cols_to_add_to_test.extend(list(set(train_cat) - set(test_cat)))
cols_to_add_to_train.extend(list(set(test_cat) - set(train_cat)))

In [192]:
train_genre = [col for col in df_train.columns if col.startswith('genre_')]
test_genre = [col for col in df_test.columns if col.startswith('genre_')]

In [193]:
cols_to_add_to_test.extend(list(set(train_genre) - set(test_genre)))
cols_to_add_to_train.extend(list(set(test_genre) - set(train_genre)))

In [194]:
train_tag = [col for col in df_train.columns if col.startswith('tag_')]
test_tag = [col for col in df_test.columns if col.startswith('tag_')]

In [195]:
cols_to_add_to_test.extend(list(set(train_tag) - set(test_tag)))
cols_to_add_to_train.extend(list(set(test_tag) - set(train_tag)))

In [196]:
for col in cols_to_add_to_train:
    df_train[col] = 0
    

In [197]:
for col in cols_to_add_to_test:
    df_test[col] = 0

## Train linear reg

In [198]:
training_data = df_train.loc[ : int(len(df_train)*0.8), :]
testing_data = df_train.loc[int(len(df_train)*0.8) : , :]
submission_data = df_test

In [199]:
def mean_square_error(predict_y, y):
    N = predict_y.shape[0]
    return np.sum((predict_y - y) ** 2) / N

In [200]:
linear_model = skl.LinearRegression()
linear_model.fit(training_data.drop(["playtime_forever"], axis = 1), training_data["playtime_forever"])


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [201]:
predict_train = linear_model.predict(training_data.drop(["playtime_forever"], axis = 1))
predict_train = predict_train.clip(min=0)
print(mean_square_error(predict_train, training_data["playtime_forever"]))

83.49518106098313


In [202]:
predict_test = linear_model.predict(testing_data.drop(["playtime_forever"], axis = 1))
predict_test = predict_test.clip(min=0)
print(mean_square_error(predict_test, testing_data["playtime_forever"]))

110.66477847978439


In [203]:
predict_submission_linear_reg = linear_model.predict(submission_data)
predict_submission_linear_reg = predict_submission_linear_reg.clip(min=0)

In [204]:
predict_submission_linear_reg

array([ 0.        ,  0.        ,  5.32157828,  0.        ,  5.62159965,
        1.18138046, 15.87705637,  0.        ,  0.        ,  0.        ,
        5.49473624,  0.        ,  0.48587745,  9.17190904,  0.        ,
        0.        ,  5.72304947,  8.13023063,  0.        ,  0.        ,
        0.        ,  0.        ,  1.03095241,  5.80020168,  4.93379886,
        0.        ,  0.        ,  4.99343043,  2.53117607,  0.        ,
        4.33669581,  8.77304359,  0.        ,  0.        ,  0.        ,
        6.51408681,  0.        ,  9.85812202,  0.94393719,  0.        ,
        0.        ,  0.        ,  0.        ,  1.21406801,  4.74160813,
        0.        ,  2.7756235 ,  3.35828015,  0.        ,  0.        ,
       25.48602354,  8.09521951,  0.        ,  0.        ,  0.        ,
        1.72162459, 10.89134748,  0.        ,  1.958089  ,  0.        ,
        2.93345271,  0.        ,  6.22170689,  0.        ,  3.33747627,
        4.85549798,  0.        , 10.0210042 ,  0.        ,  7.62

# Train random forest

In [205]:
def rfr_model(X, y):
# Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(3,7),
            'n_estimators': (10, 50, 100, 1000),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0,                         n_jobs=-1)
    
    grid_result = gsc.fit(X, y)
    return grid_result

In [216]:
rfr = rfr_model(training_data.drop(["playtime_forever"], axis = 1), training_data["playtime_forever"])
predict_train = rfr.predict(training_data.drop(["playtime_forever"], axis = 1))
predict_train = predict_train.clip(min=0)
print(mean_square_error(predict_train, training_data["playtime_forever"]))

47.600715486796354




In [217]:
predict_test = rfr.predict(testing_data.drop(["playtime_forever"], axis = 1))
predict_test = predict_test.clip(min=0)
print(mean_square_error(predict_test, testing_data["playtime_forever"]))

67.30402578772147


In [218]:
predict_submission_random_forest = rfr.predict(submission_data)
predict_submission_random_forest = predict_submission_random_forest.clip(min=0)

In [219]:
predict_submission_random_forest

array([ 5.37564167,  6.18476719,  5.18027908,  7.3318267 , 11.94341838,
        5.18027908,  1.00921987,  1.00921987,  5.93576133,  3.35613008,
        1.1900328 ,  1.19057964,  1.00921987,  1.36058955,  7.76425268,
        1.07939267,  1.78143416,  1.07939267,  1.07939267,  1.07939267,
        1.07939267,  1.45611502,  1.26075243,  1.07939267,  1.07939267,
        1.07939267,  1.07939267,  1.07939267,  1.45611502,  1.26075243,
        1.26075243, 13.39857276,  1.07939267,  1.07939267,  1.26075243,
        1.07939267,  1.26075243,  1.96196175,  1.07939267,  1.26075243,
        1.62612493,  1.43076234,  3.43445702,  1.07939267,  1.07939267,
        1.07939267,  1.26075243,  1.26020559,  2.55858043, 18.59811194,
        1.78143416,  1.43076234,  1.26075243,  1.45556818,  1.26075243,
        1.07939267,  4.26290654,  1.07939267,  1.07939267,  1.07939267,
        1.07939267,  1.07939267,  1.07939267, 10.35387254,  1.07939267,
        1.07939267,  1.07939267,  1.78143416,  3.43445702,  1.07