# Model training, evaluation & exporting

In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, Lars, LassoLars, BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm, neighbors, tree

In [160]:
# Load the prepared dataset produced by the earlier pipeline step.
df = pd.read_csv(filepath_or_buffer='outputs/main_dataset.csv')

In [161]:
# Separate the target column ('avg') from the predictors, then hold out 33% of
# the rows for testing. 'Unnamed: 0' is dropped as well -- presumably a saved
# index column from the CSV export (TODO confirm).
y_df = df['avg']
x_df = df.drop(columns=['avg', 'Unnamed: 0'])
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.33,
    random_state=42,  # fixed seed so the split is reproducible
)

## Experiment with different regression models

### Linear Regression

In [162]:
# Plain linear regression fitted on the training split.
reg = LinearRegression()
reg.fit(x_train, y_train)

In [163]:
# Sanity check: run the fitted linear model on one hand-made sample.
test_data = {
    'gamename': [5],
    'publisher': [10],
    'developer': [3],
    'tag_common': [20],
    'multi': [1],
    'genre_common': [15],
    'plat_count': [2],
}

# Column order follows the insertion order of the dict keys above.
test_df = pd.DataFrame(test_data, columns=list(test_data))
test_predict = reg.predict(test_df)

In [164]:
# Score the linear regression model on the held-out test split.
score = reg.score(X=x_test, y=y_test)

In [165]:
# # plot result of linear regression model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, reg.predict(x_df))
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Linear Regression)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Polynomial Regression

In [166]:
# Fit the polynomial regression model to the training split.
#
# PolynomialFeatures is fitted exactly once, on the raw training features.
# The previous extra `poly_reg.fit(x_poly, y_train)` call re-fitted the
# transformer on its own (already expanded) output, leaving it configured for
# the expanded feature count instead of the raw one; it has been removed.
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(x_train)

# Ordinary linear regression on the degree-2 expanded features.
lin_reg2 = LinearRegression()
poreg = lin_reg2.fit(x_poly, y_train)

In [167]:
# Expand the test features with the same degree-2 transformer, then score the
# polynomial model on the held-out test split.
x_test_poly = poly_reg.fit_transform(x_test)
score = poreg.score(x_test_poly, y_test)

In [168]:
# # view result of poly regression model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, poreg.predict(poly_reg.fit_transform(x_df)))
# # plt.plot(x_df.publisher, lin_reg2.predict(poly_reg.fit_transform(x_df)), color='blue')
# plt.title('Average no. players predicted to play the game (Polynomial Regression)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()


### Ridge Regression

In [169]:
# Ridge regression (alpha=1.0) fitted on the training split.
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)

In [170]:
# Score the ridge model on the held-out test split.
score = ridge.score(X=x_test, y=y_test)

In [171]:
# # plot result of ridge model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, ridge.predict(x_df))
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Ridge)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Lasso

In [172]:
# Lasso regression (alpha=0.1) fitted on the training split.
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)

In [173]:
# Score the lasso model on the held-out test split.
score = lasso.score(X=x_test, y=y_test)

-0.0020072349232092

In [174]:
# # plot result of lasso model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, lasso.predict(x_df))
# # plt.scatter(x_df['publisher'] , lasso.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Lasso)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Lars

In [175]:
# Least-angle regression limited to a single non-zero coefficient,
# fitted on the training split.
lars = Lars(n_nonzero_coefs=1)
lars.fit(x_train, y_train)

In [176]:
# Score the Lars model on the held-out test split.
score = lars.score(X=x_test, y=y_test)

In [177]:
# # plot result of lars model
# plt.scatter(x_df.publisher, y_df, color='red', s=1)
# sns.regplot(x_df.publisher, lars.predict(x_df), scatter_kws={'s':1})
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Lars)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### LassoLars

In [178]:
# LassoLars (alpha=0.1) fitted on the training split.
larso = LassoLars(alpha=0.1)
larso.fit(x_train, y_train)

In [179]:
# Score the LassoLars model on the held-out test split.
score = larso.score(X=x_test, y=y_test)

In [180]:
# # plot result of lassolars model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, larso.predict(x_df))
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (LassoLars)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Bayesian Ridge

In [181]:
# Bayesian ridge regression with default settings, fitted on the training split.
bridge = BayesianRidge()
bridge.fit(x_train, y_train)

In [182]:
# Score the Bayesian ridge model on the held-out test split.
score = bridge.score(X=x_test, y=y_test)

In [183]:
# # plot result of bayesian ridge model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, bridge.predict(x_df))
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Bayesian Ridge)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Support Vector Regression

In [184]:
# Support vector regressor with default settings, fitted on the training split.
vegr = svm.SVR()
vegr.fit(x_train, y_train)

In [185]:
# Score the SVR model on the held-out test split.
score = vegr.score(X=x_test, y=y_test)

In [186]:
# # plot result of SVR model
# plt.scatter(x_df.publisher, y_df, color='red')
# sns.regplot(x_df.publisher, vegr.predict(x_df))
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (SVR)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Nearest Neighbors

In [187]:
# 5-nearest-neighbours regressor with distance-weighted votes,
# fitted on the training split.
knn = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn.fit(x_train, y_train)

In [188]:
# Score the nearest-neighbours model on the held-out test split.
score = knn.score(X=x_test, y=y_test)

0.12876929149896832

In [189]:
# # plot result of nearest neighbors model
# plt.scatter(x_df.publisher, y_df, color='red', s=1)
# sns.regplot(x_df.publisher, knn.predict(x_df), scatter_kws={'s':1})
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Nearest Neighbors)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### Decision Tree

In [190]:
# Decision tree regressor fitted on the training split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits are otherwise chosen at random and the
# fitted tree (and its score) can differ between runs. 42 matches the seed used
# for the train/test split.
treel = tree.DecisionTreeRegressor(random_state=42).fit(x_train, y_train)

In [191]:
# Score the decision tree model on the held-out test split.
score = treel.score(X=x_test, y=y_test)

-2.0471544903983747

In [192]:
# # plot result of decision tree model
# plt.scatter(x_df.publisher, y_df, color='red', s=1)
# sns.regplot(x_df.publisher, treel.predict(x_df), scatter_kws={'s':1})
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (Decision Tree)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

### MultiLayer Perceptron 

In [193]:
# Multi-layer perceptron regressor: seeded for reproducibility and capped at
# 500 training iterations, fitted on the training split.
mlpr = MLPRegressor(random_state=123, max_iter=500)
mlpr.fit(x_train, y_train)



In [194]:
# Score the MLP model on the held-out test split.
score = mlpr.score(X=x_test, y=y_test)

In [195]:
# # plot result of MLPR model
# plt.scatter(x_df.publisher, y_df, color='red', s=1)
# sns.regplot(x_df.publisher, mlpr.predict(x_df), scatter_kws={'s':1})
# # plt.scatter(x_df['publisher'] , reg.predict(x_df), color='blue')
# plt.title('Average no. players predicted to play the game (MLPR)')
# plt.xlabel('Number of games published by the publisher')
# plt.ylabel('Average no. players')
# plt.show()

## Export model to deploy
Based on the R² scores on the test set, the Lars model performs best, so it is the one exported.

In [196]:
# Persist the fitted Lars model to disk as a pickle for deployment.
# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(lars, "outputs/reg_model.pkl")

['outputs/reg_model.pkl']