In [6]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('D:/PhD VIII/CSE 6242/Proj_6242/CSE-6242-Project/DB/playtime_pred/playtime_ML.csv')

In [8]:
df.drop(columns=['country_code'], inplace=True)

In [4]:
df = df[['Genre', 'ESRB_Rating', 'Platform', 'price', 'NA_playtime']]

Process dataframe

In [None]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

Filling na values

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(X[:, 3:4])
X[:, 3:4] = imputer.transform(X[:, 3:4])

In [None]:
print(X)

Encoding categorical features

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, 2])], remainder='passthrough')
X = ct.fit_transform(X).toarray()

Split training and testing sets

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Train Models

Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor(random_state=0)
dt_regressor.fit(X, y)

In [None]:
y_pred = dt_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
r2_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import cross_val_score
accurracies = cross_val_score(estimator=dt_regressor, X=X_train, y=y_train, cv=10)
print("Accurracy: {:.2f} %".format(accurracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accurracies.std()*100))

Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
rf_regressor.fit(X, y)

In [None]:
y_pred = rf_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

In [None]:
r2_score(y_test, y_pred)

In [None]:
accurracies = cross_val_score(estimator=rf_regressor, X=X_train, y=y_train, cv=10)
print("Accurracy: {:.2f} %".format(accurracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accurracies.std()*100))

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': [2, 4, 8, 16, 32, 64, 128, 256, 512]}]
grid_search = GridSearchCV(estimator=rf_regressor,
                           param_grid=parameters,
                           scoring='r2',
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

XGBoost

In [None]:
from xgboost import XGBRFRegressor
xg_regressor = XGBRFRegressor()
xg_regressor.fit(X_train, y_train)

In [None]:
y_pred = xg_regressor.predict(X_test)
np.set_printoptions(precision=2)

In [None]:
y_pred = xg_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

In [None]:
r2_score(y_test, y_pred)

In [None]:
accurracies = cross_val_score(estimator=xg_regressor, X=X_train, y=y_train, cv=10)
print("Accurracy: {:.2f} %".format(accurracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accurracies.std()*100))

Make prediction for any game design

In [None]:
genre = 'Action-Adventure'
rating = 'M'
platform = 'X360'
price = 1999.0
design = [[genre, rating, platform, price]]
X_ds = ct.transform(design).toarray()

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=64, random_state=0)
rf_regressor.fit(X, y)

Save the model to disk

In [None]:
import pickle
filename = 'D:/PhD VIII/CSE 6242/Proj_6242/CSE-6242-Project/DB/playtime_pred/NA_playtime.sav'
pickle.dump(rf_regressor, open(filename, 'wb'))

Load and reuse the model for prediction

In [18]:
# load the model from disk
import pickle
filename = 'D:/PhD VIII/CSE 6242/Proj_6242/CSE-6242-Project/DB/playtime_pred/NA_playtime.sav'
loaded_model = pickle.load(open(filename, 'rb'))


import pandas as pd
import numpy as np
df = pd.read_csv('D:/PhD VIII/CSE 6242/Proj_6242/CSE-6242-Project/DB/playtime_pred/playtime_ML.csv')
df.drop(columns=['country_code'], inplace=True)
df = df[['Genre', 'ESRB_Rating', 'Platform', 'price', 'NA_playtime']]
X = df.iloc[:, :-1].values


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, 2])], remainder='passthrough')


genre = 'Action'
rating = 'E10'
platform = 'PC'
price = 0.0


ct.fit(X)
design = [[genre, rating, platform, price]]
X_ds = ct.transform(design).toarray()
y_pred = loaded_model.predict(X_ds)

print("The playtime for genre {}, rating {}, platform {}, and price {} in North America is: {}".format(genre, rating, platform, price, y_pred[0]))

The playtime for genre Action, rating E10, platform PC, and price 0.0 in North America is: 3.032075195716897


In [14]:
df = df[['Genre', 'ESRB_Rating', 'Platform', 'price', 'NA_playtime', 'EU_playtime', 'JP_playtime', 'Other_playtime']]

KeyError: "['JP_playtime', 'EU_playtime', 'Other_playtime'] not in index"

In [15]:
df.groupby(['Genre', 'Platform', 'ESRB_Rating', 'price']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NA_playtime
Genre,Platform,ESRB_Rating,price,Unnamed: 4_level_1
Action,3DS,T,999.0,12.188732
Action,Linux,E10,0.0,2.715385
Action,NGage,T,199.0,4.897143
Action,OSX,E10,0.0,2.715385
Action,PC,E10,0.0,2.715385
Action,PC,M,999.0,1.055556
Action,PC,M,1499.0,1.478431
Action,PC,M,1999.0,0.333333
Action,PC,M,2499.0,4.562500
Action,PC,M,2998.0,0.908333
