In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')

In [45]:
data = pd.read_csv("vgsales.csv")
data.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.54,76.0,51.0,8.0,324.0,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.8,3.79,3.29,35.57,82.0,73.0,8.3,712.0,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.95,3.28,2.95,32.78,80.0,73.0,8.0,193.0,E
4,Pokemon Red/Pokemon Blue,G,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,


In [46]:
data.nunique()

Name               12080
Platform              31
Year_of_Release       42
Genre                 12
Publisher            627
NA_Sales             399
EU_Sales             306
JP_Sales             245
Other_Sales          157
Global_Sales         627
Critic_Score          82
Critic_Count         106
User_Score            95
User_Count           903
Rating                 8
dtype: int64

In [47]:
data.isnull().sum()

Name                  0
Platform              0
Year_of_Release       8
Genre                 0
Publisher             1
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       9080
Critic_Count       9080
User_Score         9618
User_Count         9618
Rating             7164
dtype: int64

In [48]:
data.dropna(inplace=True)

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7112 entries, 0 to 17407
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             7112 non-null   object 
 1   Platform         7112 non-null   object 
 2   Year_of_Release  7112 non-null   float64
 3   Genre            7112 non-null   object 
 4   Publisher        7112 non-null   object 
 5   NA_Sales         7112 non-null   float64
 6   EU_Sales         7112 non-null   float64
 7   JP_Sales         7112 non-null   float64
 8   Other_Sales      7112 non-null   float64
 9   Global_Sales     7112 non-null   float64
 10  Critic_Score     7112 non-null   float64
 11  Critic_Count     7112 non-null   float64
 12  User_Score       7112 non-null   float64
 13  User_Count       7112 non-null   float64
 14  Rating           7112 non-null   object 
dtypes: float64(10), object(5)
memory usage: 889.0+ KB


In [50]:
counts = data['Publisher'].value_counts()
data['Publisher'] = data['Publisher'].apply(lambda x: 'Small Publisher' if counts[x] < 50 else x)

In [51]:
data['Rating'].value_counts()

T       2489
E       2162
M       1489
E10+     968
RP         2
AO         1
K-A        1
Name: Rating, dtype: int64

In [52]:
data.drop(data.index[data['Rating'] == 'RP'], inplace=True)
data.drop(data.index[data['Rating'] == 'AO'], inplace=True)
data.drop(data.index[data['Rating'] == 'K-A'], inplace=True)

In [53]:
data = data.drop('NA_Sales', axis=1)
data = data.drop('EU_Sales', axis=1)
data = data.drop('JP_Sales', axis=1)
data = data.drop('Other_Sales', axis=1)
data = data.drop('Critic_Count', axis = 1)
data = data.drop('User_Count', axis = 1)

In [54]:
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

In [55]:
train=train.drop('Name',1)

In [56]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [57]:
import catboost as cat
cat_feat = ['Platform', 'Genre', 'Publisher','Rating']
features = list(set(train.columns)-set(['Global_Sales']))
target = 'Global_Sales'

In [58]:
model = cat.CatBoostRegressor(random_state=100,cat_features=cat_feat,verbose=0)
model.fit(train[features],train[target])

<catboost.core.CatBoostRegressor at 0x7fa4cc8ab580>

In [59]:
y_true= pd.DataFrame(data=test[target], columns=['Global_Sales'])

In [60]:
test_temp = test.drop(columns=[target])

In [61]:
y_pred = model.predict(test_temp[features])

In [62]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_true, y_pred))
print(rmse)

1.234242274522487


In [63]:
import pickle
filename = 'game_model.sav'

In [64]:
pickle.dump(model, open(filename, 'wb'))

In [65]:
loaded_model = pickle.load(open(filename, 'rb'))

In [66]:
test_temp[features].head()

Unnamed: 0,Year_of_Release,Genre,Critic_Score,Publisher,Platform,User_Score,Rating
9392,2014.0,Action,67.0,Namco Bandai Games,PSV,7.8,T
8118,2016.0,Action,86.0,Bethesda Softworks,PC,6.2,M
8248,2005.0,Action,75.0,Midway Games,PS2,8.0,M
9026,2011.0,Action,83.0,Small Publisher,PC,8.5,T
6688,2015.0,Racing,64.0,Codemasters,XOne,5.3,E


In [67]:
loaded_model.predict(test_temp[features].head()) # predicted the global sales in millions

array([0.09520664, 1.39296105, 0.63691259, 0.08911364, 0.15098226])