In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import mean_absolute_error,accuracy_score,classification_report,precision_recall_fscore_support, confusion_matrix,r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import Pipeline
import pickle
import seaborn as sns



## Data reading

In [2]:
data = pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')
data.head()    


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [None]:
print(data.shape)  
print(data.dtypes)

In [3]:

numerical_columns = data.select_dtypes('float64').columns
categorical_columns = data.select_dtypes('object').columns
print(numerical_columns)


Index(['Year_of_Release', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
       'Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count'],
      dtype='object')


### Üresek eltávolítása

In [4]:
cols_to_replace = ['Publisher', 'Developer','Name']
for column in cols_to_replace:
    data[column].fillna('Unknown', inplace=True)

In [5]:
#regi jatekok altalaban nem ertekeltek
data['Critic_Count'] = data['Critic_Count'].fillna(1)
data['User_Count'] = data['User_Count'].fillna(1)

In [6]:
from sklearn.impute import KNNImputer
columns_with_missing = ['User_Score', 'Critic_Score','Year_of_Release']
imputer = KNNImputer(n_neighbors=5)
data[columns_with_missing] = imputer.fit_transform(data[columns_with_missing])

In [7]:
#lehet hogy nagyon rossz ötlet
#helyette
#data.dropna(subset=['Genre'], inplace=True)
for index, row in data.iterrows():
    year = row['Year_of_Release']
    genre = row['Genre']
    if pd.isnull(row['Rating']):
        if year >= 2010 and genre in ['Action', 'Adventure']:
            data.at[index, 'Rating'] = 'T'
        elif year < 2010 and genre in ['Action', 'Adventure']:
            data.at[index, 'Rating'] = 'M'
        elif genre in ['Sports', 'Racing']:
            data.at[index, 'Rating'] = 'E'
        else:
            data.at[index, 'Rating'] = 'E10+'


In [8]:
data.dropna(subset=['Genre'], inplace=True)

In [None]:
print(data.isnull().sum())
print(data.shape)

In [9]:

data['Year_of_Release'] = data['Year_of_Release'].astype(int)
data['User_Count'] = data['User_Count'].astype(int)
data['Critic_Count'] = data['Critic_Count'].astype(int)
data['Critic_Score'] = data['Critic_Score'].astype(int)

data['User_Score'] = (data['User_Score'] * 10).astype(int)

columns_to_convert = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
data[columns_to_convert] = (data[columns_to_convert] * 100).astype(int)


In [10]:
data[numerical_columns]

Unnamed: 0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count
0,2006,4136,2896,377,844,8253,76,51,80,322
1,1985,2908,358,681,77,4024,73,1,59,1
2,2008,1568,1276,379,329,3552,82,73,83,709
3,2009,1561,1093,328,295,3277,80,73,80,192
4,1996,1127,889,1022,100,3137,91,1,84,1
...,...,...,...,...,...,...,...,...,...,...
16714,2016,0,0,1,0,1,78,1,75,1
16715,2006,0,1,0,0,1,67,1,78,1
16716,2016,0,0,1,0,1,78,1,75,1
16717,2003,1,0,0,0,1,77,1,74,1


In [None]:
genre_counts = data['Genre'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(genre_counts.index, genre_counts.values)
plt.xlabel('Genre')
plt.ylabel('Count')
plt.title('Number of Games per Genre')
plt.xticks(rotation=90)
plt.show()

In [None]:
##Name nem kell mert egyéni érték 
data = data.drop(['Name'],axis=1)


In [None]:
numerical_columns = data.select_dtypes('int32').columns
categorical_columns = data.select_dtypes('object').columns
print(numerical_columns)

## LabelEncoding, normalization, splitting

In [11]:

label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()

    encoded_values = label_encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = label_encoder
    data[column] = encoded_values
with open('label_encoders.pkl', 'wb') as file:
   pickle.dump(label_encoders, file)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

columns_to_plot = ["Platform", "Year_of_Release", "Publisher", "NA_Sales", "EU_Sales",
                   "JP_Sales", "Other_Sales", "Global_Sales", "Critic_Score",
                   "Critic_Count", "User_Score", "User_Count", "Developer", "Rating", "Genre"]

num_columns = 4
num_rows = (len(columns_to_plot) + num_columns - 1) // num_columns

fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, 20))

axes = axes.flatten()

for i, column in enumerate(columns_to_plot):
    color = plt.cm.Set1(i % 14)

    stats.probplot(data[column], plot=axes[i], dist='norm', fit=True)
    axes[i].set_title(column)
    axes[i].set_xlabel("Theoretical Quantiles")
    axes[i].set_ylabel("Ordered Values")

for j in range(len(columns_to_plot), num_rows * num_columns):
    fig.delaxes(axes[j])

plt.tight_layout()

plt.show()


In [None]:

# Create boxplots for each column
plt.figure(figsize=(12, 8))
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.title("Boxplot of Columns")
plt.xlabel("Columns")
plt.ylabel("Values")
plt.show()

# Create scatter plots for each column
plt.figure(figsize=(12, 8))
sns.scatterplot(data=data)
plt.xticks(rotation=90)
plt.title("Scatterplot of Columns")
plt.xlabel("Columns")
plt.ylabel("Values")
plt.show()

In [None]:
corr_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True) 
plt.show()                            


In [12]:
data

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,11075,26,2006,10,361,4136,2896,377,844,8253,76,51,80,322,1020,1
1,9389,11,1985,4,361,2908,358,681,77,4024,73,1,59,1,1573,2
2,5613,26,2008,6,361,1568,1276,379,329,3552,82,73,83,709,1020,1
3,11077,26,2009,10,361,1561,1093,328,295,3277,80,73,80,192,1020,1
4,7392,5,1996,7,361,1127,889,1022,100,3137,91,1,84,1,1573,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,8343,17,2016,0,504,0,0,1,0,1,78,1,75,1,1573,7
16715,5160,28,2006,10,91,0,1,0,0,1,67,1,78,1,1573,1
16716,3890,20,2016,1,233,0,0,1,0,1,78,1,75,1,1573,7
16717,9028,6,2003,4,550,1,0,0,0,1,77,1,74,1,1573,2


In [13]:
X = data.drop('Genre', axis=1)
y = data['Genre']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:

scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_test_standard = scaler.transform(X_test)

In [15]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train_robust = scaler.fit_transform(X_train)
X_test_robust = scaler.transform(X_test)

## Correlation matrix

##  normal probability plot (Q-Q plot) 

## RandomForest

In [None]:
rf_classifier = RandomForestClassifier()
#max_depth= None, min_samples_leaf= 1, min_samples_split=2, n_estimators= 300
rf_classifier.fit(X_train,y=y_train)
y_pred = rf_classifier.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')

with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
rf = RandomForestClassifier()
param_grid = {
    'n_estimators': randint(100, 1000),  
    'max_depth': randint(5, 20),         
    'min_samples_split': randint(2, 10),  
    'min_samples_leaf': randint(1, 10),  
    'max_features': ['sqrt', 'log2']      
}
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=5)
random_search.fit(X_train, y_train)

random_best_params = random_search.best_params_
random_best_score = random_search.best_score_

print(random_best_params)

In [None]:
random_rf_classifier = RandomForestClassifier(**random_best_params)
random_rf_classifier.fit(X_train, y_train)

y_pred_random = random_rf_classifier.predict(X_test)
print('Accuracy Random Search: ',accuracy_score(y_test, y_pred_random))

## SVM

In [None]:
svm = SVC()
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')

with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm, f)

In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 1, 10]
}
svm_classifier = SVC()
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:

svm = SVC(**best_params)
svm.fit(X_train, y_train)


y_pred = svm.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print('Accuracy: ',accuracy_score(y_test,y_pred))

## LogisticRegression 

In [None]:
X_lr = X
X_train, X_test, y_train, y_test = train_test_split(X_lr, y, test_size=0.2, random_state=420)
lr = LogisticRegression()
lr.fit(X_train,y_train)

st_x= StandardScaler()  
X_train= st_x.fit_transform(X_train)  
X_test= st_x.transform(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')

with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(lr, f)


In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

logistic_classifier = LogisticRegression()

grid_search = GridSearchCV(logistic_classifier, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
best_logistic_classifier = LogisticRegression(**best_params)
best_logistic_classifier.fit(X_train, y_train)

y_pred = best_logistic_classifier.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

## Voting

In [None]:

rf = RandomForestClassifier(max_depth= None, min_samples_leaf= 1, min_samples_split=2, n_estimators= 300)
lr = LogisticRegression(C= 10, penalty= 'l2',solver= 'liblinear')
svc = SVC(C= 10, gamma= 0.1, kernel= 'rbf')



voting_classifier = VotingClassifier(
    estimators=[('svc', svc), ('rf', rf), ('lr', lr)],
    voting='hard'
)

voting_classifier.fit(X_train, y_train)

y_pred = voting_classifier.predict(X_test)
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')

with open('voting_classifier.pkl', 'wb') as f:
    pickle.dump(voting_classifier, f)

## Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

rf = RandomForestClassifier(max_depth= None, min_samples_leaf= 1, min_samples_split=2, n_estimators= 300)
lr = LogisticRegression(C= 10, penalty= 'l2',solver= 'liblinear',max_iter=1000)
svc = SVC(C= 10, gamma= 0.1, kernel= 'rbf',max_iter=1000)
fnn = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', max_iter=1000)

stacking_classifier = StackingClassifier(
    estimators=[('svc', svc), ('rf', rf), ('lr', lr)],
    final_estimator=fnn
)

stacking_classifier.fit(X_train, y_train)

y_pred = stacking_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

with open('stacking_model.pkl', 'wb') as f:
    pickle.dump(stacking_classifier, f)

## Saját modell

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam

In [39]:
X_train_=X_train_robust
X_test_=X_test_robust

model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(X_train_.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(y.unique()), activation='softmax'))

learning_rate = 0.0001 
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])



early_stopping = EarlyStopping(patience=5)
history = model.fit(X_train_, y_train, validation_data=(X_test_, y_test),
                    batch_size=64, epochs=100, callbacks=[early_stopping])


test_loss, test_accuracy = model.evaluate(X_test_, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78