In [1]:
# For colab
from google.colab import drive
drive.mount('/content/Mydrive')
%cd "/content/Mydrive/MyDrive/Github/KT_Devchall"

Drive already mounted at /content/Mydrive; to attempt to forcibly remount, call drive.mount("/content/Mydrive", force_remount=True).
/content/Mydrive/MyDrive/Github/KT_Devchall


# Preprocessing

In [2]:
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

set_config(display='diagram')

In [4]:
# data load
df_train = pd.read_pickle('data/train.pkl')
X = df_train.drop('Class', axis=1)
y = df_train['Class']

In [5]:
## Column Selection from catboost
feature_importance = pd.Series([3.12768611e+01, 6.62209523e+00, 1.55948023e+01, 1.05818328e+00,
       5.35072461e-02, 0.00000000e+00, 3.21271373e+00, 2.57594949e-02,
       1.17493059e+01, 9.91394069e-02, 9.48666051e+00, 9.53121732e-01,
       5.07253124e+00, 8.31159974e+00, 5.98996443e-02, 3.63612996e+00,
       2.78768950e+00], index = X.columns)
col_drop = feature_importance[feature_importance<1] 
col_drop

Platform      0.053507
OS_type       0.000000
Ex_Rate       0.025759
Country_ID    0.099139
P2            0.953122
weekend       0.059900
dtype: float64

In [6]:
X = X[X.columns.drop(col_drop.index)]

In [7]:
# num_features = ['Ex_Rate','P1','P2','P3','P4']
num_features = ['P1','P3','P4']
cat_features = X.columns.drop(num_features).tolist()

In [8]:
# Transformer
numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', sparse=False)

In [9]:
# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)

In [10]:
# Train/Val/Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y)

In [11]:
# Fit
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.fit_transform(X_val)

# Sklearn Modeling

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [13]:
sgd = SGDClassifier(n_jobs=-1, verbose=5, random_state=123, 
                      early_stopping=True, n_iter_no_change=5, max_iter=100)

In [14]:
# gridsearch hyperparameters
param_grid = { 
    'loss' : ['log_loss', 'hinge', 'modified_huber'],
    'penalty' : ['l2', 'l1']
}

In [17]:
model = GridSearchCV(sgd, param_grid, verbose=5, scoring='f1', n_jobs=1, cv=5)

## GridSearch Fitting

In [18]:
# Fit
model.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...........loss=log_loss, penalty=l2;, score=nan total time=   1.4s
[CV 2/5] END ...........loss=log_loss, penalty=l2;, score=nan total time=   0.9s
[CV 3/5] END ...........loss=log_loss, penalty=l2;, score=nan total time=   0.9s
[CV 4/5] END ...........loss=log_loss, penalty=l2;, score=nan total time=   0.9s
[CV 5/5] END ...........loss=log_loss, penalty=l2;, score=nan total time=   0.9s
[CV 1/5] END ...........loss=log_loss, penalty=l1;, score=nan total time=   0.9s
[CV 2/5] END ...........loss=log_loss, penalty=l1;, score=nan total time=   0.9s
[CV 3/5] END ...........loss=log_loss, penalty=l1;, score=nan total time=   0.9s
[CV 4/5] END ...........loss=log_loss, penalty=l1;, score=nan total time=   0.8s
[CV 5/5] END ...........loss=log_loss, penalty=l1;, score=nan total time=   0.8s
-- Epoch 1
Norm: 2.17, NNZs: 70, Bias: -1.099702, T: 4296857, Avg. loss: 0.331477
Total training time: 2.51 seconds.
-- Epoch 2
No

10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 892, in fit
    sample_weight=sample_weight,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 649, in _fit
    self._validate_params()
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_stochastic_gradient.py", line 162, in _validate_params
    raise Va

-- Epoch 1
Norm: 534.70, NNZs: 49, Bias: -6.833865, T: 5371072, Avg. loss: 0.519284
Total training time: 4.94 seconds.
-- Epoch 2
Norm: 534.72, NNZs: 53, Bias: -6.274676, T: 10742144, Avg. loss: 0.434124
Total training time: 11.61 seconds.
-- Epoch 3
Norm: 534.73, NNZs: 52, Bias: -5.920185, T: 16113216, Avg. loss: 0.432961
Total training time: 18.15 seconds.
-- Epoch 4
Norm: 534.73, NNZs: 56, Bias: -5.737385, T: 21484288, Avg. loss: 0.432460
Total training time: 24.67 seconds.
-- Epoch 5
Norm: 534.74, NNZs: 55, Bias: -5.547517, T: 26855360, Avg. loss: 0.432194
Total training time: 31.19 seconds.
-- Epoch 6
Norm: 534.74, NNZs: 56, Bias: -5.395293, T: 32226432, Avg. loss: 0.432004
Total training time: 37.69 seconds.
Convergence after 6 epochs took 39.28 seconds


In [19]:
# Best Params
model.best_params_

{'loss': 'modified_huber', 'penalty': 'l1'}

In [23]:
model.best_score_

0.6036973805411391

In [26]:
# Prediction Score
y_pred = model.predict(X_val)

from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names = ['Lose', 'Win']))

              precision    recall  f1-score   support

        Lose       0.81      0.99      0.89   1803308
         Win       0.94      0.44      0.60    754346

    accuracy                           0.83   2557654
   macro avg       0.88      0.71      0.75   2557654
weighted avg       0.85      0.83      0.80   2557654

