In [8]:
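# Optional one-off setup: build LightGBM from source with CUDA support (kept commented out).
# Directory changes use the %cd magic; `!cd` runs in a throwaway subshell and does not persist.
# !git clone --recursive https://github.com/microsoft/LightGBM
# %cd LightGBM
# !mkdir build
# %cd build
# !cmake -DUSE_CUDA=1 ..
# !make -j4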

In [13]:
import os
import pickle
import wandb
import warnings
# from utility import *

# Execution-environment switch; the cells below branch on 'local' vs. anything else.
environment = 'local'
if environment == 'paperspace':
    # On Paperspace, work from the scripts folder.
    os.chdir('/notebooks/Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, SplineTransformer, KBinsDiscretizer, \
     StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, r_regression, mutual_info_regression, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, get_scorer_names, accuracy_score, f1_score, precision_score, \
     confusion_matrix, recall_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, cross_validate, TimeSeriesSplit
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor, LocalOutlierFactor
import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from hyperparams import *

# Single seed reused by the train/test split, the CV splitter, the estimator and the search below.
RANDOM_STATE = 42
# NOTE(review): this silences every warning category for the whole session, not just noisy ones.
warnings.filterwarnings('ignore')
# Authenticate with Weights & Biases before a run is started.
wandb.login()



True

In [2]:
# Start the W&B run that the summary metrics further down are written to.
run = wandb.init(
    project="Dream11",
    entity=None,
    job_type="modeling",
    # notes = "setting benchmark using a Naive Classifier",
    notes="Modelling the Dream11 dataset (~40 games) with LGBMClassifier (7 classes) with feature embeddings",
    tags=[
        "embeddings",
        "multiclass_classification",
        "imbalanced_data",
        "random_search",
        "LGBMClassifier",
        "StratifiedKFold",
    ],
)

In [3]:
# Load the embedding-feature dataset; only the file location differs between environments.
# The raw `main.csv` alternative is kept commented out in both branches so that the two
# environments load the same dataset (previously the non-local branch read the embedding
# file and then immediately overwrote `train` with main.csv).
if environment == 'local':
    # train = pd.read_csv('../Inputs/ball-by-ball prediction/main.csv')
    train = pd.read_csv('../Inputs/ball-by-ball prediction/embfeats10K.csv')
else:
    # train = pd.read_csv('main.csv')
    train = pd.read_csv('embfeats10K.csv')

In [4]:
def get_train_test_split(df, target = 'target', test_size=0.1, shuffle=True):
    """Split a frame into train/test features and label-encoded targets.

    Parameters
    ----------
    df : DataFrame
        Holds the feature columns plus the ``target`` column.
    target : str, default 'target'
        Name of the label column. It is dropped from the features and its
        values are converted to integer codes with ``LabelEncoder``.
    test_size : float or int, default 0.1
        Forwarded unchanged to ``train_test_split``.
    shuffle : bool, default True
        Forwarded to ``train_test_split``. The default reproduces the previous
        hard-coded behaviour; pass ``False`` for an order-preserving split
        (e.g. for a time-ordered evaluation). ``RANDOM_STATE`` only has an
        effect when shuffling is enabled.

    Returns
    -------
    X_train, X_test, y_train, y_test
        In the order returned by ``train_test_split``.
    """
    # NOTE: the encoder is local to this call, so the integer codes cannot be
    # mapped back to the original labels by the caller.
    le = LabelEncoder()
    X, y = df.drop(target, axis=1), le.fit_transform(df[target])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=shuffle, random_state=RANDOM_STATE
    )
    return X_train, X_test, y_train, y_test

In [5]:
# Hold-out split with the function defaults (target column 'target', test_size=0.1).
X_train, X_test, y_train, y_test = get_train_test_split(train)

In [6]:
# Group columns by dtype: object columns are treated as categorical, everything else as numeric.
cat_features = X_train.select_dtypes(include=['object']).columns
num_features = X_train.select_dtypes(exclude=['object']).columns

In [7]:
# Display both column groups to sanity-check the dtype-based grouping.
cat_features, num_features

(Index([], dtype='object'),
 Index(['embfeat_1', 'embfeat_2', 'embfeat_3', 'embfeat_4', 'embfeat_5',
        'embfeat_6', 'embfeat_7', 'embfeat_8', 'embfeat_9', 'embfeat_10',
        ...
        'embfeat_1527', 'embfeat_1528', 'embfeat_1529', 'embfeat_1530',
        'embfeat_1531', 'embfeat_1532', 'embfeat_1533', 'embfeat_1534',
        'embfeat_1535', 'embfeat_1536'],
       dtype='object', length=1536))

In [30]:
# Numeric branch: the only active step is feature selection driven by a Lasso model
# with threshold='median'; the commented-out steps are earlier experiments.
numeric_transformer = Pipeline([
      # ('poly_feats', PolynomialFeatures(degree=2)),
      # ('b_splines', SplineTransformer()),
      # ('scaler', StandardScaler()),
    #   ('bin', KBinsDiscretizer(encode='ordinal')), #only improved Lars
      ('select_feats', SelectFromModel(lm.Lasso(random_state=RANDOM_STATE), threshold='median'))
])
# Categorical branch: ordinal-encode, with categories not seen during fit encoded as -1.
categorical_transformer = Pipeline([
      ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [31]:
# Column-wise preprocessing; only the numeric branch is enabled.
# NOTE(review): the model pipeline built later has its 'prep' step commented out,
# so this object is currently defined but not used -- confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # ('new_feats', CustomFeatureTransformer(), num_features),
        ('num', numeric_transformer, num_features),
        # ('cat', categorical_transformer, cat_features)
    ]
)

In [32]:
# model = lm.LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=5, n_jobs=-1)
# model.fit(X_train, y_train)

In [33]:
#create a naive classifier
class NaiveClassifier(BaseEstimator):
    """Baseline that guesses labels at random according to the training class frequencies.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed for the sampling done in ``predict``. ``None`` keeps the original
        behaviour of drawing from NumPy's global random state (so results vary
        between runs unless that global state is seeded); an integer makes
        every ``predict`` call reproducible.

    Attributes set by ``fit``
    -------------------------
    classes_ : unique labels found in ``y``.
    counts_ : number of occurrences of each entry of ``classes_``.
    prior_ : ``counts_ / len(y)``, used as the sampling probabilities.
    """

    def __init__(self, random_state=None):
        # Stored under the same name as the argument so BaseEstimator's
        # get_params/set_params/clone can see it.
        self.random_state = random_state

    def fit(self, X, y):
        """Record the class labels and their empirical frequencies; ``X`` is not used."""
        self.classes_, self.counts_ = np.unique(y, return_counts=True)
        self.prior_ = self.counts_ / len(y)
        return self

    def predict(self, X):
        """Draw one label per row of ``X`` from the fitted class prior."""
        if self.random_state is None:
            # Unseeded: fall back to the module-level generator, as before.
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        return rng.choice(self.classes_, size=len(X), p=self.prior_)

    def score(self, X, y):
        """Accuracy of a fresh call to ``predict`` on ``X`` measured against ``y``."""
        return accuracy_score(y, self.predict(X))

In [34]:
# pipe_params={
#     'clf__n_estimators': np.linspace(200, 1000, 100, dtype=np.int16),
#     'clf__max_depth': np.linspace(2, 50, 10, dtype=np.int16),
#     'clf__min_samples_split': np.linspace(2, 20, 5, dtype=np.int16),
#     'clf__min_samples_leaf': np.linspace(2, 10, 5, dtype=np.int16),
#     'clf__max_features': np.linspace(0.1, 1, 10, dtype=np.float16),
#     # 'clf__learning_rate': np.linspace(0.01, 1, 50, dtype=np.float16),
#     'clf__criterion': ['gini', 'entropy', 'log_loss'],
#     # 'clf__bootstrap': [True, False],
#     # 'clf__loss': ['log_loss', 'exponential'],
#     'clf__max_samples': np.linspace(0.1, 1.0, 10, dtype=np.float16),
#     'clf__ccp_alpha': np.linspace(0.0, 5.0, 20, dtype=np.float16),
#     'clf__warm_start': [True, False],
#     # 'clf__n_iter_no_change': np.linspace(1, 10, 10, dtype=np.int16),
#     # 'clf__min_impurity_decrease': np.linspace(0.0001, 10.0, 10, dtype=np.float16),
# }

# params={
#     'n_estimators': np.linspace(200, 1000, 100, dtype=np.int16),
#     'max_depth': np.linspace(2, 50, 10, dtype=np.int16),
#     'min_samples_split': np.linspace(2, 20, 5, dtype=np.int16),
#     'min_samples_leaf': np.linspace(2, 10, 5, dtype=np.int16),
#     'max_features': np.linspace(0.1, 1, 10, dtype=np.float16),
#     # 'clf__learning_rate': np.linspace(0.01, 1, 50, dtype=np.float16),
#     'criterion': ['gini', 'entropy', 'log_loss'],
#     # 'clf__bootstrap': [True, False],
#     # 'clf__loss': ['log_loss', 'exponential'],
#     'max_samples': np.linspace(0.1, 1.0, 10, dtype=np.float16),
#     'ccp_alpha': np.linspace(0.0, 5.0, 20, dtype=np.float16),
#     'warm_start': [True, False],
#     # 'clf__n_iter_no_change': np.linspace(1, 10, 10, dtype=np.int16),
#     # 'clf__min_impurity_decrease': np.linspace(0.0001, 10.0, 10, dtype=np.float16),
# }

# lgbm_params={
#     'clf__n_estimators': np.linspace(10, 200, 20, dtype=np.int16),
#     'clf__max_depth': np.linspace(20, 100, 20, dtype=np.int16),
#     'clf__num_leaves': np.linspace(30, 50, 10, dtype=np.int16),
#     'clf__learning_rate': np.linspace(0.01, 0.5, 10, dtype=np.float16),
#     'clf__subsample': np.linspace(0.8, 1.0, 10, dtype=np.float16),
#     'clf__colsample_bytree': np.linspace(0.8, 1.0, 10, dtype=np.float16),
#     'clf__reg_alpha': np.linspace(0.0, 0.5, 10, dtype=np.float16),
#     'clf__reg_lambda': np.linspace(0.0, 0.5, 10, dtype=np.float16),
#     'clf__min_child_samples': np.linspace(20, 100, 10, dtype=np.int16),
#     'clf__min_child_weight': np.linspace(0.001, 0.1, 10, dtype=np.float16),
#     'clf__min_split_gain': np.linspace(0.0, 1.0, 10, dtype=np.float16),
#     'clf__subsample_freq': np.linspace(0, 10, 10, dtype=np.int16),
#     'clf__max_bin': np.linspace(400, 600, 10, dtype=np.int16),
#     'clf__boosting_type': ['gbdt', 'dart', 'rf', 'goss'],
#     'clf__boost_from_average': [True, False]
# }

In [8]:
# Model pipeline for the randomized search: a single LGBMClassifier step named 'clf'
# (the preprocessing step is disabled, so the raw features go straight to the model).
pipe = Pipeline([
    # ('prep', preprocessor),
    ('clf', LGBMClassifier(random_state=RANDOM_STATE))
])
# List the tunable parameter names; search-space keys must use the 'clf__' prefix shown here.
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'clf', 'clf__boosting_type', 'clf__class_weight', 'clf__colsample_bytree', 'clf__importance_type', 'clf__learning_rate', 'clf__max_depth', 'clf__min_child_samples', 'clf__min_child_weight', 'clf__min_split_gain', 'clf__n_estimators', 'clf__n_jobs', 'clf__num_leaves', 'clf__objective', 'clf__random_state', 'clf__reg_alpha', 'clf__reg_lambda', 'clf__silent', 'clf__subsample', 'clf__subsample_for_bin', 'clf__subsample_freq'])

In [16]:
# Class name of the pipeline's 'clf' step (shown as the cell output).
pipe['clf'].__class__.__name__
# pipe['clf']

'LGBMClassifier'

In [10]:
# model = RandomForestClassifier(bootstrap=True, n_jobs=-1,random_state=420)
# Label used as the suffix of the W&B summary keys; taken from the pipeline so it
# cannot drift from the estimator that is actually being tuned.
model = pipe['clf'].__class__.__name__
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
# Pass the splitter itself rather than cv.split(...): the latter is a one-shot generator
# that is exhausted after the first fit, whereas the splitter regenerates the same
# seeded folds every time the search is fitted.
# NOTE(review): n_iter=1 samples a single candidate from the search space -- raise it
# for an actual random search.
rs = RandomizedSearchCV(
    pipe,
    appendclf(lgbm_params),
    n_iter=1,
    n_jobs=-1,
    cv=cv,
    scoring='f1_weighted',
    random_state=RANDOM_STATE,
)
rs.fit(X_train, y_train)

In [12]:
# Best mean cross-validated score (scoring='f1_weighted') and the parameter set that achieved it.
rs.best_score_, rs.best_params_

(0.33319195695951676,
 {'clf__subsample_freq': 5,
  'clf__subsample': 0.8667,
  'clf__reg_lambda': 0.6665,
  'clf__reg_alpha': 0.1111,
  'clf__num_leaves': 7,
  'clf__n_estimators': 535,
  'clf__min_split_gain': 0.8887,
  'clf__min_child_weight': 0.001,
  'clf__min_child_samples': 91,
  'clf__max_depth': 7,
  'clf__max_bin': 577,
  'clf__learning_rate': 0.34,
  'clf__colsample_bytree': 0.844,
  'clf__boosting_type': 'dart',
  'clf__boost_from_average': False})

In [None]:
# Weighted F1 of the search object's predictions on the held-out test split.
predictions = rs.predict(X_test)
f1_score(y_test, predictions, average='weighted')

0.3641460092626899

In [46]:
# `rs` is the fitted RandomizedSearchCV, which is not subscriptable: indexing it like a
# cross_validate() result (rs['test_score'], rs['estimator'][0]) raises TypeError.
# Use the search object's own attributes/methods instead.
wandb.summary[f'cv_f1_score_{model}'] = rs.best_score_  # mean CV score (f1_weighted) of the best candidate

predictions = rs.predict(X_test)
wandb.summary[f'accuracy_test_{model}'] = accuracy_score(y_test, predictions)
wandb.summary[f'f1_score_test_{model}'] = f1_score(y_test, predictions, average='weighted')
wandb.summary[f'precision_test_{model}'] = precision_score(y_test, predictions, average='weighted')
wandb.summary[f'recall_test_{model}'] = recall_score(y_test, predictions, average='weighted')

# wandb.log('best_params', rs.best_params_)

In [47]:
# Close the W&B run started above.
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy_test_NaiveClassifier,0.273
cv_f1_score_NaiveClassifier,0.27281
f1_score_test_NaiveClassifier,0.27772
precision_test_NaiveClassifier,0.28338
recall_test_NaiveClassifier,0.273


In [128]:
# Full constructor-parameter dict of the tuned 'clf' step inside the best pipeline.
best_params = rs.best_estimator_.named_steps['clf'].get_params()

In [13]:
# Re-read the embedding features from the working directory and redo the split,
# rebinding X_train / X_test / y_train / y_test.
# NOTE(review): the saved output of this cell is a pandas ParserError
# ("Expected 1537 fields in line 543, saw 1682"), i.e. the file has a malformed row --
# confirm the CSV before relying on this cell.
# NOTE(review): this path ignores the `environment` switch used when `train` was loaded -- confirm intended.
data = pd.read_csv('embfeats10K.csv')
X_train, X_test, y_train, y_test = get_train_test_split(data)

ParserError: Error tokenizing data. C error: Expected 1537 fields in line 543, saw 1682


In [130]:
# `best_params` was read from the tuned LGBMClassifier step of the search pipeline, so the
# estimator rebuilt from it must be an LGBMClassifier as well; RandomForestClassifier does not
# accept LightGBM-specific keywords (boosting_type, num_leaves, ...) and raises TypeError.
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Weighted F1 on the held-out split.
# NOTE(review): this scores `rs` (the search object), not the `model` fitted in the
# previous cell -- confirm which estimator was meant to be evaluated here.
predictions = rs.predict(X_test)
f1_score(y_test, predictions, average='weighted')

In [51]:
# List the feature columns currently held in X_train.
X_train.columns

Index(['venue', 'innings', 'batting_team', 'bowling_team', 'striker',
       'non_striker', 'bowler', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs',
       'bat_2_runs', 'bat_3_runs', 'bat_4_runs', 'bat_6_runs',
       'bat_num_dismissals', 'bat_wides', 'bat_total_balls', 'bowl_0_runs',
       'bowl_1_runs', 'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs',
       'bowl_6_runs', 'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
      dtype='object')

## Next Steps:
1. Evaluate more classifiers (LGBM, XGBoost, Bagging, GBR, ExtraTrees) on the same dataset using StratifiedKFold, with `shuffle=True` in `train_test_split` (TTS).
2. Repeat step 1 using TimeSeriesSplit, with `shuffle=False` in `train_test_split`.
3. Apply feature transformers (power, k-bins, spline) and repeat steps 1 and 2.
4. Create ensemble models from the results of steps 1 and 2 and evaluate them.
5. Create ensemble models from the results of step 3 and evaluate them.
6. Create a new target from the (dots, runs, four, six, wicket) labels and repeat steps 1 to 5.