In [8]:
# !git clone --recursive https://github.com/microsoft/LightGBM
# !cd LightGBM
# !mkdir build
# !cd build
# !cmake -DUSE_CUDA=1 ..
# !make -j4

In [1]:
import os
import pickle
import wandb
import warnings
# from utility import *

environment = 'local'
if environment == 'paperspace':
    os.chdir('/notebooks/Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, SplineTransformer, KBinsDiscretizer, \
     StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, r_regression, mutual_info_regression, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, get_scorer_names, accuracy_score, f1_score, precision_score, \
     confusion_matrix, recall_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, cross_validate, TimeSeriesSplit
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor, LocalOutlierFactor
import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
# import lightgbm as lgb

RANDOM_STATE = 42
warnings.filterwarnings('ignore')
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgsparsh[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
run = wandb.init(
  project="Dream11", entity=None, job_type="modeling",
  notes="Modelling the Dream11 dataset (~40 games) with RandomForestClassifier (7 classes) with feature embeddings",
  tags=["embeddings", "multiclass_classification", "imbalanced_data", \
        "random_search", "RandomForestClassifier", "StratifiedKFold"]
)

In [3]:
if environment == 'local':
    train = pd.read_csv('../Inputs/ball-by-ball prediction/embfeats10K.csv')
else:
    train = pd.read_csv('embfeats10K.csv')

In [114]:
def get_train_test_split(df, target = 'target', test_size=0.1):
    le = LabelEncoder()
    X, y = df.drop(target, axis=1), le.fit_transform(df[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, random_state=RANDOM_STATE)
    return X_train, X_test, y_train, y_test

In [115]:
X_train, X_test, y_train, y_test = get_train_test_split(train[:10000])

In [116]:
cat_features = X_train.select_dtypes(include=['object']).columns
num_features = X_train.select_dtypes(exclude=['object']).columns

In [117]:
cat_features, num_features

(Index(['venue', 'batting_team', 'bowling_team', 'striker', 'non_striker',
        'bowler'],
       dtype='object'),
 Index(['innings', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs', 'bat_2_runs',
        'bat_3_runs', 'bat_4_runs', 'bat_6_runs', 'bat_num_dismissals',
        'bat_wides', 'bat_total_balls', 'bowl_0_runs', 'bowl_1_runs',
        'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs', 'bowl_6_runs',
        'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
       dtype='object'))

In [118]:
numeric_transformer = Pipeline([
      # ('poly_feats', PolynomialFeatures(degree=2)),
      # ('b_splines', SplineTransformer()),
      ('scaler', StandardScaler()),
    #   ('bin', KBinsDiscretizer(encode='ordinal')), #only improved Lars
      ('select_feats', SelectFromModel(lm.Lasso(random_state=RANDOM_STATE), threshold='median'))
])
categorical_transformer = Pipeline([
      ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [119]:
preprocessor = ColumnTransformer(
    transformers=[
        # ('new_feats', CustomFeatureTransformer(), num_features),
        ('num', numeric_transformer, num_features),
        # ('cat', categorical_transformer, cat_features)
    ]
)

In [120]:
# model = lm.LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=5, n_jobs=-1)
# model.fit(X_train, y_train)

In [121]:
pipe_params={
    'clf__n_estimators': np.linspace(200, 1000, 100, dtype=np.int16),
    'clf__max_depth': np.linspace(2, 50, 10, dtype=np.int16),
    'clf__min_samples_split': np.linspace(2, 20, 5, dtype=np.int16),
    'clf__min_samples_leaf': np.linspace(2, 10, 5, dtype=np.int16),
    'clf__max_features': np.linspace(0.1, 1, 10, dtype=np.float16),
    # 'clf__learning_rate': np.linspace(0.01, 1, 50, dtype=np.float16),
    'clf__criterion': ['gini', 'entropy', 'log_loss'],
    # 'clf__bootstrap': [True, False],
    # 'clf__loss': ['log_loss', 'exponential'],
    'clf__max_samples': np.linspace(0.1, 1.0, 10, dtype=np.float16),
    'clf__ccp_alpha': np.linspace(0.0, 5.0, 20, dtype=np.float16),
    'clf__warm_start': [True, False],
    # 'clf__n_iter_no_change': np.linspace(1, 10, 10, dtype=np.int16),
    # 'clf__min_impurity_decrease': np.linspace(0.0001, 10.0, 10, dtype=np.float16),
}

params={
    'n_estimators': np.linspace(200, 1000, 100, dtype=np.int16),
    'max_depth': np.linspace(2, 50, 10, dtype=np.int16),
    'min_samples_split': np.linspace(2, 20, 5, dtype=np.int16),
    'min_samples_leaf': np.linspace(2, 10, 5, dtype=np.int16),
    'max_features': np.linspace(0.1, 1, 10, dtype=np.float16),
    # 'clf__learning_rate': np.linspace(0.01, 1, 50, dtype=np.float16),
    'criterion': ['gini', 'entropy', 'log_loss'],
    # 'clf__bootstrap': [True, False],
    # 'clf__loss': ['log_loss', 'exponential'],
    'max_samples': np.linspace(0.1, 1.0, 10, dtype=np.float16),
    'ccp_alpha': np.linspace(0.0, 5.0, 20, dtype=np.float16),
    'warm_start': [True, False],
    # 'clf__n_iter_no_change': np.linspace(1, 10, 10, dtype=np.int16),
    # 'clf__min_impurity_decrease': np.linspace(0.0001, 10.0, 10, dtype=np.float16),
}

In [122]:
#Randomized Search CV - RF
pipe = Pipeline([
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(bootstrap=True, n_jobs=-1,random_state=RANDOM_STATE))
])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'prep', 'clf', 'prep__n_jobs', 'prep__remainder', 'prep__sparse_threshold', 'prep__transformer_weights', 'prep__transformers', 'prep__verbose', 'prep__verbose_feature_names_out', 'prep__num', 'prep__num__memory', 'prep__num__steps', 'prep__num__verbose', 'prep__num__scaler', 'prep__num__select_feats', 'prep__num__scaler__copy', 'prep__num__scaler__with_mean', 'prep__num__scaler__with_std', 'prep__num__select_feats__estimator__alpha', 'prep__num__select_feats__estimator__copy_X', 'prep__num__select_feats__estimator__fit_intercept', 'prep__num__select_feats__estimator__max_iter', 'prep__num__select_feats__estimator__positive', 'prep__num__select_feats__estimator__precompute', 'prep__num__select_feats__estimator__random_state', 'prep__num__select_feats__estimator__selection', 'prep__num__select_feats__estimator__tol', 'prep__num__select_feats__estimator__warm_start', 'prep__num__select_feats__estimator', 'prep__num__select_feats__importance_getter'

In [123]:
# model = RandomForestClassifier(bootstrap=True, n_jobs=-1,random_state=420)
model = "RandomForestClassifier"
cv = StratifiedKFold(n_splits=3)
rs=RandomizedSearchCV(pipe,pipe_params, n_iter = 50, n_jobs=-1,cv=cv.split(X_train, y_train), scoring='f1_weighted',random_state=RANDOM_STATE)
rs.fit(X_train, y_train)

In [124]:
#check the validation f1 score
rs.best_score_

0.35410716226741085

In [125]:
#check the test f1 score
predictions = rs.predict(X_test)
f1_score(y_test, predictions, average='weighted')

0.3641460092626899

In [126]:
wandb.summary[f'cv_f1_score_{model}'] = rs.best_score_

predictions = rs.predict(X_test)
wandb.summary[f'accuracy_test_{model}'] = accuracy_score(y_test, predictions)
wandb.summary[f'f1_score_test_{model}'] = f1_score(y_test, predictions, average='weighted')
wandb.summary[f'precision_test_{model}'] = precision_score(y_test, predictions, average='weighted')
wandb.summary[f'recall_test_{model}'] = recall_score(y_test, predictions, average='weighted')

wandb.log('best_params', rs.best_params_)

In [127]:
run.finish()

VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.233772…

0,1
accuracy_test_RandomForestClassifier,0.422
cv_f1_score_RandomForestClassifier,0.35411
f1_score_test_RandomForestClassifier,0.36415
precision_test_RandomForestClassifier,0.34118
recall_test_RandomForestClassifier,0.422


In [128]:
best_params = rs.best_estimator_.get_params()['clf'].get_params()

In [13]:
data = pd.read_csv('embfeats10K.csv')
X_train, X_test, y_train, y_test = get_train_test_split(data)

ParserError: Error tokenizing data. C error: Expected 1537 fields in line 543, saw 1682


In [130]:
model = RandomForestClassifier(**best_params)
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
predictions = rs.predict(X_test)
f1_score(y_test, predictions, average='weighted')

In [51]:
X_train.columns

Index(['venue', 'innings', 'batting_team', 'bowling_team', 'striker',
       'non_striker', 'bowler', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs',
       'bat_2_runs', 'bat_3_runs', 'bat_4_runs', 'bat_6_runs',
       'bat_num_dismissals', 'bat_wides', 'bat_total_balls', 'bowl_0_runs',
       'bowl_1_runs', 'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs',
       'bowl_6_runs', 'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
      dtype='object')

## Next Steps:
1. Evaluate more classifiers (LGBM, XGBoost, Bagging, GBR, ExtraTrees) on the same dataset using StratifiedKFold, shuffle=True (in TTS)
2. Repeat step-1 using TimeSeriesSplit, shuffle=False (in TTS)
3. Use feature transformers (power, kbins, spline), repeat step-1,2
4. create ensemble models from step-1,2 and evaluate
5. create ensemble models using step-3 and evaluate
6. create new target using (dots, runs, four, six, wicket labels) repeat step-1to5