In [1]:
import os
import pickle
import wandb
import warnings
# from utility import *

environment = 'local'
if environment == 'paperspace':
    os.chdir('/notebooks/Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, SplineTransformer, KBinsDiscretizer, \
     StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, r_regression, mutual_info_regression, SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, get_scorer_names, accuracy_score, f1_score, precision_score, \
     confusion_matrix, recall_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, cross_validate, TimeSeriesSplit
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor, LocalOutlierFactor
import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from hyperparams import *
from util import *

from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, TomekLinks, RandomUnderSampler, \
     EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, InstanceHardnessThreshold, NeighbourhoodCleaningRule
# from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.combine import SMOTETomek, SMOTEENN

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

RANDOM_STATE = 42
warnings.filterwarnings('ignore')
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgsparsh[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# run = wandb.init(
#   project="Dream11", entity=None, job_type="modeling",
#   notes="Modelling the ipl2022 with XGBClassifier (5 classes) w/o feature embeddings",
#   tags = ['nosampling', 'xgboost', '5_classes', 'no_embeddings', 'n_iter100', 'custom_metric'],
# )

In [3]:
if environment == 'local':
    train = pd.read_csv('../Inputs/ball-by-ball prediction/ipl2022.csv')
    # train = pd.read_csv('../Inputs/ball-by-ball prediction/embfeats10K_pca_clustering.csv')
else:
    train = pd.read_csv('embfeats10K.csv')
    train = pd.read_csv('main.csv')

In [4]:
def get_train_test_split(df, target = 'target', test_size=0.1):
    le = LabelEncoder()
    X, y = df.drop(target, axis=1), le.fit_transform(df[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, random_state=RANDOM_STATE)
    X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
    return X_train, X_test, y_train, y_test, le.classes_

def split_match_phases(df, phase='pp'):
    # split data into power play, middle overs and death overs
    if phase == 'pp':
        df = df[df["overs"].isin([1, 2, 3, 4, 5, 6])]
    elif phase == 'mo':
        df = df[df["overs"].isin([7, 8, 9, 10, 11, 12, 13, 14, 15])]
    elif phase == 'd':
        df = df[df["overs"].isin([16, 17, 18, 19, 20])]
    df.reset_index(drop=True, inplace=True)
    return df

def get_match_phases(df, phase='pp'):
    X_train, X_test, y_train, y_test, labels = get_train_test_split(df)
    df_train_ = pd.concat([X_train, pd.DataFrame(y_train, columns=['target'])], axis=1)
    df_test_ = pd.concat([X_test, pd.DataFrame(y_test, columns=['target'])], axis=1)
    df_train, df_test = split_match_phases(df_train_, phase), split_match_phases(df_test_, phase)
    X_train, y_train = df_train.drop('target', axis=1), df_train['target']
    X_test, y_test = df_test.drop('target', axis=1), df_test['target']
    return X_train, X_test, y_train, y_test, labels
    

def calc_metrics(rs, label_dict, training=True, metric="precision"):
    if training:
        y_pred = rs.predict(X_train)
        y_true = y_train
    else:
        y_pred = rs.predict(X_test)
        y_true = y_test

    preds = [label_dict[pred] for pred in y_pred]
    true = [label_dict[pred] for pred in y_true]

    if metric == "precision":
        class_prec = {
            le: ps * 100
            for ps, le in zip(
                precision_score(true, preds, average=None), label_dict.values()
            )
        }
        return class_prec
    elif metric == "recall":
        class_rec = {
            le: ps * 100
            for ps, le in zip(
                recall_score(true, preds, average=None), label_dict.values()
            )
        }
        return class_rec

In [None]:
train.sort_values(by=['match_id', 'innings', 'over', 'ball'], inplace=True)

In [5]:
# X_train, X_test, y_train, y_test, labels = get_match_phases(train, phase='pp')

In [6]:
X_train, X_test, y_train, y_test, labels = get_train_test_split(train)

In [7]:
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
cat_features = cat_features #+ ['cluster']
num_features = X_train.drop(columns=cat_features).select_dtypes(exclude=['object']).columns

In [8]:
len(num_features), num_features, cat_features

(21,
 Index(['innings', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs', 'bat_2_runs',
        'bat_3_runs', 'bat_4_runs', 'bat_6_runs', 'bat_num_dismissals',
        'bat_wides', 'bat_total_balls', 'bowl_0_runs', 'bowl_1_runs',
        'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs', 'bowl_6_runs',
        'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
       dtype='object'),
 ['venue', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler'])

In [9]:
# # assuming numerical features are stored in a list called "numerical_features"
# num_cols = 3 # number of columns in the subplot grid
# num_rows = (len(num_features) + num_cols - 1) // num_cols # number of rows in the subplot grid

# fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 4*num_rows)) # create the subplot grid

# for i, feature in enumerate(num_features):
#     row = i // num_cols
#     col = i % num_cols
#     sns.histplot(data=X, x=feature, hue='target', kde=True, ax=axs[row, col]) # plot the histogram

# plt.tight_layout() # adjust spacing between subplots
# plt.show()


In [10]:
# # subset = ['bat_0_runs', 'bat_1_runs', 'bat_2_runs', 'bat_3_runs', 'bat_4_runs', 'bat_6_runs']

# # Create a pairplot for the subset
# sns.pairplot(X)
# plt.show()

In [84]:
# # create subplots for each numerical feature
# fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# # loop through each subplot and create a boxplot of the target variable vs. the numerical feature
# for i, ax in enumerate(axes.flatten()):
#     sns.boxplot(x='target', y=num_features[i], data=X, ax=ax)
#     ax.set_title(f'{num_features[i]} vs. Target')
    
# # display the plot
# plt.show()

In [85]:
# # create subplots for each numerical feature
# fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# # loop through each subplot and create a scatterplot of the numerical feature vs. the target variable
# for i, ax in enumerate(axes.flatten()):
#     sns.scatterplot(x=num_features[i], y='target', data=X, ax=ax)
#     ax.set_title(f'{num_features[i]} vs. Target')
    
# # display the plot
# plt.show()



In [86]:
# X = preprocessor.fit_transform(train.drop('target', axis=1))

In [87]:
# X.shape

In [88]:
# model = TSNE(n_components=2, random_state=RANDOM_STATE, perplexity=500)
# tsne = model.fit_transform(X)

In [89]:
# # pca = PCA(n_components=0.50)
# # pca.fit(X)
# # pc = pca.transform(X)
# kmeans = KMeans(n_clusters=8, algorithm='lloyd', random_state=RANDOM_STATE)
# kmeans.fit(X)

# fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
# sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
# # print(np.sum(pca.explained_variance_ratio_))

In [90]:
#add cluster to train
# X = pd.DataFrame(X, columns=[f'pca_{i}' for i in range(X.shape[1])])

In [91]:
#add cluster to train
# X['cluster'] = kmeans.labels_

In [92]:
# X['target'] = train.target

In [93]:
# X.to_csv('../Inputs/ball-by-ball prediction/embfeats10K_pca_clustering.csv', index=False)

In [11]:
cat_features, num_features

(['venue', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler'],
 Index(['innings', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs', 'bat_2_runs',
        'bat_3_runs', 'bat_4_runs', 'bat_6_runs', 'bat_num_dismissals',
        'bat_wides', 'bat_total_balls', 'bowl_0_runs', 'bowl_1_runs',
        'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs', 'bowl_6_runs',
        'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
       dtype='object'))

In [21]:
numeric_transformer = imbPipeline([
      ("poly", PolynomialFeatures(degree=2)),
      # ("splines", SplineTransformer()),
      # ("log", LogTransformer()),
      ("scaler", MinMaxScaler()),
      # ("bins", KBinsDiscretizer(encode="ordinal")),
      # ("feats", SelectFromModel(lm.Lasso(random_state=RANDOM_STATE), threshold='median'))
      # ('pca', PCA(n_components=0.94))
])

categorical_transformer = imbPipeline([
      ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
      # ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [22]:
preprocessor = ColumnTransformer(
    transformers=[
        # ('new_feats', CustomFeatureTransformer(), num_features),
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

In [23]:
#Randomized Search CV - LGBM
pipe = imbPipeline([
    ('prep', preprocessor),
    ('undersample', get_sampler(algo="none")),
    ('clf', XGBClassifier(booster='gbtree', tree_method='hist', #metric='multiclass', eval_metric='mlogloss',\
                        #    multi_strategy='multi_output_tree', 
                           random_state=RANDOM_STATE))
])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'prep', 'undersample', 'clf', 'prep__n_jobs', 'prep__remainder', 'prep__sparse_threshold', 'prep__transformer_weights', 'prep__transformers', 'prep__verbose', 'prep__verbose_feature_names_out', 'prep__num', 'prep__cat', 'prep__num__memory', 'prep__num__steps', 'prep__num__verbose', 'prep__num__poly', 'prep__num__scaler', 'prep__num__poly__degree', 'prep__num__poly__include_bias', 'prep__num__poly__interaction_only', 'prep__num__poly__order', 'prep__num__scaler__clip', 'prep__num__scaler__copy', 'prep__num__scaler__feature_range', 'prep__cat__memory', 'prep__cat__steps', 'prep__cat__verbose', 'prep__cat__encoder', 'prep__cat__encoder__categories', 'prep__cat__encoder__dtype', 'prep__cat__encoder__encoded_missing_value', 'prep__cat__encoder__handle_unknown', 'prep__cat__encoder__unknown_value', 'clf__objective', 'clf__use_label_encoder', 'clf__base_score', 'clf__booster', 'clf__callbacks', 'clf__colsample_bylevel', 'clf__colsample_bynode', 'clf__c

In [24]:
label_dict = {i:labels[i] for i in range(len(labels))}
label_dict

{0: 'four', 1: 'norun', 2: 'run', 3: 'six', 4: 'wicket'}

In [25]:
important_classes = [0,3,4]
custom_f1 = get_custom_scorer(important_classes)

In [26]:
# model = RandomForestClassifier(bootstrap=True, n_jobs=-1,random_state=420)
model = pipe['clf'].__class__.__name__
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
rs=RandomizedSearchCV(pipe,params_wrapper('xgb'), n_iter = 20, n_jobs=-1,cv=cv.split(X_train, y_train), scoring=custom_f1,random_state=RANDOM_STATE)
rs.fit(X_train, y_train)

In [27]:
#check the validation f1 score
rs.best_score_, rs.best_params_

(0.08100041756935322,
 {'prep__num__poly__interaction_only': True,
  'prep__num__poly__include_bias': True,
  'prep__num__poly__degree': 1,
  'clf__subsample': 0.323,
  'clf__reg_lambda': 0.3691,
  'clf__reg_alpha': 0.7373,
  'clf__n_estimators': 277,
  'clf__monotone_constraints': (-1, 1),
  'clf__min_child_weight': 0.3083,
  'clf__learning_rate': 0.4,
  'clf__grow_policy': 'depthwise',
  'clf__gamma': 0.4685,
  'clf__colsample_bytree': 0.769,
  'clf__colsample_bynode': 0.631,
  'clf__colsample_bylevel': 0.3076})

In [28]:
#check the test f1 score
predictions = rs.predict(X_test)
custom_scorer(y_test, predictions, important_classes)

0.06669516208521065

In [30]:
calc_metrics(rs, label_dict, training=True, metric='precision'), calc_metrics(rs, label_dict, training=False, metric='precision')

({'four': 59.010989010989015,
  'norun': 64.10910831073537,
  'run': 65.41833667334669,
  'six': 58.691206543967276,
  'wicket': 77.99145299145299},
 {'four': 12.121212121212121,
  'norun': 41.66666666666667,
  'run': 45.98870056497175,
  'six': 11.320754716981133,
  'wicket': 8.571428571428571})

In [21]:
cm = wandb.plot.confusion_matrix(
    y_true=y_test,
    preds=predictions,
    class_names=labels)

wandb.log({
    f"cv_custom_f1_{model}": rs.best_score_,
    f"recall_test_{model}": calc_metrics(rs, label_dict, training=False, metric='recall'),
    f"precision_test_{model}": calc_metrics(rs, label_dict, training=False, metric='precision'),
    'best_params': rs.best_params_,
    "conf_mat": cm,
})

In [22]:
run.finish()

VBox(children=(Label(value='0.003 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.180233…

0,1
cv_custom_f1_XGBClassifier,▁

0,1
cv_custom_f1_XGBClassifier,0.09235
