In [3]:
environment = 'local'
if environment == 'paperspace':
    os.chdir('/notebooks/Scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures, SplineTransformer, KBinsDiscretizer, \
     StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, r_regression, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score, get_scorer_names, accuracy_score, f1_score, precision_score, \
     confusion_matrix, recall_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score, StratifiedKFold, cross_validate, TimeSeriesSplit
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor, LocalOutlierFactor
import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestClassifier

import os
import pickle
import wandb
import warnings
import utility as ut
warnings.filterwarnings('ignore')
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
run = wandb.init(
  project=ut.PROJECT_NAME, entity=ut.ENTITY, job_type="modeling",
  notes="Modeling with hyperparameter tuning",
  tags=["no_feature_eng", "multiclass_classification", "imbalanced_data", "hyperparameter_tuning", "random_search", "random_forest"]
)

[34m[1mwandb[0m: Currently logged in as: [33mgsparsh[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
if environment == 'local':
    train = pd.read_csv('../Inputs/ball-by-ball prediction/train.csv')
else:
    train = pd.read_csv('train.csv')

In [12]:
def get_train_test_split(df, target = 'target', test_size=0.1):
    le = LabelEncoder()
    X, y = df.drop(target, axis=1), le.fit_transform(df[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
    return X_train, X_test, y_train, y_test

In [13]:
X_train, X_test, y_train, y_test = get_train_test_split(train)

In [14]:
cat_features = X_train.select_dtypes(include=['object']).columns
num_features = X_train.select_dtypes(exclude=['object']).columns

In [15]:
cat_features, num_features

(Index(['venue', 'batting_team', 'bowling_team', 'striker', 'non_striker',
        'bowler'],
       dtype='object'),
 Index(['innings', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs', 'bat_2_runs',
        'bat_3_runs', 'bat_4_runs', 'bat_6_runs', 'bat_num_dismissals',
        'bat_wides', 'bat_total_balls', 'bowl_0_runs', 'bowl_1_runs',
        'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs', 'bowl_6_runs',
        'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls'],
       dtype='object'))

In [16]:
numeric_transformer = Pipeline([
      # ('poly_feats', PolynomialFeatures(degree=2)),
      # ('b_splines', SplineTransformer()),
      ('scaler', StandardScaler()),
    #   ('bin', KBinsDiscretizer(encode='ordinal')), #only improved Lars
      # ('select_feats', SelectKBest(f_regression, k=10))
])
categorical_transformer = Pipeline([
      ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        # ('new_feats', CustomFeatureTransformer(), num_features),
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

In [34]:
params={
    'rf__n_estimators': np.linspace(200, 1000, 10, dtype=np.int64),
    'rf__max_depth': np.linspace(10, 100, 10, dtype=np.int64),
    'rf__min_samples_split': np.linspace(5, 30, 10, dtype=np.int64),
    'rf__min_samples_leaf': np.linspace(2, 30, 10, dtype=np.int64),
    'rf__max_features': ['log2', 'sqrt'],
    'rf__criterion': ['gini', 'entropy'],
    'rf__bootstrap': [True, False],
    'rf__class_weight': ['balanced', 'balanced_subsample'],
    'rf__max_samples': np.linspace(0.1, 1.0, 10),
    'rf__ccp_alpha': np.linspace(0.0, 5.0, 20)
}

In [35]:
#Randomized Search CV - RF
pipe = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestClassifier( n_jobs=-1, random_state=3142))
])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'prep', 'rf', 'prep__n_jobs', 'prep__remainder', 'prep__sparse_threshold', 'prep__transformer_weights', 'prep__transformers', 'prep__verbose', 'prep__verbose_feature_names_out', 'prep__num', 'prep__cat', 'prep__num__memory', 'prep__num__steps', 'prep__num__verbose', 'prep__num__scaler', 'prep__num__scaler__copy', 'prep__num__scaler__with_mean', 'prep__num__scaler__with_std', 'prep__cat__memory', 'prep__cat__steps', 'prep__cat__verbose', 'prep__cat__encoder', 'prep__cat__encoder__categories', 'prep__cat__encoder__dtype', 'prep__cat__encoder__encoded_missing_value', 'prep__cat__encoder__handle_unknown', 'prep__cat__encoder__unknown_value', 'rf__bootstrap', 'rf__ccp_alpha', 'rf__class_weight', 'rf__criterion', 'rf__max_depth', 'rf__max_features', 'rf__max_leaf_nodes', 'rf__max_samples', 'rf__min_impurity_decrease', 'rf__min_samples_leaf', 'rf__min_samples_split', 'rf__min_weight_fraction_leaf', 'rf__n_estimators', 'rf__n_jobs', 'rf__oob_score', 'rf

In [40]:
model = 'RF'
cv = TimeSeriesSplit(n_splits=5)
rs=RandomizedSearchCV(pipe,params, n_iter = 50, n_jobs=-1,cv=cv.split(X_train, y_train),random_state=420)
rs.fit(X_train, y_train)

In [39]:
# predictions = rs_log.predict(X_test)
# f1_score(y_test, predictions, average='weighted')

0.008958089816454103

In [41]:
predictions = rs.predict(X_test)
wandb.summary[f'accuracy_test_{model}'] = accuracy_score(y_test, predictions)
wandb.summary[f'f1_score_test_{model}'] = f1_score(y_test, predictions, average='weighted')
wandb.summary[f'precision_test_{model}'] = precision_score(y_test, predictions, average='weighted')
wandb.summary[f'recall_test_{model}'] = recall_score(y_test, predictions, average='weighted')

In [42]:
run.finish()

0,1
accuracy_test_RF,0.36313
f1_score_test_RF,0.19347
precision_test_RF,0.13187
recall_test_RF,0.36313


Next Steps:
1. Evaluate more classifiers (LGBM, XGBoost, Bagging, GBR, ExtraTrees) on the same dataset using StratifiedKFold, shuffle=True (in TTS)
2. Repeat step-1 using TimeSeriesSplit, shuffle=False (in TTS)
3. Use feature transformers (power, kbins, spline), repeat step-1,2
4. create ensemble models from step-1,2 and evaluate
5. create ensemble models using step-3 and evaluate
6. create new target using (dots, runs, four, six, wicket labels) repeat step-1to5