## ============   Scikit Learn: Ensembling & GridSearch    ===============

##### Random Forest: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

##### Grid Search: https://scikit-learn.org/stable/modules/grid_search.html
#####              &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.GridSearchCV.html

##### only accepts float attribute values; i.e., one hot encoding; therefore using use_one_hot_encoding = 1

##### I think uses a different pruning technique; Link: https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning

In [1]:
use_grid_search      = 1     # searches for various combinations of parameters to get best validation accuracy
use_default          = 1     # uses default parameters for the random forest
depth_limit          = 70    # grow the tree only upto this depth
num_trees            = 100   # number of trees in the forest
use_one_hot_encoding = 1     # i do not understand, why one hot encoding can be useful here
                             # it makes more sense with neural network
                             # you can use label encoding; but how it is useful if number of splits same for all features in DT-sciket learn
merge_train_valid    = 0     # add validation set to training set since validation fraction is taken from train set only
                             # after including it; why the accuracy on validation set not close to 1 but it is ~ 0.66?
# default accuracy                 = 0.61
# + depth_limit > 20               = 0.61 and train > 0.95
# + depth_limit = 15               = 0.59 and train = 0.90
# using grid search + max_features = 0.642                       ====== BEST

assert(depth_limit          >= 0)
assert(use_one_hot_encoding == 0   or    use_one_hot_encoding == 1)

need_label_encoding = ['team','host','opp','month', 'day_match']

### Loading Data

In [2]:
import pandas as pd
df = pd.read_csv('dataset_cricket_match/train.csv')
df = df.drop("Unnamed: 0", axis = 1)
df.head(5)
# fow: fall of wicket;   rpo: run per over or run rate

if merge_train_valid == 1:
    valid_df = pd.read_csv('dataset_cricket_match/val.csv')
    valid_df = valid_df.drop("Unnamed: 0", axis = 1)
    df = pd.concat([df,valid_df])

### Convert Categorical Data to One Hot Encoding

In [3]:
def convert_df_to_one_hot(df):
    for attr_name in need_label_encoding:
        unique_attr = df[attr_name].unique()
        for attr_val in unique_attr:
            new_attr_name = attr_name + "_" + str(attr_val)
            df[new_attr_name] = 0
            df.loc[df[attr_name] == attr_val, new_attr_name] = 1
        df = df.drop(attr_name, axis = 1) # drops columns
    return df
    
if use_one_hot_encoding == 1:
   df = convert_df_to_one_hot(df) 
df.head()

Unnamed: 0,year,toss,bat_first,format,fow,score,rpo,result,team_australia,team_india,...,month_mar,month_jun,month_oct,month_feb,month_apr,month_may,month_dec,day_match_0,day_match_1,day_match_2
0,2012,1,0,1,5,146,7.3,1,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2020,0,1,0,6,340,6.8,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2,2009,1,0,0,4,286,5.72,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1987,1,1,0,6,225,4.5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
4,2009,0,0,1,5,153,7.65,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### STATS

In [4]:
print ("number of rows         : ", df.shape[0])
print ("number of features     : ", df.shape[1]-1)  # excluding the class labels
print ("number of class labels : ", 2)
#df.info()

number of rows         :  7827
number of features     :  79
number of class labels :  2


### Identify Attributes as Categorical or Continuous

In [5]:
attr_list = df.axes[1].tolist()
attr_list.remove('result')
cont_attr = ['fow', 'score', 'rpo']

### Convert to Numpy Arrays

In [6]:
y_train = df['result'].to_numpy()
df      = df.drop("result", axis = 1)
x_train = df.to_numpy()
print ("data and features shape: ", x_train.shape)
print ("class lables shape     : ", y_train.shape)

data and features shape:  (7827, 79)
class lables shape     :  (7827,)


### Scikit Train

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# default max_depth = None;  nodes are expanded until all leaves are pure or until all leaves contain less than 2 samples.
# default n_estimators = 100; number of trees in the forest
# default max_features     = sqrt(num_features); number of features to choose the best split
# default min_sample_split = 2; number of childeren to split into
if use_default == 1:
    baseClf = RandomForestClassifier(random_state=0)
else:
    baseClf = RandomForestClassifier(random_state=0, max_depth = depth_limit, n_estimators=num_trees)

if use_grid_search == 1:
    param_grid = {'max_depth':        [10, 15, 50, 80], 
                  #'n_estimators':     [50, 100],
                  'max_features':     [0.5, 1],
                  'min_samples_split':[2, 4, 10]
                 }      
    # this might not work better than default sometime since cv error is minimized
    
    #param_grid = {'max_depth': [80]}                             # this must work the same as default
    clf       = GridSearchCV(baseClf, param_grid, cv=5)           # cv = number of cross-validation folds (more value; more overfitting)
else:
    clf = baseClf

#print("specifications of the classifier: \n\n", vars(clf))   
clf.fit(x_train, y_train)                            

### Scikit Predict

In [8]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

train_df = pd.read_csv('dataset_cricket_match/train.csv')
valid_df = pd.read_csv('dataset_cricket_match/val.csv')
test_df  = pd.read_csv('dataset_cricket_match/test.csv')
df_list  = [train_df, valid_df, test_df] 
x_list   = []
y_list   = []

index = 0
for df in df_list:
    df = df.drop("Unnamed: 0", axis = 1)
    if use_one_hot_encoding == 1:           # converting categorical attributes to one hot encoded vector
        df = convert_df_to_one_hot(df) 
    print ("number of rows         : ", df.shape[0])
    
    y_data = df['result'].to_numpy()
    df     = df.drop("result", axis = 1)
    x_data = df.to_numpy()

    x_list.append(x_data)
    y_list.append(y_data)
    

number of rows         :  7827
number of rows         :  870
number of rows         :  967


In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
def getAccuracy(index, enable_prints):
    y_pred = clf.predict(x_list[index])
    y_pred = y_pred.tolist()
    y_true = y_list[index].tolist()
    classes = [0,1]
    if enable_prints == 1:
        print("PR Report         : \n", classification_report(y_true, y_pred, labels=classes, zero_division=0))
        print("Confusion Matrix  : \n", confusion_matrix(y_true, y_pred))
        #print("\nAccuracy        : ", accuracy_score(y_true, y_pred))
    return accuracy_score(y_true, y_pred)

df_list = [df]
for index in range(0, len(x_list)):
    print ("\naccuracy: ", getAccuracy(index,1))

PR Report         : 
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      3887
           1       0.96      0.96      0.96      3940

    accuracy                           0.96      7827
   macro avg       0.96      0.96      0.96      7827
weighted avg       0.96      0.96      0.96      7827

Confusion Matrix  : 
 [[3733  154]
 [ 139 3801]]

accuracy:  0.9625654784719561
PR Report         : 
               precision    recall  f1-score   support

           0       0.67      0.69      0.68       458
           1       0.64      0.62      0.63       412

    accuracy                           0.65       870
   macro avg       0.65      0.65      0.65       870
weighted avg       0.65      0.65      0.65       870

Confusion Matrix  : 
 [[314 144]
 [158 254]]

accuracy:  0.6528735632183909
PR Report         : 
               precision    recall  f1-score   support

           0       0.63      0.71      0.67       487
           1   