## Regression on bowler features

In [None]:
from math import sqrt
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

1. Read data from Bowler dataset

In [None]:
# Load the bowler dataset and remove the columns this notebook treats as
# unwanted, in a single read-then-drop chain.
unwanted_columns = ['catch', 'Direct_runout', 'Indirect_runout', 'Stumping', 'lbw/bowled']
df1 = pd.read_csv('bowler_all3.csv').drop(columns=unwanted_columns)
df1

Unnamed: 0,match_id,bowler,inning,batting_team,total_bowler_runs,maiden,economy,overs,wickets,venue,fp_points,fp_avg,opp_avg,ven_avg,inn_avg,run_avg,eco_avg,maiden_avg,wicket_avg,over_avg
0,1,P Kumar,1,Kolkata Knight Riders,38.0,0.0,9.500000,4,0,M Chinnaswamy Stadium,8.0,0.000000,8.000000,8.000000,8.000000,38.000000,9.500000,0.000000,0.000000,4.000000
1,1,I Sharma,2,Royal Challengers Bangalore,7.0,0.0,2.333333,3,1,M Chinnaswamy Stadium,39.0,0.000000,39.000000,39.000000,39.000000,7.000000,2.333333,0.000000,1.000000,3.000000
2,1,AB Agarkar,2,Royal Challengers Bangalore,25.0,0.0,6.250000,4,3,M Chinnaswamy Stadium,87.0,0.000000,87.000000,87.000000,87.000000,25.000000,6.250000,0.000000,3.000000,4.000000
3,1,AB Dinda,2,Royal Challengers Bangalore,9.0,0.0,3.000000,3,2,M Chinnaswamy Stadium,64.0,0.000000,64.000000,64.000000,64.000000,9.000000,3.000000,0.000000,2.000000,3.000000
4,1,Z Khan,1,Kolkata Knight Riders,38.0,0.0,9.500000,4,1,M Chinnaswamy Stadium,25.0,0.000000,25.000000,25.000000,25.000000,38.000000,9.500000,0.000000,1.000000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,786,A Nortje,2,Rajasthan Royals,33.0,0.0,8.250000,4,2,Dubai International Cricket Stadium,66.0,30.333333,46.500000,47.500000,44.857143,29.375000,7.343750,0.000000,1.250000,4.000000
5325,786,JD Unadkat,1,Delhi Capitals,32.0,0.0,10.666667,3,2,Dubai International Cricket Stadium,48.0,30.333333,24.000000,33.200000,37.250000,30.435897,8.876068,0.038462,1.128205,3.500000
5326,786,JC Archer,1,Delhi Capitals,19.0,0.0,4.750000,4,3,Dubai International Cricket Stadium,109.0,47.000000,63.333333,66.666667,55.789474,28.482759,7.160920,0.103448,1.344828,3.965517
5327,786,R Tewatia,1,Delhi Capitals,23.0,0.0,7.666667,3,0,Dubai International Cricket Stadium,8.0,30.000000,40.000000,13.666667,27.307692,20.576923,7.791667,0.038462,0.846154,2.807692


2. Encoding Categorical values into Numerical Values

In [None]:
# `df` is an alias of `df1` (no copy is made), so the *_code columns added
# below also appear on `df1`; the feature-selection cell reads them from `df1`.
df = df1

# Convert each text column to a pandas categorical and store its integer
# category code in a sibling "<column>_code" column.
for text_column in ("batting_team", "venue", "bowler"):
    df[text_column] = df[text_column].astype("category")
    df[text_column + "_code"] = df[text_column].cat.codes

# Lookup from bowler code back to the bowler's name
bowler_zip = dict(zip(df["bowler_code"], df["bowler"]))
print(bowler_zip)

# Drop the original text columns now that their integer codes exist
df_ready = df.drop(columns=["venue", "batting_team", "bowler"])
df_ready

{141: 'P Kumar', 71: 'I Sharma', 10: 'AB Agarkar', 11: 'AB Dinda', 221: 'Z Khan', 179: 'S Sreesanth', 215: 'WA Mota', 112: 'M Muralitharan', 139: 'P Amarnath', 125: 'MS Gony', 33: 'B Lee', 123: 'MM Patel', 186: 'SK Warne', 185: 'SK Trivedi', 50: 'D Salunkhe', 30: 'B Geeves', 62: 'GD McGrath', 109: 'M Kartik', 148: 'PP Ojha', 161: 'RP Singh', 217: 'WPUJC Vaas', 58: 'DS Kulkarni', 155: 'R Vinay Kumar', 4: 'A Nehra', 67: 'Harbhajan Singh', 29: 'B Akhil', 150: 'Pankaj Singh', 127: 'Mohammad Asif', 214: 'VS Yeligati', 116: 'MA Khote', 49: 'D Kalyankrishna', 212: 'VRV Singh', 197: 'Sohail Tanvir', 1: 'A Kumble', 55: 'DNT Zoysa', 59: 'DW Steyn', 47: 'CRD Fernando', 209: 'Umar Gul', 64: 'Gagandeep Singh', 147: 'PJ Sangwan', 113: 'M Ntini', 57: 'DP Vijaykumar', 104: 'L Balaji', 2: 'A Mishra', 196: 'Shoaib Akhtar', 22: 'Abdur Razzak', 163: 'RR Powar', 5: 'A Nel', 36: 'BAW Mendis', 200: 'T Henderson', 101: 'Kamran Khan', 203: 'T Thushara', 187: 'SL Malinga', 219: 'YA Abdulla', 56: 'DP Nannes', 16

Unnamed: 0,match_id,inning,total_bowler_runs,maiden,economy,overs,wickets,fp_points,fp_avg,opp_avg,ven_avg,inn_avg,run_avg,eco_avg,maiden_avg,wicket_avg,over_avg,batting_team_code,venue_code,bowler_code
0,1,1,38.0,0.0,9.500000,4,0,8.0,0.000000,8.000000,8.000000,8.000000,38.000000,9.500000,0.000000,0.000000,4.000000,7,14,141
1,1,2,7.0,0.0,2.333333,3,1,39.0,0.000000,39.000000,39.000000,39.000000,7.000000,2.333333,0.000000,1.000000,3.000000,13,14,71
2,1,2,25.0,0.0,6.250000,4,3,87.0,0.000000,87.000000,87.000000,87.000000,25.000000,6.250000,0.000000,3.000000,4.000000,13,14,10
3,1,2,9.0,0.0,3.000000,3,2,64.0,0.000000,64.000000,64.000000,64.000000,9.000000,3.000000,0.000000,2.000000,3.000000,13,14,11
4,1,1,38.0,0.0,9.500000,4,1,25.0,0.000000,25.000000,25.000000,25.000000,38.000000,9.500000,0.000000,1.000000,4.000000,7,14,221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,786,2,33.0,0.0,8.250000,4,2,66.0,30.333333,46.500000,47.500000,44.857143,29.375000,7.343750,0.000000,1.250000,4.000000,10,6,6
5325,786,1,32.0,0.0,10.666667,3,2,48.0,30.333333,24.000000,33.200000,37.250000,30.435897,8.876068,0.038462,1.128205,3.500000,2,6,80
5326,786,1,19.0,0.0,4.750000,4,3,109.0,47.000000,63.333333,66.666667,55.789474,28.482759,7.160920,0.103448,1.344828,3.965517,2,6,79
5327,786,1,23.0,0.0,7.666667,3,0,8.0,30.000000,40.000000,13.666667,27.307692,20.576923,7.791667,0.038462,0.846154,2.807692,2,6,154


3. Selecting Features from dataset

In [None]:
# Columns used as model inputs: the running averages, the categorical codes
# created in the encoding step, and the innings number.
# NOTE(review): in the rows displayed for match 1, opp_avg / ven_avg / inn_avg
# equal that row's fp_points and wicket_avg equals its wickets, which suggests
# the *_avg columns include the current match. Confirm they are built from
# earlier matches only -- otherwise they leak the target into the features.
feature_columns = [
    'wicket_avg', 'maiden_avg', 'eco_avg', 'inn_avg', 'ven_avg', 'opp_avg',
    'fp_avg', 'over_avg', 'bowler_code', 'batting_team_code', 'venue_code',
    'inning',
]
df2 = df1[feature_columns]
df3 = df2
df3

Unnamed: 0,wicket_avg,maiden_avg,eco_avg,inn_avg,ven_avg,opp_avg,fp_avg,over_avg,bowler_code,batting_team_code,venue_code,inning
0,0.000000,0.000000,9.500000,8.000000,8.000000,8.000000,0.000000,4.000000,141,7,14,1
1,1.000000,0.000000,2.333333,39.000000,39.000000,39.000000,0.000000,3.000000,71,13,14,2
2,3.000000,0.000000,6.250000,87.000000,87.000000,87.000000,0.000000,4.000000,10,13,14,2
3,2.000000,0.000000,3.000000,64.000000,64.000000,64.000000,0.000000,3.000000,11,13,14,2
4,1.000000,0.000000,9.500000,25.000000,25.000000,25.000000,0.000000,4.000000,221,7,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5324,1.250000,0.000000,7.343750,44.857143,47.500000,46.500000,30.333333,4.000000,6,10,6,2
5325,1.128205,0.038462,8.876068,37.250000,33.200000,24.000000,30.333333,3.500000,80,2,6,1
5326,1.344828,0.103448,7.160920,55.789474,66.666667,63.333333,47.000000,3.965517,79,2,6,1
5327,0.846154,0.038462,7.791667,27.307692,13.666667,40.000000,30.000000,2.807692,154,2,6,1


4. Normalization of features 

In [None]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split, so its statistics also reflect rows that later end up in the test set.
raw_features = df3.values
scaler = preprocessing.StandardScaler()
scaler.fit(raw_features)
X = scaler.transform(raw_features.astype(float))
X

array([[-2.70680789, -0.5747821 ,  1.42986532, ...,  0.0200465 ,
        -0.43502576, -1.00658955],
       [-0.32152423, -0.5747821 , -4.77604047, ...,  1.38963335,
        -0.43502576,  0.99345359],
       [ 4.4490431 , -0.5747821 , -1.38444079, ...,  1.38963335,
        -0.43502576,  0.99345359],
       ...,
       [ 0.50098738,  0.79903013, -0.59563882, ..., -1.12127588,
        -1.23332199, -1.00658955],
       [-0.68849094, -0.06400576, -0.04944943, ..., -1.12127588,
        -1.23332199, -1.00658955],
       [ 2.24724279, -0.5747821 , -0.01059314, ...,  0.70483992,
        -1.23332199,  0.99345359]])

5. Labels separation

In [None]:
# Regression target: the fp_points column as a NumPy array
target_series = df1['fp_points']
y = target_series.values
y

array([  8.,  39.,  87., ..., 109.,   8.,  27.])

6. Splitting data into train and test sets

In [None]:
# Randomized 80:20 train/test split. random_state fixes the shuffle so the
# same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Report the shapes as a quick sanity check on the split sizes
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (4263, 12) (4263,)
Test set: (1066, 12) (1066,)


7. Building Model

In [None]:
# Baseline model: random forest regressor with 500 trees.
# oob_score=True makes the forest compute an out-of-bag score during fit;
# this cell does not read it.
model_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)
model_rf.fit(X_train, y_train)

# Predictions on both splits
pred_train_rf = model_rf.predict(X_train)
pred_test_rf = model_rf.predict(X_test)

# The "accuracy" headings are followed by two regression metrics:
# first the RMSE (square root of the mean squared error), then the R2 score.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train_rf))
r2_train = r2_score(y_train, pred_train_rf)
print('Training Accuracy:')
print(rmse_train)
print(r2_train)

print()

rmse_test = np.sqrt(mean_squared_error(y_test, pred_test_rf))
r2_test = r2_score(y_test, pred_test_rf)
print('Test Accuracy:')
print(rmse_test)
print(r2_test)

Training Accuracy:
7.07400927754113
0.9521949706432913

Test Accuracy:
19.58566934693207
0.6658073234066464


8. K-fold cross validation to test generalization

In [None]:
# 5-fold cross-validation of the baseline forest on the full feature matrix.
# No `scoring` is passed, so each fold is scored with the estimator's own
# `score` method (R2 for scikit-learn regressors).
scores = cross_val_score(estimator=model_rf, X=X, y=y, cv=5)
print(scores)

[0.80246047 0.64706307 0.5607103  0.59503028 0.62120265]


9. Hyperparameter Tuning

In [None]:
# Show the hyperparameters the baseline forest is currently using, as the
# starting point for tuning.
print('Parameters currently in use:\n')
current_params = model_rf.get_params()
pprint(current_params)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': True,
 'random_state': 100,
 'verbose': 0,
 'warm_start': False}


In [None]:
# Candidate values for the randomized hyperparameter search.
# (Only a random sample of these combinations is tried, not the full grid.)

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))

# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; 'auto'
# was deprecated in scikit-learn 1.1 and removed in 1.3, while the float form
# is accepted by old and new versions alike.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample (True) or on all rows (False)
bootstrap = [True, False]

# Assemble the search space, keyed by RandomForestRegressor parameter name
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Randomized search over `random_grid`: 100 sampled candidates, each scored
# with 3-fold cross-validation on the training split, using all available cores.
#
# The search uses its own base estimator rather than `model_rf`. `model_rf` is
# built with oob_score=True, and scikit-learn rejects oob_score=True combined
# with bootstrap=False. Because `random_grid` samples bootstrap=False, every
# such candidate would fail to fit and be scored as NaN (the default
# error_score), silently wasting that share of the search budget.
search_base_rf = RandomForestRegressor(random_state=100)
rf_random = RandomizedSearchCV(estimator=search_base_rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 22.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=500,
                              

In [None]:
# Show the hyperparameter combination that achieved the best mean
# cross-validated score in the randomized search
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [None]:
# Evaluate the tuned search object on both splits.
# For each split, two numbers are printed: the RMSE, then the R2 score
# (train split first, test split second).
pred_train_rf = rf_random.predict(X_train)
pred_test_rf = rf_random.predict(X_test)

for actual, predicted in ((y_train, pred_train_rf), (y_test, pred_test_rf)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

13.630730122322543
0.8225072666473775
19.400329044407712
0.6721023652345232
