In [183]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [184]:
data_backup = pd.read_csv('./deliveries.csv')
prev_data = pd.read_csv('./previous_data_per.csv')
stadium_data = pd.read_csv('./stadium_details.csv')

In [185]:
prev_data['%'] = prev_data['Won'] / prev_data['Mat']
prev_data['%'] = prev_data['%'].apply(lambda x: round(x, 2))

In [186]:
data_backup.head(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,0,,,,,,,,,
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,


In [187]:
stadium_data.head(5)

Unnamed: 0,venue,won_after_bat_first,won_after_chase,first_inning_score,second_inning_score
0,"Narendra Modi Stadium, Ahmedabad",17,15,237,208
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",6,4,296,261
2,"Himachal Pradesh Cricket Association Stadium, ...",4,5,253,228
3,"Arun Jaitley Stadium, Delhi",16,16,239,208
4,"MA Chidambaram Stadium, Chepauk, Chennai",18,20,229,208


In [188]:
data = data_backup.copy()
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [189]:
original_df = data
# Convert 'start_date' to datetime if it's not already in datetime format
original_df['start_date'] = pd.to_datetime(original_df['start_date'])

# Group by match_id, innings, and venue
grouped_df = original_df.groupby(['match_id', 'innings', 'venue'])

# Calculate the required statistics for each group
result_df = grouped_df.agg({
    'batting_team': 'first',
    'bowling_team': 'first',
    'wides': 'sum',
    'noballs': 'sum',
    'runs_off_bat': 'sum',
    'ball': 'max',
    'wicket_type': lambda x: x.notnull().sum(),
    'start_date': 'first'
}).reset_index()

# Rename columns
result_df.columns = ['match_id', 'innings', 'venue', 'batting_team', 'bowling_team',
                      'total_wides', 'total_noballs', 'total_runs_per_innings_match',
                      'last_ball', 'total_wickets', 'start_date']

# Calculate 'total_overs_played' based on the 'ball' column
result_df['total_overs_played'] = result_df['last_ball'].apply(lambda x: min((int(x) + round((x % 1) * 10, 4) / 6), 50.0))

# Change dtype of 'total_wides' and 'total_noballs' to int
result_df[['total_wides', 'total_noballs']] = result_df[['total_wides', 'total_noballs']].astype(int)

# Add 'total_wides' and 'total_noballs' to 'total_runs_per_innings_match' per innings
result_df['total_runs_per_innings_match'] += result_df['total_wides'] + result_df['total_noballs']

In [190]:
result_df.head(5)

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_runs_per_innings_match,last_ball,total_wickets,start_date,total_overs_played
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,282,49.7,9,2023-10-05,50.0
1,1,2,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,3,0,278,36.2,1,2023-10-05,36.333333
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,286,48.6,10,2023-10-06,49.0
3,2,2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,9,0,205,40.7,10,2023-10-06,41.166667
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,156,37.2,10,2023-10-07,37.333333


In [191]:
#according to the stadium and innings add avg score
result_df = pd.merge(result_df, stadium_data, on='venue', how='left')
result_df.head(5)

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_runs_per_innings_match,last_ball,total_wickets,start_date,total_overs_played,won_after_bat_first,won_after_chase,first_inning_score,second_inning_score
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,282,49.7,9,2023-10-05,50.0,17,15,237,208
1,1,2,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,3,0,278,36.2,1,2023-10-05,36.333333,17,15,237,208
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,286,48.6,10,2023-10-06,49.0,6,4,296,261
3,2,2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,9,0,205,40.7,10,2023-10-06,41.166667,6,4,296,261
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,156,37.2,10,2023-10-07,37.333333,4,5,253,228


In [192]:
result_df = result_df[result_df['innings'] == 1]

In [193]:
result_df.head(5)

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_runs_per_innings_match,last_ball,total_wickets,start_date,total_overs_played,won_after_bat_first,won_after_chase,first_inning_score,second_inning_score
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,282,49.7,9,2023-10-05,50.0,17,15,237,208
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,286,48.6,10,2023-10-06,49.0,6,4,296,261
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,156,37.2,10,2023-10-07,37.333333,4,5,253,228
6,4,1,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,21,1,427,49.7,5,2023-10-07,50.0,16,16,239,208
8,5,1,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,6,0,193,49.3,10,2023-10-08,49.5,18,20,229,208


In [194]:
result_df.drop(['match_id','innings','total_wides','total_noballs','last_ball','total_wickets','start_date','second_inning_score','won_after_bat_first','won_after_chase'],axis=1,inplace=True)

In [195]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0,237
2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0,296
4,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333,253
6,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0,239
8,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5,229


In [196]:
#in this result_df add winning_prob for batting team from previous_data_per.csv
#merge if batting_team == Team and bowling_team == Opponent
result_df = pd.merge(result_df,prev_data,left_on=['batting_team','bowling_team'],right_on=['Team','Opposition'],how='left')

In [197]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score,Team,Opposition,Mat,Won,Lost,%
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0,237,England,New Zealand,96.0,44.0,45.0,0.46
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0,296,,,,,,
2,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333,253,Afghanistan,Bangladesh,16.0,6.0,10.0,0.38
3,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0,239,South Africa,Sri Lanka,81.0,46.0,33.0,0.57
4,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5,229,Australia,India,150.0,83.0,57.0,0.55


In [198]:
result_df.head(5)

result_df.drop(['Team','Opposition','Mat','Won','Lost'],axis=1,inplace=True)
result_df['%'].fillna(0.50,inplace=True)
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score,%
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0,237,0.46
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0,296,0.5
2,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333,253,0.38
3,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0,239,0.57
4,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5,229,0.55


In [199]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [200]:
mapping = {}
categorical_columns = ['venue','batting_team','bowling_team']

for column in categorical_columns:
    result_df[column] = le.fit_transform(result_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [201]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score,%
0,7,3,6,282,50.0,237,0.46
1,8,7,5,286,49.0,296,0.5
2,3,0,2,156,37.333333,253,0.38
3,0,8,9,427,50.0,239,0.57
4,5,1,4,193,49.5,229,0.55


In [202]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score,%
0,7,3,6,282,50.0,237,0.46
1,8,7,5,286,49.0,296,0.5
2,3,0,2,156,37.333333,253,0.38
3,0,8,9,427,50.0,239,0.57
4,5,1,4,193,49.5,229,0.55


In [203]:
X=result_df.drop('total_runs_per_innings_match',axis='columns')
y=result_df['total_runs_per_innings_match']

In [204]:
X.head()

Unnamed: 0,venue,batting_team,bowling_team,total_overs_played,first_inning_score,%
0,7,3,6,50.0,237,0.46
1,8,7,5,49.0,296,0.5
2,3,0,2,37.333333,253,0.38
3,0,8,9,50.0,239,0.57
4,5,1,4,49.5,229,0.55


In [216]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42)

X_train[['total_overs_played','first_inning_score']] = scaler.fit_transform(X_train[['total_overs_played','first_inning_score']])
# X_test[['total_overs_played','first_inning_score']] = scaler.transform(X_test[['total_overs_played','first_inning_score']])

X_train.shape, X_test.shape

((28, 6), (4, 6))

In [217]:
#store standard scaler
import joblib
joblib.dump(scaler, 'scaler_inning1.pkl')

['scaler_inning1.pkl']

In [210]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

elastic = ElasticNet()

elastic_params = {'alpha':[1e-15,1e-10,1e-8,1e-4,1e-2,1,5,10,20,30,35,40,45,50,55,100],'l1_ratio':[0.1,0.3,0.5,0.7,0.9,1]}

elastic_regressor = GridSearchCV(elastic,elastic_params,scoring='neg_mean_squared_error',cv=5)

elastic_regressor.fit(X_train,y_train)

print(elastic_regressor.best_params_)
print(elastic_regressor.best_score_)

#print train and test mae for elastic net regression

from sklearn.metrics import mean_absolute_error
y_pred_train = elastic_regressor.predict(X_train)
y_pred_test = elastic_regressor.predict(X_test)

print('Train MAE:',mean_absolute_error(y_train,y_pred_train))
print('Test MAE:',mean_absolute_error(y_test,y_pred_test))

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'alpha': 0.01, 'l1_ratio': 0.1}
-4033.589945290485
Train MAE: 38.084257546625494
Test MAE: 49.63839695022443


In [133]:
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test,y_pred)

print("Test MAE: ",mae)

y_pred_train = model.predict(X_train)
mae = mean_absolute_error(y_train,y_pred_train)

print("Train MAE: ",mae)

Test MAE:  69.778076171875
Train MAE:  74.0991794041225


In [181]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler


model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer
    # layers.Dense(64, activation='relu'),
    # layers.Dense(64, activation='relu'),
    # layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear')  # Output layer with 1 unit and linear activation
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10)

model.fit(X_train, y_train, epochs=750, batch_size=4, verbose=1, callbacks=[early_stopping], validation_split=0.2)

Epoch 1/750
Epoch 2/750
Epoch 3/750
Epoch 4/750
Epoch 5/750
Epoch 6/750
Epoch 7/750
Epoch 8/750
Epoch 9/750
Epoch 10/750
Epoch 11/750
Epoch 12/750
Epoch 13/750
Epoch 14/750
Epoch 15/750
Epoch 16/750
Epoch 17/750
Epoch 18/750
Epoch 19/750
Epoch 20/750
Epoch 21/750
Epoch 22/750
Epoch 23/750
Epoch 24/750
Epoch 25/750
Epoch 26/750
Epoch 27/750
Epoch 28/750
Epoch 29/750
Epoch 30/750
Epoch 31/750
Epoch 32/750
Epoch 33/750
Epoch 34/750
Epoch 35/750
Epoch 36/750
Epoch 37/750
Epoch 38/750
Epoch 39/750
Epoch 40/750
Epoch 41/750
Epoch 42/750
Epoch 43/750
Epoch 44/750
Epoch 45/750
Epoch 46/750
Epoch 47/750
Epoch 48/750
Epoch 49/750
Epoch 50/750
Epoch 51/750
Epoch 52/750
Epoch 53/750
Epoch 54/750
Epoch 55/750
Epoch 56/750
Epoch 57/750
Epoch 58/750
Epoch 59/750
Epoch 60/750
Epoch 61/750
Epoch 62/750
Epoch 63/750
Epoch 64/750
Epoch 65/750
Epoch 66/750
Epoch 67/750
Epoch 68/750
Epoch 69/750
Epoch 70/750
Epoch 71/750
Epoch 72/750
Epoch 73/750
Epoch 74/750
Epoch 75/750
Epoch 76/750
Epoch 77/750
Epoch 78

<keras.src.callbacks.History at 0x271f95eb990>

In [182]:
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=1)
print(f"Training MAE: {train_mae}")

# Evaluate on the test set
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test MAE: {test_mae}")

Training MAE: 44.749267578125
Test MAE: 24.324642181396484


In [176]:
#export this model as pkl
import pickle
pickle.dump(model, open('inning_1_run.pkl','wb'))

In [160]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred



array([[261.93753],
       [263.98575],
       [250.54686],
       [343.10767],
       [214.84103]], dtype=float32)

In [161]:
y_test

29    239
15    284
24    153
17    356
8     267
Name: total_runs_per_innings_match, dtype: int64

In [172]:
#apply random forest regressor with hyperparameter tuning

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(5,30,num=6)]
min_samples_split = [2,5,10,15,100]
min_samples_leaf = [1,2,5,10]

random_grid = {'n_estimators':n_estimators,
                'max_features':max_features,
                'max_depth':max_depth,
                'min_samples_split':min_samples_split,
                'min_samples_leaf':min_samples_leaf}

rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,scoring='neg_mean_squared_error',n_iter=10,cv=5,verbose=2,random_state=42,n_jobs=1)

rf_random.fit(X_train,y_train)

print(rf_random.best_params_)

print(rf_random.best_score_)

predictions = rf_random.predict(X_test)
print(predictions)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}
-3111.730053638518
[324.037 288.142 231.497 341.678]


In [173]:
from sklearn.metrics import mean_absolute_error

predictions = rf_random.predict(X_train)
print("Train error: ", mean_absolute_error(y_train,predictions))

predictions = rf_random.predict(X_test)
print("Test error: ",mean_absolute_error(y_test,predictions))

Train error:  16.320821428571424
Test error:  45.4995
