In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_backup = pd.read_csv('./deliveries.csv')
data = data_backup.copy()
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [3]:
prev_data = pd.read_csv('./previous_data_per.csv')
prev_data['%'] = prev_data['Won'] / prev_data['Mat']
prev_data['%'] = prev_data['%'].apply(lambda x: round(x, 2))

In [4]:
stadium = pd.read_csv('./stadium_details.csv')

In [5]:
original_df = data
# Convert 'start_date' to datetime if it's not already in datetime format
original_df['start_date'] = pd.to_datetime(original_df['start_date'])

# Group by match_id, innings, and venue
grouped_df = original_df.groupby(['match_id', 'innings', 'venue'])

# Calculate the required statistics for each group
result_df = grouped_df.agg({
    'batting_team': 'first',
    'bowling_team': 'first',
    'wides': 'sum',
    'noballs': 'sum',
    'byes': 'sum',
    'legbyes': 'sum',
    'runs_off_bat': 'sum',
    'ball': 'max',
    'wicket_type': lambda x: x.notnull().sum(),
    'start_date': 'first'
}).reset_index()

result_df.columns = ['match_id', 'innings', 'venue', 'batting_team', 'bowling_team',
                      'total_wides', 'total_noballs', 'total_byes', 'total_legbyes', 'total_runs_per_innings_match',
                       'last_ball', 'total_wickets', 'date']

result_df['total_overs_played'] = result_df['last_ball'].apply(lambda x: min((int(x) + round((x % 1) * 10, 4) / 6), 50.0))

result_df[['total_wides', 'total_noballs', 'total_byes', 'total_legbyes']] = result_df[['total_wides', 'total_noballs', 'total_byes', 'total_legbyes']].astype(int)

result_df['total_runs_per_innings_match'] += result_df['total_wides'] + result_df['total_noballs'] + result_df['total_byes'] + result_df['total_legbyes']

result_df.drop(['last_ball'], axis=1, inplace=True)

In [6]:
result_df.head()

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_byes,total_legbyes,total_runs_per_innings_match,total_wickets,date,total_overs_played
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,0,0,282,9,2023-10-05,50.0
1,1,2,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,3,0,4,1,283,1,2023-10-05,36.333333
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,0,0,286,10,2023-10-06,49.0
3,2,2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,9,0,0,0,205,10,2023-10-06,41.166667
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,0,0,156,10,2023-10-07,37.333333


In [7]:
result_df = pd.merge(result_df, stadium, on='venue', how='left')

In [8]:
final_df = result_df[result_df['innings'] == 2]

In [9]:
#extract run for innings 1 from result_df and add it to final_df
first_innings = result_df[result_df['innings'] == 1]
first_innings = first_innings[['match_id', 'total_runs_per_innings_match']]
first_innings.columns = ['match_id', 'total_runs_in_innings1']
final_df = pd.merge(final_df, first_innings, on='match_id')
final_df.head()

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_byes,total_legbyes,total_runs_per_innings_match,total_wickets,date,total_overs_played,won_after_bat_first,won_after_chase,first_inning_score,second_inning_score,total_runs_in_innings1
0,1,2,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,3,0,4,1,283,1,2023-10-05,36.333333,17,15,237,208,282
1,2,2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,9,0,0,0,205,10,2023-10-06,41.166667,6,4,296,261,286
2,3,2,"Himachal Pradesh Cricket Association Stadium, ...",Bangladesh,Afghanistan,4,1,0,3,158,4,2023-10-07,34.666667,4,5,253,228,156
3,4,2,"Arun Jaitley Stadium, Delhi",Sri Lanka,South Africa,14,1,0,5,326,10,2023-10-07,44.833333,16,16,239,208,428
4,5,2,"MA Chidambaram Stadium, Chepauk, Chennai",India,Australia,5,1,1,1,201,4,2023-10-08,41.333333,18,20,229,208,199


In [10]:
final_df.drop(['match_id','innings','total_wides','total_noballs','total_noballs','total_byes','total_legbyes','date'],axis=1,inplace=True)

In [11]:
final_df.head()

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_wickets,total_overs_played,won_after_bat_first,won_after_chase,first_inning_score,second_inning_score,total_runs_in_innings1
0,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,283,1,36.333333,17,15,237,208,282
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,205,10,41.166667,6,4,296,261,286
2,"Himachal Pradesh Cricket Association Stadium, ...",Bangladesh,Afghanistan,158,4,34.666667,4,5,253,228,156
3,"Arun Jaitley Stadium, Delhi",Sri Lanka,South Africa,326,10,44.833333,16,16,239,208,428
4,"MA Chidambaram Stadium, Chepauk, Chennai",India,Australia,201,4,41.333333,18,20,229,208,199


In [12]:
final_df = pd.merge(final_df,prev_data,left_on=['batting_team','bowling_team'],right_on=['Team','Opposition'],how='left')
final_df.drop(['Team','Opposition','Mat','Won','Lost','won_after_bat_first','won_after_chase'],axis=1,inplace=True)
final_df['%'].fillna(0.50,inplace=True)
final_df.head()

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_wickets,total_overs_played,first_inning_score,second_inning_score,total_runs_in_innings1,%
0,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,283,1,36.333333,237,208,282,0.47
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,205,10,41.166667,296,261,286,0.5
2,"Himachal Pradesh Cricket Association Stadium, ...",Bangladesh,Afghanistan,158,4,34.666667,253,228,156,0.62
3,"Arun Jaitley Stadium, Delhi",Sri Lanka,South Africa,326,10,44.833333,239,208,428,0.41
4,"MA Chidambaram Stadium, Chepauk, Chennai",India,Australia,201,4,41.333333,229,208,199,0.38


In [13]:
final_df.drop(['total_wickets'],axis=1,inplace=True)
final_df.head()

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,first_inning_score,second_inning_score,total_runs_in_innings1,%
0,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,283,36.333333,237,208,282,0.47
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,205,41.166667,296,261,286,0.5
2,"Himachal Pradesh Cricket Association Stadium, ...",Bangladesh,Afghanistan,158,34.666667,253,228,156,0.62
3,"Arun Jaitley Stadium, Delhi",Sri Lanka,South Africa,326,44.833333,239,208,428,0.41
4,"MA Chidambaram Stadium, Chepauk, Chennai",India,Australia,201,41.333333,229,208,199,0.38


In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

mapping = {}
categorical_columns = ['venue','batting_team','bowling_team']

for column in categorical_columns:
    final_df[column] = le.fit_transform(final_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [15]:
mapping

{'venue': {'Arun Jaitley Stadium, Delhi': 0,
  'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 1,
  'Eden Gardens, Kolkata': 2,
  'Himachal Pradesh Cricket Association Stadium, Dharamsala': 3,
  'M Chinnaswamy Stadium, Bengaluru': 4,
  'MA Chidambaram Stadium, Chepauk, Chennai': 5,
  'Maharashtra Cricket Association Stadium, Pune': 6,
  'Narendra Modi Stadium, Ahmedabad': 7,
  'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 8,
  'Wankhede Stadium, Mumbai': 9},
 'batting_team': {'Afghanistan': 0,
  'Australia': 1,
  'Bangladesh': 2,
  'England': 3,
  'India': 4,
  'Netherlands': 5,
  'New Zealand': 6,
  'Pakistan': 7,
  'South Africa': 8,
  'Sri Lanka': 9},
 'bowling_team': {'Afghanistan': 0,
  'Australia': 1,
  'Bangladesh': 2,
  'England': 3,
  'India': 4,
  'Netherlands': 5,
  'New Zealand': 6,
  'Pakistan': 7,
  'South Africa': 8,
  'Sri Lanka': 9}}

In [16]:
X=final_df.drop('total_runs_per_innings_match',axis='columns')
y=final_df['total_runs_per_innings_match']

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

X_train[['total_overs_played','first_inning_score','second_inning_score','total_runs_in_innings1']] = scaler.fit_transform(X_train[['total_overs_played','first_inning_score','second_inning_score','total_runs_in_innings1']])
X_test[['total_overs_played','first_inning_score','second_inning_score','total_runs_in_innings1']] = scaler.transform(X_test[['total_overs_played','first_inning_score','second_inning_score','total_runs_in_innings1']])

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((22, 8), (10, 8), (22,), (10,))

In [23]:
import joblib
joblib.dump(scaler, 'scaler_inning2.pkl')

['scaler_inning2.pkl']

In [24]:
X_train.head()

Unnamed: 0,venue,batting_team,bowling_team,total_overs_played,first_inning_score,second_inning_score,total_runs_in_innings1,%
4,5,4,1,0.075552,-0.931894,-0.657453,-1.213141,0.38
16,6,4,2,0.096072,1.899828,1.642627,-0.475372,0.78
5,8,5,6,0.711678,1.597778,1.686859,0.378886,0.5
13,1,1,9,-0.663176,-1.045163,-0.79015,-1.083708,0.62
11,7,4,7,-1.258261,-0.629844,-0.657453,-1.316687,0.42


In [25]:
#apply decision tree regressor using grid search cv and hyperparameter tuning
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

dt = DecisionTreeRegressor(random_state=42)

params = {
    'max_depth': [3,4,5,6,7,8,9,10,11,12,13,14,15],
    'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10]
}

gs = GridSearchCV(dt, params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

gs.fit(X_train,y_train)

gs.best_params_

#print train and test mae

y_pred = gs.predict(X_test)
mae = mean_absolute_error(y_test,y_pred)

print("Test MAE: ",mae)

#print train mae
y_pred = gs.predict(X_train)
mae = mean_absolute_error(y_train,y_pred)

print("Train MAE: ",mae)

Test MAE:  39.2982905982906
Train MAE:  40.635586635586634


In [27]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler


model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer
    # layers.Dense(100, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear')  # Output layer with 1 unit and linear activation
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10)

model.fit(X_train, y_train, epochs=750, batch_size=4, verbose=1, callbacks=[early_stopping], validation_split=0.2)

Epoch 1/750
Epoch 2/750
Epoch 3/750
Epoch 4/750
Epoch 5/750
Epoch 6/750
Epoch 7/750
Epoch 8/750
Epoch 9/750
Epoch 10/750
Epoch 11/750
Epoch 12/750
Epoch 13/750
Epoch 14/750
Epoch 15/750
Epoch 16/750
Epoch 17/750
Epoch 18/750
Epoch 19/750
Epoch 20/750
Epoch 21/750
Epoch 22/750
Epoch 23/750
Epoch 24/750
Epoch 25/750
Epoch 26/750
Epoch 27/750
Epoch 28/750
Epoch 29/750
Epoch 30/750
Epoch 31/750
Epoch 32/750
Epoch 33/750
Epoch 34/750
Epoch 35/750
Epoch 36/750
Epoch 37/750
Epoch 38/750
Epoch 39/750
Epoch 40/750
Epoch 41/750
Epoch 42/750
Epoch 43/750
Epoch 44/750
Epoch 45/750
Epoch 46/750
Epoch 47/750
Epoch 48/750
Epoch 49/750
Epoch 50/750
Epoch 51/750
Epoch 52/750
Epoch 53/750
Epoch 54/750
Epoch 55/750
Epoch 56/750
Epoch 57/750
Epoch 58/750
Epoch 59/750
Epoch 60/750
Epoch 61/750
Epoch 62/750
Epoch 63/750
Epoch 64/750
Epoch 65/750
Epoch 66/750
Epoch 67/750
Epoch 68/750
Epoch 69/750
Epoch 70/750
Epoch 71/750
Epoch 72/750
Epoch 73/750
Epoch 74/750
Epoch 75/750
Epoch 76/750
Epoch 77/750
Epoch 78

<keras.src.callbacks.History at 0x289f2a5c9d0>

In [28]:
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=1)
print(f"Training MAE: {train_mae}")

# Evaluate on the test set
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test MAE: {test_mae}")

Training MAE: 54.48538589477539
Test MAE: 63.49773025512695


In [29]:
#implementing random forest regressor with hyperparameter tuning

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state=42)

params = {
    'n_estimators': [100,200,300,400,500],
    'max_depth': [3,4,5,6,7,8,9,10,11,12,13,14,15],
    'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10]
}

rs = RandomizedSearchCV(rf, params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

rs.fit(X_train,y_train)

print(rs.best_params_)

#print train and test mae

y_pred = rs.predict(X_test)

mae = mean_absolute_error(y_test,y_pred)

print("Test MAE: ",mae)

#print train mae

y_pred = rs.predict(X_train)

mae = mean_absolute_error(y_train,y_pred)

print("Train MAE: ",mae)

{'n_estimators': 200, 'min_samples_leaf': 4, 'max_depth': 11}
Test MAE:  38.529490367965366
Train MAE:  33.27735869118254


In [None]:
#export this rs model to pickle file
import pickle
pickle.dump(rs,open('inning_2_run.pkl','wb'))

In [30]:
#predict dummy_2

dummy_2 = pd.read_csv('./dummy_2.csv')

#everything has been done till now, just need to predict the runs
dummy_2.head()

pred = rs.predict(dummy_2)

pred

array([281.13265462, 280.32687685, 288.64614326, 275.62601375,
       288.34851826, 289.03601826, 274.79240264, 279.59831931,
       290.59788731, 281.50001573, 279.73995026, 288.93913731,
       279.80546216])