In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data_backup = pd.read_csv('./deliveries.csv')
prev_data = pd.read_csv('./previous_data_per.csv')

In [4]:
prev_data['%'] = prev_data['Won'] / prev_data['Mat']

#rounding off the values

prev_data['%'] = prev_data['%'].apply(lambda x: round(x, 2))

In [5]:
data_backup.head(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,0,,,,,,,,,
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,


In [6]:
data = data_backup.copy()
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [7]:
original_df = data
# Convert 'start_date' to datetime if it's not already in datetime format
original_df['start_date'] = pd.to_datetime(original_df['start_date'])

# Group by match_id, innings, and venue
grouped_df = original_df.groupby(['match_id', 'innings', 'venue'])

# Calculate the required statistics for each group
result_df = grouped_df.agg({
    'batting_team': 'first',
    'bowling_team': 'first',
    'wides': 'sum',
    'noballs': 'sum',
    'runs_off_bat': 'sum',
    'ball': 'max',
    'wicket_type': lambda x: x.notnull().sum(),
    'start_date': 'first'
}).reset_index()

# Rename columns
result_df.columns = ['match_id', 'innings', 'venue', 'batting_team', 'bowling_team',
                      'total_wides', 'total_noballs', 'total_runs_per_innings_match',
                      'last_ball', 'total_wickets', 'start_date']

# Calculate 'total_overs_played' based on the 'ball' column
result_df['total_overs_played'] = result_df['last_ball'].apply(lambda x: min((int(x) + round((x % 1) * 10, 4) / 6), 50.0))

# Change dtype of 'total_wides' and 'total_noballs' to int
result_df[['total_wides', 'total_noballs']] = result_df[['total_wides', 'total_noballs']].astype(int)

# Add 'total_wides' and 'total_noballs' to 'total_runs_per_innings_match' per innings
result_df['total_runs_per_innings_match'] += result_df['total_wides'] + result_df['total_noballs']

In [8]:
result_df.head(5)

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_runs_per_innings_match,last_ball,total_wickets,start_date,total_overs_played
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,282,49.7,9,2023-10-05,50.0
1,1,2,"Narendra Modi Stadium, Ahmedabad",New Zealand,England,3,0,278,36.2,1,2023-10-05,36.333333
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,286,48.6,10,2023-10-06,49.0
3,2,2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Netherlands,Pakistan,9,0,205,40.7,10,2023-10-06,41.166667
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,156,37.2,10,2023-10-07,37.333333


In [9]:
result_df = result_df[result_df['innings'] == 1]

In [10]:
result_df.head(5)

Unnamed: 0,match_id,innings,venue,batting_team,bowling_team,total_wides,total_noballs,total_runs_per_innings_match,last_ball,total_wickets,start_date,total_overs_played
0,1,1,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,6,0,282,49.7,9,2023-10-05,50.0
2,2,1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,8,1,286,48.6,10,2023-10-06,49.0
4,3,1,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,8,0,156,37.2,10,2023-10-07,37.333333
6,4,1,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,21,1,427,49.7,5,2023-10-07,50.0
8,5,1,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,6,0,193,49.3,10,2023-10-08,49.5


In [11]:
result_df.drop(['match_id','innings','total_wides','total_noballs','last_ball','total_wickets','start_date'],axis=1,inplace=True)

In [12]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0
2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0
4,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333
6,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0
8,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5


In [13]:
#in this result_df add winning_prob for batting team from previous_data_per.csv
#merge if batting_team == Team and bowling_team == Opponent

result_df = pd.merge(result_df,prev_data,left_on=['batting_team','bowling_team'],right_on=['Team','Opposition'],how='left')


In [14]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,Team,Opposition,Mat,Won,Lost,%
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0,England,New Zealand,96.0,44.0,45.0,0.46
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0,,,,,,
2,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333,Afghanistan,Bangladesh,16.0,6.0,10.0,0.38
3,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0,South Africa,Sri Lanka,81.0,46.0,33.0,0.57
4,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5,Australia,India,150.0,83.0,57.0,0.55


In [15]:
result_df.head(5)

result_df.drop(['Team','Opposition','Mat','Won','Lost'],axis=1,inplace=True)
result_df['%'].fillna(0.50,inplace=True)
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,%
0,"Narendra Modi Stadium, Ahmedabad",England,New Zealand,282,50.0,0.46
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",Pakistan,Netherlands,286,49.0,0.5
2,"Himachal Pradesh Cricket Association Stadium, ...",Afghanistan,Bangladesh,156,37.333333,0.38
3,"Arun Jaitley Stadium, Delhi",South Africa,Sri Lanka,427,50.0,0.57
4,"MA Chidambaram Stadium, Chepauk, Chennai",Australia,India,193,49.5,0.55


In [16]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [17]:
mapping = {}
categorical_columns = ['venue','batting_team','bowling_team']

for column in categorical_columns:
    result_df[column] = le.fit_transform(result_df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [18]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,%
0,7,3,6,282,50.0,0.46
1,8,7,5,286,49.0,0.5
2,3,0,2,156,37.333333,0.38
3,0,8,9,427,50.0,0.57
4,5,1,4,193,49.5,0.55


In [19]:
result_df.head(5)

Unnamed: 0,venue,batting_team,bowling_team,total_runs_per_innings_match,total_overs_played,%
0,7,3,6,282,50.0,0.46
1,8,7,5,286,49.0,0.5
2,3,0,2,156,37.333333,0.38
3,0,8,9,427,50.0,0.57
4,5,1,4,193,49.5,0.55


In [20]:
X=result_df.drop('total_runs_per_innings_match',axis='columns')
y=result_df['total_runs_per_innings_match']

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42)

size_scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = size_scaler.transform(X_train)
X_test_scaled = size_scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((28, 5), (4, 5))

In [37]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.6576207909058742

In [38]:
#print test and train mae

from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test,y_pred)

print("Test MAE: ",mae)

y_pred_train = model.predict(X_train)
mae = mean_absolute_error(y_train,y_pred_train)

print("Train MAE: ",mae)

Test MAE:  38.977533240032415
Train MAE:  40.91528904429316


In [None]:
import pickle
pickle.dump(model, open('inning_1_run.pkl','wb'))

In [32]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler


model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='linear')  # Output layer with 1 unit and linear activation
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10)

model.fit(X_train, y_train, epochs=500, batch_size=1, verbose=1, callbacks=[early_stopping], validation_split=0.2)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500


<keras.src.callbacks.History at 0x262f38f0350>

In [33]:
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=1)
print(f"Training MAE: {train_mae}")

# Evaluate on the test set
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
print(f"Test MAE: {test_mae}")

Training MAE: 51.656925201416016
Test MAE: 44.803348541259766


In [32]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred



array([[287.39777],
       [282.77878],
       [222.55276],
       [294.82825]], dtype=float32)

In [33]:
y_test

29    239
15    284
24    153
17    356
Name: total_runs_per_innings_match, dtype: int64

In [34]:
#apply random forest regressor with hyperparameter tuning

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(5,30,num=6)]
min_samples_split = [2,5,10,15,100]
min_samples_leaf = [1,2,5,10]

random_grid = {'n_estimators':n_estimators,
                'max_features':max_features,
                'max_depth':max_depth,
                'min_samples_split':min_samples_split,
                'min_samples_leaf':min_samples_leaf}

rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,scoring='neg_mean_squared_error',n_iter=10,cv=5,verbose=2,random_state=42,n_jobs=1)

rf_random.fit(X_train,y_train)

print(rf_random.best_params_)

print(rf_random.best_score_)

predictions = rf_random.predict(X_test)
print(predictions)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.6s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DEVANSH\AppData\Local\Programs\Python\Python

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}
-3285.0990994599997
[314.6   297.384 228.182 347.637]


In [151]:
y_test

29    239
15    284
24    153
17    356
Name: total_runs_per_innings_match, dtype: int64