In [160]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder



In [163]:


file_path = 'Datasets/matches_2008-2024 2.csv'

matches_df = pd.read_csv(file_path)

deliveryDF = pd.read_csv('Datasets/deliveries_2008-2024.csv')
matches_df.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2008,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2008,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2008,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2008,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2008,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


In [164]:
matches_df.describe()

Unnamed: 0,id,season,result_margin,target_runs,target_overs
count,1095.0,1095.0,1076.0,1092.0,1092.0
mean,904828.3,2016.126027,17.259294,165.684066,19.759341
std,367740.2,4.94694,21.787444,33.427048,1.581108
min,335982.0,2008.0,1.0,43.0,5.0
25%,548331.5,2012.0,6.0,146.0,20.0
50%,980961.0,2016.0,8.0,166.0,20.0
75%,1254062.0,2021.0,20.0,187.0,20.0
max,1426312.0,2024.0,146.0,288.0,20.0


In [165]:
deliveryDF.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [166]:
deliveryDF.describe()

Unnamed: 0,match_id,inning,over,ball,batsman_runs,extra_runs,total_runs,is_wicket
count,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0
mean,907066.5,1.483531,9.197677,3.624486,1.265001,0.067806,1.332807,0.049632
std,367991.3,0.502643,5.683484,1.81492,1.639298,0.343265,1.626416,0.217184
min,335982.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,548334.0,1.0,4.0,2.0,0.0,0.0,0.0,0.0
50%,980967.0,1.0,9.0,4.0,1.0,0.0,1.0,0.0
75%,1254066.0,2.0,14.0,5.0,1.0,0.0,1.0,0.0
max,1426312.0,6.0,19.0,11.0,6.0,7.0,7.0,1.0


In [167]:
# Remove trailing and leading spaces from column names
deliveryDF.columns = deliveryDF.columns.str.strip()
matches_df.columns = matches_df.columns.str.strip()


In [168]:
matches_df.columns

Index(['id', 'season', 'city', 'date', 'match_type', 'player_of_match',
       'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
       'result', 'result_margin', 'target_runs', 'target_overs', 'super_over',
       'method', 'umpire1', 'umpire2'],
      dtype='object')

In [169]:
deliveryDF.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')

In [170]:

merged_df = pd.merge(deliveryDF,matches_df,left_on='match_id',right_on='id',how='left')

In [171]:
merged_df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,...,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen


In [172]:
merged_df.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder', 'id', 'season', 'city', 'date',
       'match_type', 'player_of_match', 'venue', 'team1', 'team2',
       'toss_winner', 'toss_decision', 'winner', 'result', 'result_margin',
       'target_runs', 'target_overs', 'super_over', 'method', 'umpire1',
       'umpire2'],
      dtype='object')

In [174]:
new_merged_df = merged_df[['match_id','inning','batting_team',
'bowling_team','player_dismissed','over','ball','batsman_runs','extra_runs','total_runs','venue','target_runs']]

In [175]:
new_merged_df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,player_dismissed,over,ball,batsman_runs,extra_runs,total_runs,venue,target_runs
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,1,0,1,1,M Chinnaswamy Stadium,223.0
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,2,0,0,0,M Chinnaswamy Stadium,223.0
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,3,0,1,1,M Chinnaswamy Stadium,223.0
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,4,0,0,0,M Chinnaswamy Stadium,223.0
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,5,0,0,0,M Chinnaswamy Stadium,223.0


In [177]:
new_merged_df['match_id'] = new_merged_df['match_id'].astype('int64')-335981

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_merged_df['match_id'] = new_merged_df['match_id'].astype('int64')-335981


In [178]:
new_merged_df.head(10)

Unnamed: 0,match_id,inning,batting_team,bowling_team,player_dismissed,over,ball,batsman_runs,extra_runs,total_runs,venue,target_runs
0,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,1,0,1,1,M Chinnaswamy Stadium,223.0
1,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,2,0,0,0,M Chinnaswamy Stadium,223.0
2,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,3,0,1,1,M Chinnaswamy Stadium,223.0
3,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,4,0,0,0,M Chinnaswamy Stadium,223.0
4,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,5,0,0,0,M Chinnaswamy Stadium,223.0
5,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,6,0,0,0,M Chinnaswamy Stadium,223.0
6,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,0,7,0,1,1,M Chinnaswamy Stadium,223.0
7,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,1,1,0,0,0,M Chinnaswamy Stadium,223.0
8,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,1,2,4,0,4,M Chinnaswamy Stadium,223.0
9,1,1,Kolkata Knight Riders,Royal Challengers Bangalore,,1,3,4,0,4,M Chinnaswamy Stadium,223.0


In [179]:
print(new_merged_df.isnull().sum())


match_id              0
inning                0
batting_team          0
bowling_team          0
player_dismissed      0
over                  0
ball                  0
batsman_runs          0
extra_runs            0
total_runs            0
venue                 0
target_runs         309
dtype: int64


In [180]:

# wickets by counting non-null values in 'players_dismissed' within 
# each 'match_id' group
new_merged_df['wickets'] = new_merged_df.groupby('match_id')['player_dismissed'].transform(lambda x: x.notnull().cumsum())

# overs as a floating point (e.g., 0.1, 0.2...) based on 'over' and 'ball'

new_merged_df['overs'] = new_merged_df['over'] + (new_merged_df['ball'] - 1)*0.1

# cumulative runs and total score for each match
new_merged_df['total'] = new_merged_df.groupby('match_id')['total_runs'].cumsum()

# last 5 overs runs and wickets using a rolling window of 30 deliveries
new_merged_df['runs_last_5'] = new_merged_df.groupby('match_id')['total_runs'].transform(
    lambda x: x.rolling(30, min_periods=1).sum()
)

# For wickets, we take the last cumulative count within each rolling window
new_merged_df['wickets_last_5'] = new_merged_df.groupby('match_id')['wickets'].transform(
    lambda x: x.rolling(30, min_periods=1).apply(
        lambda y: y.iloc[-1], raw=False)
)




# Selecting final columns for the dataset
final_dataset = new_merged_df[
    ['match_id','inning' ,'venue', 'batting_team', 'bowling_team', 'total_runs',
        'wickets', 'overs', 'runs_last_5', 'wickets_last_5','target_runs']
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_merged_df['wickets'] = new_merged_df.groupby('match_id')['player_dismissed'].transform(lambda x: x.notnull().cumsum())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_merged_df['overs'] = new_merged_df['over'] + (new_merged_df['ball'] - 1)*0.1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n

In [181]:
final_dataset.head(10)

Unnamed: 0,match_id,inning,venue,batting_team,bowling_team,total_runs,wickets,overs,runs_last_5,wickets_last_5,target_runs
0,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,0.0,1.0,1.0,223.0
1,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0.1,1.0,2.0,223.0
2,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,1,3,0.2,2.0,3.0,223.0
3,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0.3,2.0,4.0,223.0
4,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,0.4,2.0,5.0,223.0
5,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,0,6,0.5,2.0,6.0,223.0
6,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,1,7,0.6,3.0,7.0,223.0
7,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,0,8,1.0,3.0,8.0,223.0
8,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,4,9,1.1,7.0,9.0,223.0
9,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,4,10,1.2,11.0,10.0,223.0


Feature Engineering 

In [184]:
first_innings = final_dataset[final_dataset['inning'] == 1].groupby('match_id')['total_runs'].sum().reset_index()
first_innings.rename(columns={'total_runs': 'target_runs'}, inplace=True)

In [185]:
first_innings

Unnamed: 0,match_id,target_runs
0,1,222
1,2,240
2,3,129
3,4,165
4,5,110
...,...,...
1090,1090326,214
1091,1090328,159
1092,1090329,172
1093,1090330,175


KeyError: 'venue'

In [187]:
first_innings

Unnamed: 0,match_id,target_runs
0,1,222
1,2,240
2,3,129
3,4,165
4,5,110
...,...,...
1090,1090326,214
1091,1090328,159
1092,1090329,172
1093,1090330,175


In [188]:
first_innings

Unnamed: 0,match_id,target_runs
0,1,222
1,2,240
2,3,129
3,4,165
4,5,110
...,...,...
1090,1090326,214
1091,1090328,159
1092,1090329,172
1093,1090330,175


In [146]:
first_innings.rename(columns={'total_runs': 'target_runs'}, inplace=True)


Train-Test Split 

In [191]:
%pip install scikit-learn
from sklearn.model_selection import train_test_split

#Splitting match-level data

# Prepare data for training
X = first_innings.drop(columns=['target_runs'])
y = first_innings['target_runs']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


5191.70s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [196]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}



In [197]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np


grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))


KeyboardInterrupt: 