In [1]:
# Data Preprocessing
import pandas as pd
import pickle

# Load the CSV file into a DataFrame
scoring_data = pd.read_csv('deliveries.csv')
matches_data=pd.read_csv('matches.csv')

#renaming the id column inorder to perform merge operation
matches_data=matches_data.rename(columns={'id':'match_id'})

matches_data.head()

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


In [2]:
#merging two datasets
cricket_data = pd.merge(scoring_data, matches_data, on='match_id', how='left')

In [3]:
# Convert 'over' and 'ball' to a single count for easier calculation
cricket_data['ball_count'] = cricket_data['over'] * 6 + cricket_data['ball']

# Sorting the data to ensure the rolling operation respects the match and inning progression
cricket_data.sort_values(by=['match_id', 'inning', 'ball_count'], inplace=True)

# Reset the index to ensure proper rolling calculations
cricket_data.reset_index(drop=True, inplace=True)

# Calculate rolling totals for both runs and wickets, using the last 30 balls
cricket_data['score_last_30_balls'] = cricket_data.groupby(['match_id', 'inning'])['total_runs'].transform(
    lambda x: x.rolling(window=30, min_periods=1).sum())

cricket_data['wickets_last_30_balls'] = cricket_data.groupby(['match_id', 'inning'])['is_wicket'].transform(
    lambda x: x.rolling(window=30, min_periods=1).sum())

# Calculate the cumulative score
cricket_data['cumulative_score'] = cricket_data.groupby(['match_id', 'inning'])['total_runs'].cumsum()

# Adjust the first 5 overs to just cumulative sum up to that ball
mask = cricket_data['over'] < 5
cricket_data.loc[mask, 'score_last_30_balls'] = cricket_data.loc[mask].groupby(['match_id', 'inning'])['total_runs'].cumsum()
cricket_data.loc[mask, 'wickets_last_30_balls'] = cricket_data.loc[mask].groupby(['match_id', 'inning'])['is_wicket'].cumsum()
# Calculate cumulative wickets
cricket_data['cumulative_wickets'] = cricket_data.groupby(['match_id', 'inning'])['is_wicket'].cumsum()


In [4]:
# Calculate the cumulative total score for each ball in the innings
cricket_data['cumulative_score'] = cricket_data.groupby(['match_id', 'inning'])['total_runs'].cumsum()

# Calculate the final total score for each innings and merge this as 'total_score' for each delivery
final_scores = cricket_data.groupby(['match_id', 'inning'])['cumulative_score'].max().reset_index()
final_scores.rename(columns={'cumulative_score': 'total_score'}, inplace=True)
cricket_data = pd.merge(cricket_data, final_scores, on=['match_id', 'inning'], how='left')


In [5]:
cricket_data.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder', 'season', 'city', 'date', 'match_type',
       'player_of_match', 'venue', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'winner', 'result', 'result_margin', 'target_runs',
       'target_overs', 'super_over', 'method', 'umpire1', 'umpire2',
       'ball_count', 'score_last_30_balls', 'wickets_last_30_balls',
       'cumulative_score', 'cumulative_wickets', 'total_score'],
      dtype='object')

In [6]:
#Removing Unwanted Columns from the dataframe
columns= ['fielder','dismissal_kind','player_dismissed','is_wicket','non_striker','extras_type','bowler','batter','match_id','batsman_runs', 'extra_runs',
'extras_type','is_wicket', 'player_dismissed','dismissal_kind', 'fielder', 'season','toss_winner','result','super_over', 'method', 'umpire1', 'umpire2',
'target_overs','venue','player_of_match','team1', 'team2','result_margin', 'target_runs','total_runs','match_type','ball_count','winner','ball']
cricket_data.drop(columns=columns, axis=1, inplace=True)

In [7]:
cricket_data.shape

(260920, 12)

In [8]:
cricket_data.columns

Index(['inning', 'batting_team', 'bowling_team', 'over', 'city', 'date',
       'toss_decision', 'score_last_30_balls', 'wickets_last_30_balls',
       'cumulative_score', 'cumulative_wickets', 'total_score'],
      dtype='object')

In [9]:
cricket_data.isnull().sum()

inning                       0
batting_team                 0
bowling_team                 0
over                         0
city                     12397
date                         0
toss_decision                0
score_last_30_balls          0
wickets_last_30_balls        0
cumulative_score             0
cumulative_wickets           0
total_score                  0
dtype: int64

In [10]:
cricket_data.dropna(inplace=True)

In [11]:
cricket_data['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [12]:
# Punjab Kings is the new name of Kings XI Punjab & Delhi Capitals is the new name of Delhi Daredevils

cricket_data['batting_team'] = cricket_data['batting_team'].replace({
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals'
})

In [13]:
# Updating in the bowling team column

cricket_data['bowling_team'] = cricket_data['bowling_team'].replace({
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals'
})


In [14]:
#Updating the Bangalore to Bengaluru

cricket_data['city'] = cricket_data['city'].replace({
    'Bangalore': 'Bengaluru'
})


In [15]:
# Keeping only consistent teams
consistent_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Punjab Kings', 'Royal Challengers Bangalore',
                    'Delhi Capitals', 'Sunrisers Hyderabad','Gujarat Titans','Lucknow Super Giants']

In [16]:
cricket_data['city'].unique()

array(['Bengaluru', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Kochi', 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
       'Abu Dhabi', 'Rajkot', 'Kanpur', 'Sharjah', 'Dubai', 'Navi Mumbai',
       'Lucknow', 'Guwahati', 'Mohali'], dtype=object)

In [17]:
consistent_cities= ['Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai',  'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Kochi', 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
        'Rajkot', 'Kanpur', 'Bengaluru', 
       'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali']

In [18]:
cricket_data = cricket_data[(cricket_data['batting_team'].isin(consistent_teams)) & (cricket_data['bowling_team'].isin(consistent_teams)) &
 (cricket_data['city'].isin(consistent_cities))]

In [19]:
# Converting the column 'date' from string into datetime object
from datetime import datetime
cricket_data['date'] = cricket_data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [20]:
# Feature Engineering
ipl_data=pd.get_dummies(cricket_data, columns=['batting_team','bowling_team','toss_decision','city'],dtype=int)

In [21]:
# Rearranging the columns accordingly
ipl_data = ipl_data[['inning','date', 'batting_team_Chennai Super Kings', 'batting_team_Delhi Capitals',
       'batting_team_Gujarat Titans', 'batting_team_Kolkata Knight Riders',
       'batting_team_Lucknow Super Giants', 'batting_team_Mumbai Indians',
       'batting_team_Punjab Kings', 'batting_team_Rajasthan Royals',
       'batting_team_Royal Challengers Bangalore',
       'batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings',
       'bowling_team_Delhi Capitals', 'bowling_team_Gujarat Titans',
       'bowling_team_Kolkata Knight Riders',
       'bowling_team_Lucknow Super Giants', 'bowling_team_Mumbai Indians',
       'bowling_team_Punjab Kings', 'bowling_team_Rajasthan Royals',
       'bowling_team_Royal Challengers Bangalore',
       'bowling_team_Sunrisers Hyderabad',  'toss_decision_bat',
       'toss_decision_field', 'city_Ahmedabad',
       'city_Bengaluru', 'city_Chandigarh', 'city_Chennai', 'city_Cuttack',
       'city_Delhi', 'city_Dharamsala', 'city_Guwahati', 'city_Hyderabad',
       'city_Indore', 'city_Jaipur', 'city_Kolkata', 'city_Lucknow',
       'city_Mohali', 'city_Mumbai', 'city_Navi Mumbai', 'city_Pune',
       'city_Raipur', 'city_Ranchi', 'city_Visakhapatnam','over',  'score_last_30_balls',
       'wickets_last_30_balls', 'cumulative_score','cumulative_wickets', 'total_score',
       ]]

In [22]:
# Splitting the data to train and test
# Splitting the data into train and test set
x_train = ipl_data.drop(labels='total_score', axis=1)[ipl_data['date'].dt.year <= 2022]
x_test = ipl_data.drop(labels='total_score', axis=1)[ipl_data['date'].dt.year > 2022]

In [23]:
y_train = ipl_data[ipl_data['date'].dt.year <= 2022]['total_score'].values
y_test  = ipl_data[ipl_data['date'].dt.year > 2022]['total_score'].values

In [24]:
# Removing the 'date' column
x_train.drop(labels='date', axis=True, inplace=True)
x_test.drop(labels='date', axis=True, inplace=True)

## Algorithm Selection

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [26]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'RandomForestRegressor': RandomForestRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor()
}


In [27]:
results = {}
for name, model in models.items():
    scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=10)
    results[name] = np.mean(scores)

# Print the results
for name, score in results.items():
    print(f"{name}: Mean MSE: {-score:.4f}")



LinearRegression: Mean MSE: 2416876691258114560.0000
Lasso: Mean MSE: 547.4868
Ridge: Mean MSE: 543.3299
RandomForestRegressor: Mean MSE: 731.2414
DecisionTreeRegressor: Mean MSE: 1088.6408
GradientBoostingRegressor: Mean MSE: 532.8350
XGBRegressor: Mean MSE: 651.7768
AdaBoostRegressor: Mean MSE: 700.7449


In [29]:
best_model = max(results, key=results.get)
print(f"Best model is {best_model} with MSE {results[best_model]:.4f}")



Best model is GradientBoostingRegressor with MSE -532.8350


In [31]:
# Retrieve the best model and train it on the entire training dataset
final_model = models[best_model]
final_model.fit(x_train, y_train)

In [32]:
# Save the model to a file using pickle
with open('best_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)


In [34]:
x_train.columns

Index(['inning', 'batting_team_Chennai Super Kings',
       'batting_team_Delhi Capitals', 'batting_team_Gujarat Titans',
       'batting_team_Kolkata Knight Riders',
       'batting_team_Lucknow Super Giants', 'batting_team_Mumbai Indians',
       'batting_team_Punjab Kings', 'batting_team_Rajasthan Royals',
       'batting_team_Royal Challengers Bangalore',
       'batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings',
       'bowling_team_Delhi Capitals', 'bowling_team_Gujarat Titans',
       'bowling_team_Kolkata Knight Riders',
       'bowling_team_Lucknow Super Giants', 'bowling_team_Mumbai Indians',
       'bowling_team_Punjab Kings', 'bowling_team_Rajasthan Royals',
       'bowling_team_Royal Challengers Bangalore',
       'bowling_team_Sunrisers Hyderabad', 'toss_decision_bat',
       'toss_decision_field', 'city_Ahmedabad', 'city_Bengaluru',
       'city_Chandigarh', 'city_Chennai', 'city_Cuttack', 'city_Delhi',
       'city_Dharamsala', 'city_Guwahati', 'city