In [1]:
# ------- import libraries ------- #

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
import pickle

In [2]:
# ------- read the files ------- #

match = pd.read_csv('IPL_Matches_2008_2022.csv')
delivery = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')

In [3]:
# ------- checking the null values for 'match' dataset ------- #

match.isnull().sum()

ID                   0
City                51
Date                 0
Season               0
MatchNumber          0
Team1                0
Team2                0
Venue                0
TossWinner           0
TossDecision         0
SuperOver            4
WinningTeam          4
WonBy                0
Margin              18
method             931
Player_of_Match      4
Team1Players         0
Team2Players         0
Umpire1              0
Umpire2              0
dtype: int64

In [4]:
# ------- checking which venue's cities are null ------- #

match[match['City'].isnull()]['Venue'].value_counts()

Dubai International Cricket Stadium    33
Sharjah Cricket Stadium                18
Name: Venue, dtype: int64

In [5]:
# ------- i. extracting the city's names from the venues -------#
# ------- ii. replacing the city's null values with the names ------- #

cities = np.where(match['City'].isnull(),match['Venue'].str.split().apply(lambda x:x[0]),match['City'])
match['City'] = cities

In [6]:
# ------- i. evaluating the total runs for each match and storing it to a new dataset 'total_score_df' ------- #
# ------- ii. discarding the second innings total and only going forward with first innings total of each match ------- #

total_score_df = delivery.groupby(['ID','innings']).sum()['total_run'].reset_index()
total_score_df = total_score_df[total_score_df['innings'] == 1]

In [7]:
# ------- merging the 'total_score_df' with 'match' and storing it to a new dataset 'match_df' ------- #

match_df = match.merge(total_score_df[['ID','total_run']],left_on='ID',right_on='ID')

In [8]:
# ------- checking the list of all teams in current dataset ------- #

match_df['Team1'].unique()

array(['Rajasthan Royals', 'Royal Challengers Bangalore',
       'Sunrisers Hyderabad', 'Delhi Capitals', 'Chennai Super Kings',
       'Gujarat Titans', 'Lucknow Super Giants', 'Kolkata Knight Riders',
       'Punjab Kings', 'Mumbai Indians', 'Kings XI Punjab',
       'Delhi Daredevils', 'Rising Pune Supergiant', 'Gujarat Lions',
       'Rising Pune Supergiants', 'Pune Warriors', 'Deccan Chargers',
       'Kochi Tuskers Kerala'], dtype=object)

In [9]:
# ------- creating a list of current playing teams in ipl ------- #

teams = [
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
    'Chennai Super Kings',
    'Kolkata Knight Riders',
    'Punjab Kings',
    'Mumbai Indians'
]

In [10]:
# ------- replacing the old team names with the new team names in the dataset ------- #

match_df['Team1'] = match_df['Team1'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['Team2'] = match_df['Team2'].str.replace('Delhi Daredevils','Delhi Capitals')

match_df['Team1'] = match_df['Team1'].str.replace('Kings XI Punjab','Punjab Kings')
match_df['Team2'] = match_df['Team2'].str.replace('Kings XI Punjab','Punjab Kings')

match_df['Team1'] = match_df['Team1'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
match_df['Team2'] = match_df['Team2'].str.replace('Deccan Chargers','Sunrisers Hyderabad')

In [11]:
# ------- discarding the past ipl teams and going forward with the current 8 ipl teams ------- #

match_df = match_df[match_df['Team1'].isin(teams)]
match_df = match_df[match_df['Team2'].isin(teams)]

In [12]:
# ------- checking the d/l method matches ------- #

match_df['method'].value_counts()

D/L    15
Name: method, dtype: int64

In [13]:
# ------- discarding the d/l matches ------- #

match_df = match_df[match_df['method'] != 'D/L']
match_df.shape

(791, 21)

In [14]:
# ------- i. taking only the necessary fields from 'match_df' dataset ------- #
# ------- ii. merging that data with 'delivery' and storing it into a new dataset 'delivery_df' ------- #

match_df = match_df[['ID','City','WinningTeam','total_run','Team1','Team2']]
delivery_df = match_df.merge(delivery, on='ID')

In [15]:
# ------- i. replacing the old team names with the new team names in the new 'delivery_df' dataset ------- #
# ------- ii. creating a new feature as [BowlingTeam] as there is no bowling team in the dataset ------- #

delivery_df['BattingTeam'] = delivery_df['BattingTeam'].str.replace('Delhi Daredevils','Delhi Capitals')
delivery_df['BattingTeam'] = delivery_df['BattingTeam'].str.replace('Kings XI Punjab','Punjab Kings')
delivery_df['BattingTeam'] = delivery_df['BattingTeam'].str.replace('Deccan Chargers','Sunrisers Hyderabad')

delivery_df.loc[delivery_df['BattingTeam'] == delivery_df['Team1'], 'BowlingTeam'] = delivery_df['Team2']
delivery_df.loc[delivery_df['BattingTeam'] == delivery_df['Team2'], 'BowlingTeam'] = delivery_df['Team1']

delivery_df.shape

(189350, 23)

In [16]:
# ------- discarding the second innings and only going forward with the first innings data ------- #

delivery_df = delivery_df[delivery_df['innings'] == 1]
delivery_df.shape

(97607, 23)

In [17]:
# ------- checking all the columns of the new dataset ------- #

columns = delivery_df.columns.tolist()
for column in columns:
    print(column)

ID
City
WinningTeam
total_run_x
Team1
Team2
innings
overs
ballnumber
batter
bowler
non-striker
extra_type
batsman_run
extras_run
total_run_y
non_boundary
isWicketDelivery
player_out
kind
fielders_involved
BattingTeam
BowlingTeam


In [18]:
# ------- evaluating the current score ------- #
delivery_df['current_score'] = delivery_df.groupby('ID').cumsum()['total_run_y']

# ------- evaluating how many balls are left in the innings ------- #
delivery_df['balls_left'] = 120 - (delivery_df['overs']*6 + delivery_df['ballnumber'])

# ------- evaluating how many wickets are left in the innings ------- #
wickets = delivery_df.groupby('ID').cumsum()['isWicketDelivery'].values
delivery_df['wickets_left'] = 10 - wickets

# ------- evaluating the current run rate ------- #
delivery_df['crr'] = (delivery_df['current_score'] / (120 - delivery_df['balls_left'])) * 6

# ------- evaluating (i)last 5 overs runs and (ii)last 5 overs wickets fall ------- #
groups = delivery_df.groupby('ID')
match_ids = delivery_df['ID'].unique()
last_five_runs = []
last_five_wickets = []
for id in match_ids:
    last_five_runs.extend(groups.get_group(id).rolling(window=30).sum()['total_run_y'].values.tolist())
    last_five_wickets.extend(groups.get_group(id).rolling(window=30).sum()['isWicketDelivery'].values.tolist())
delivery_df['last_five_runs'] = last_five_runs
delivery_df['last_five_wickets'] = last_five_wickets

In [19]:
# ------- selecting only the necessary features and storing it to a new dataset 'final_df' ------- #

final_df = delivery_df[['BattingTeam','BowlingTeam','City','current_score','balls_left','wickets_left','crr','last_five_runs','last_five_wickets','total_run_x']]

In [20]:
# ------- removing the missing/null values from the 'final_df' ------- #

final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
# ------- discarding the data where [balls_left] = 0, else it will cause error while evaluating crr ------- #

final_df = final_df[final_df['balls_left'] != 0]
final_df.shape

(73926, 10)

In [22]:
# ------- randomizing the data, otherwise model will become biased ------- #

final_df = final_df.sample(final_df.shape[0])
final_df

Unnamed: 0,BattingTeam,BowlingTeam,City,current_score,balls_left,wickets_left,crr,last_five_runs,last_five_wickets,total_run_x
107274,Kolkata Knight Riders,Royal Challengers Bangalore,Sharjah,131,13,4,7.345794,35.0,2.0,150
42005,Mumbai Indians,Sunrisers Hyderabad,Mumbai,91,48,8,7.583333,38.0,1.0,162
44993,Royal Challengers Bangalore,Chennai Super Kings,Bengaluru,156,3,4,8.000000,45.0,3.0,161
14937,Rajasthan Royals,Royal Challengers Bangalore,Dubai,93,56,9,8.718750,37.0,1.0,149
152411,Delhi Capitals,Punjab Kings,Delhi,45,85,7,7.714286,40.0,2.0,111
...,...,...,...,...,...,...,...,...,...,...
26047,Mumbai Indians,Delhi Capitals,Dubai,106,38,6,7.756098,25.0,2.0,200
167360,Punjab Kings,Mumbai Indians,Centurion,117,2,1,5.949153,25.0,2.0,119
8002,Mumbai Indians,Royal Challengers Bangalore,Pune,62,59,5,6.098361,20.0,5.0,151
178378,Rajasthan Royals,Chennai Super Kings,Chennai,139,43,9,10.831169,49.0,1.0,211


In [23]:
# ------- i. taking all columns except [total_run_x] as model features ------- #
# ------- ii. taking [total_run_x] as model label ------- #
# ------- iii. splitting train and test dataset ------- #

x = final_df.drop(columns=['total_run_x'])
y = final_df['total_run_x']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [24]:
# ------- i. transforming the string columns ------- #
# ------- ii. creating the pipeline object ------- #

trf = ColumnTransformer([('trf',OneHotEncoder(sparse=False,drop='first'),['BattingTeam','BowlingTeam','City'])],remainder='passthrough')
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])

In [25]:
# ------- i. fitting the training data to model ------- #
# ------- ii. testing the prediction of the model on test data ------- #
# ------- iii. checking the efficiency of the model ------- #

pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.9578058989584746
3.870549237257272


In [26]:
# ------- dumping the pipeline object into a pkl file  ------- #

pickle.dump(pipe,open('score_pred.pkl','wb'))

In [27]:
# ------- checking the list of all cities in current dataset ------- #

final_df['City'].unique()

array(['Sharjah', 'Mumbai', 'Bengaluru', 'Dubai', 'Delhi', 'Chandigarh',
       'Durban', 'East London', 'Navi Mumbai', 'Ahmedabad', 'Hyderabad',
       'Port Elizabeth', 'Kolkata', 'Dharamsala', 'Chennai', 'Jaipur',
       'Bangalore', 'Abu Dhabi', 'Raipur', 'Kimberley', 'Pune',
       'Visakhapatnam', 'Indore', 'Centurion', 'Ranchi', 'Johannesburg',
       'Cape Town', 'Cuttack', 'Nagpur', 'Bloemfontein'], dtype=object)