In [1]:
# loading pacakages for EDA
import numpy as np # Linear algebra
import pandas as pd # data processing, scv file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization library
import seaborn as sns # data visualization library

# split data into train and test sample
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Traditional Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# classification metrices for model evaluation
from sklearn.metrics import accuracy_score



In [2]:
# listing csv file which required for data analysis
match = pd.read_csv("matches.csv")
delivery = pd.read_csv("deliveries.csv")

In [3]:
# let's see some of the raws from match data
match.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [4]:
# checking shape of data for match record file
match.shape

(756, 18)

In [5]:
# let's see some of the raws from Delivery data

delivery.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,2,2,,,


In [6]:
# checking shape of data for Delivery record file
delivery.shape

(179078, 21)

In [7]:
# from delivery table we are grouping ("match_id" and "inning") column fro getting Total score
total_score_df = delivery.groupby(["match_id","inning"]).sum()["total_runs"].reset_index()

In [8]:
# Extracting total runs for inning 1 in every match
total_score_df = total_score_df[total_score_df["inning"] == 1]

In [9]:
total_score_df

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
2,2,1,184
4,3,1,183
6,4,1,163
8,5,1,157
...,...,...,...
1518,11347,1,143
1520,11412,1,136
1522,11413,1,171
1524,11414,1,155


In [10]:
# add total score in "match" dataframe 
# we get new dataframe which have total in dataset
match_df = match.merge(total_score_df[["match_id","total_runs"]],left_on="id",right_on="match_id")
match_df

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3,match_id,total_runs
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,,1,207
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,,2,184
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,,3,183
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,4,163
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,,5,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,11347,IPL-2019,Mumbai,05-05-2019,Kolkata Knight Riders,Mumbai Indians,Mumbai Indians,field,normal,0,Mumbai Indians,0,9,HH Pandya,Wankhede Stadium,Nanda Kishore,O Nandan,S Ravi,11347,143
752,11412,IPL-2019,Chennai,07-05-2019,Chennai Super Kings,Mumbai Indians,Chennai Super Kings,bat,normal,0,Mumbai Indians,0,6,AS Yadav,M. A. Chidambaram Stadium,Nigel Llong,Nitin Menon,Ian Gould,11412,136
753,11413,IPL-2019,Visakhapatnam,08-05-2019,Sunrisers Hyderabad,Delhi Capitals,Delhi Capitals,field,normal,0,Delhi Capitals,0,2,RR Pant,ACA-VDCA Stadium,,,,11413,171
754,11414,IPL-2019,Visakhapatnam,10-05-2019,Delhi Capitals,Chennai Super Kings,Chennai Super Kings,field,normal,0,Chennai Super Kings,0,6,F du Plessis,ACA-VDCA Stadium,Sundaram Ravi,Bruce Oxenford,Chettithody Shamshuddin,11414,155


In [11]:
# match_df have many teams which are not playing recently
match_df["team1"].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [12]:
# teams which are playing in present time
teams = ['Sunrisers Hyderabad',
         'Mumbai Indians',
        'Royal Challengers Bangalore',
        'Kolkata Knight Riders',
        'Kings XI Punjab',
        'Chennai Super Kings',
        'Rajasthan Royals', 
        'Delhi Capitals']

In [13]:
# replacing team's name with present name beacuse they change there name 
match_df["team1"] = match_df["team1"].str.replace("Delhi Daredevils","Delhi Capitals")
match_df["team2"] = match_df["team2"].str.replace("Delhi Daredevils","Delhi Capitals")

# replacing team's name with present name beacuse they change there name 
match_df["team1"] = match_df["team1"].str.replace("Deccan Chargers","Sunrisers Hyderabad")
match_df["team2"] = match_df["team2"].str.replace("Deccan Chargers","Sunrisers Hyderabad")


In [14]:
# Extracting data from dataset, only which teams playing in present time
match_df = match_df[match_df["team1"].isin(teams)]
match_df = match_df[match_df["team2"].isin(teams)]

# After this only we have 8 team's
match_df.shape

(641, 20)

In [15]:
# these are the team's which playing in present time
match_df["team1"].unique()

array(['Sunrisers Hyderabad', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Kings XI Punjab', 'Delhi Capitals',
       'Mumbai Indians', 'Chennai Super Kings', 'Rajasthan Royals'],
      dtype=object)

In [16]:
match_df.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3,match_id,total_runs
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,,1,207
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Capitals,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,,5,157
6,7,IPL-2017,Mumbai,09-04-2017,Kolkata Knight Riders,Mumbai Indians,Mumbai Indians,field,normal,0,Mumbai Indians,0,4,N Rana,Wankhede Stadium,Nitin Menon,CK Nandan,,7,178
7,8,IPL-2017,Indore,10-04-2017,Royal Challengers Bangalore,Kings XI Punjab,Royal Challengers Bangalore,bat,normal,0,Kings XI Punjab,0,8,AR Patel,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,8,148
9,10,IPL-2017,Mumbai,12-04-2017,Sunrisers Hyderabad,Mumbai Indians,Mumbai Indians,field,normal,0,Mumbai Indians,0,4,JJ Bumrah,Wankhede Stadium,Nitin Menon,CK Nandan,,10,158


In [17]:
# there are 15 matches which effected by rain
match_df["dl_applied"].value_counts()

0    626
1     15
Name: dl_applied, dtype: int64

In [18]:
# Removing matches which are affected by rain
match_df = match_df[match_df["dl_applied"] ==0]

In [19]:
# We need these column in delivery dataset
match_df = match_df[["match_id","city","winner","total_runs"]]

In [20]:
# Adding these column into delivery dataset
delivery_df = match_df.merge(delivery,on="match_id")

In [21]:
# now we are noving for inning 2 so for this we only need inning 2 data

delivery_df = delivery_df[delivery_df['inning']==2]

In [22]:
delivery_df.shape

(72413, 24)

In [23]:
# calculating current score of inning 2nd

delivery_df["current_score"] = delivery_df.groupby('match_id').cumsum()["total_runs_y"]

In [24]:
# calculating Needed Runs for 2nd inning 

delivery_df["runs_left"] = delivery_df["total_runs_x"] - delivery_df["current_score"]

In [25]:
# After every ball how much ball we have left
delivery_df["ball_left"] = 126 - (delivery_df["over"]*6 + delivery_df["ball"])

In [26]:
# we are replacing valye Nan --> "0" (it's mean player is not out)

delivery_df["player_dismissed"] = delivery_df["player_dismissed"].fillna("0")
delivery_df["player_dismissed"] = delivery_df["player_dismissed"].apply(lambda x:x if x=="0" else "1")
delivery_df["player_dismissed"] = delivery_df["player_dismissed"].astype('int')

# After that we are able to check how much player is out
wickets = delivery_df.groupby("match_id").cumsum()["player_dismissed"]

# creating new column which gives you info about how much wicket are left
delivery_df["wickets_left"] = 10 - wickets

In [27]:
# Adding new column in delivery_df "Current run rate" -->crr

delivery_df["crr"] = delivery_df["current_score"]/((120 - delivery_df["ball_left"])/6)

In [28]:
# Adding new column in delivery_df "Required run rate" -->rrr

delivery_df["rrr"] = delivery_df['runs_left']/(delivery_df["ball_left"]/6)

In [29]:
delivery_df

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,ball_left,wickets_left,crr,rrr
125,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,1,0,,,1,206,119,10,6.000000,10.386555
126,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,0,,,1,206,118,10,3.000000,10.474576
127,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,0,,,1,206,117,10,2.000000,10.564103
128,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,2,0,,,3,204,116,10,4.500000,10.551724
129,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,4,0,,,7,200,115,10,8.400000,10.434783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,Hyderabad,Mumbai Indians,152,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,1,0,,,152,0,4,5,7.862069,0.000000
149574,11415,Hyderabad,Mumbai Indians,152,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,2,0,,,154,-2,3,5,7.897436,-4.000000
149575,11415,Hyderabad,Mumbai Indians,152,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,1,1,run out,KH Pandya,155,-3,2,4,7.881356,-9.000000
149576,11415,Hyderabad,Mumbai Indians,152,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,2,0,,,157,-5,1,4,7.915966,-30.000000


In [30]:
# this function gives info teams win or not
def result(row):
    return 1 if row["batting_team"] == row["winner"] else 0

In [31]:
delivery_df["result"] = delivery_df.apply(result,axis=1)

In [32]:
delivery_df.head()

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,player_dismissed,dismissal_kind,fielder,current_score,runs_left,ball_left,wickets_left,crr,rrr,result
125,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,0,,,1,206,119,10,6.0,10.386555,0
126,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,,,1,206,118,10,3.0,10.474576,0
127,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,,,1,206,117,10,2.0,10.564103,0
128,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,0,,,3,204,116,10,4.5,10.551724,0
129,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,0,,,7,200,115,10,8.4,10.434783,0


In [33]:
# "Final_df" it's have only required column
final_df = delivery_df[["batting_team","bowling_team","city","runs_left","ball_left","wickets_left","total_runs_x","total_runs_y","crr","rrr","result"]]

In [34]:
final_df.isnull().sum()

batting_team      0
bowling_team      0
city            832
runs_left         0
ball_left         0
wickets_left      0
total_runs_x      0
total_runs_y      0
crr               0
rrr               7
result            0
dtype: int64

In [35]:
# Removing null value

final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [36]:
final_df = final_df[final_df["ball_left"] != 0]

In [37]:
final_df.shape

(71342, 11)

In [38]:
# making two different dataset for modling test and train

X = final_df.iloc[:,:-1]
y = final_df.iloc[:,-1]

In [39]:
# spliting dataset

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [40]:
X_train.head()

Unnamed: 0,batting_team,bowling_team,city,runs_left,ball_left,wickets_left,total_runs_x,total_runs_y,crr,rrr
46530,Delhi Daredevils,Deccan Chargers,Delhi,136,112,9,145,0,6.75,7.285714
17044,Delhi Daredevils,Royal Challengers Bangalore,Bangalore,45,57,8,154,0,10.380952,4.736842
126323,Delhi Daredevils,Kings XI Punjab,Delhi,25,12,5,157,1,7.333333,12.5
6740,Delhi Daredevils,Royal Challengers Bangalore,Delhi,124,92,9,161,4,7.928571,8.086957
79456,Rajasthan Royals,Royal Challengers Bangalore,Jaipur,142,99,9,171,0,8.285714,8.606061


In [41]:
# Transforming into numerical column

trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','city'])
],
remainder='passthrough')

In [42]:
# making pipeline for data modeling
# Applying data model

pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
])

In [43]:
# Training of data model

pipe.fit(X_train,y_train)

Pipeline(steps=[('step1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('trf',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['batting_team',
                                                   'bowling_team', 'city'])])),
                ('step2', LogisticRegression(solver='liblinear'))])

In [44]:
# After traing we are using test data for prediction
y_pred = pipe.predict(X_test)

In [45]:
# Checking accouracy of data model
accuracy_score(y_test,y_pred)

0.8046814773284743

In [46]:
# This is the probability of team1 and team2
pipe.predict_proba(X_test)[100]

array([0.40346714, 0.59653286])