# **Step 1 — Understanding the dataset!**

In [None]:
import urllib.request

url = 'https://github.com/siddhu21/IPL-Match-Prediction-ML-/raw/main/IPL_2020.xlsx'
filename, headers =urllib.request.urlretrieve(url, filename="IPLdataset.xlsx")


# **Step 2 — Getting the data into the playground**

**Import all the libraries**

In [17]:
import os
import pandas as pd 
from google.colab import drive
from IPython.display import display, HTML 
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 **Reading the data-set**

I am using .read_csv() to read the CSV file. The path in the .read_csv() function can be relative or absolute. Here, I we are using Google Colaboratory as our playground and hence the dataset file is stored there.

In [18]:
# Importing the dataset
filepath="/content/drive/My Drive/SEAS-AU/Semester6/ML/ML Project/Codes/IPLdataset_2008_2020.csv"
df=pd.read_csv(filepath)  # dataframe of the da
display(df.head())

Unnamed: 0,id,season,city,date,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,Avg 1st Innings Score,Avg 2nd Innings Score,umpire1,umpire2,umpire3,Pitch Type
0,1,2017,Hyderabad,4/5/2017,Sunrisers Hyderabad,59,1,Royal Challengers Bangalore,54,1,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",185,175,AY Dandekar,NJ Llong,,Batting
1,2,2017,Pune,4/6/2017,Mumbai Indians,61,1,Rising Pune Supergiant,59,1,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,160,148,A Nand Kishore,S Ravi,,Batting & Spinner Friendly
2,3,2017,Rajkot,4/7/2017,Gujarat Lions,52,1,Kolkata Knight Riders,73,0,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,183,170,Nitin Menon,CK Nandan,,Batting
3,4,2017,Indore,4/8/2017,Rising Pune Supergiant,35,1,Kings XI Punjab,56,2,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,160,158,AK Chaudhary,C Shamshuddin,,Batting
4,5,2017,Bangalore,4/8/2017,Royal Challengers Bangalore,41,2,Delhi Daredevils,43,2,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,170,150,,,,Batting


# **Step 3 - Cleaning the data-set!**

**Check for Null Values**

The data that we have, contains null values in several columns.

In [19]:
# Check for Null Values
column_no=0
columns_with_nullval=[]
for column in df:  
  total_null_values = int(str(df.iloc[:, column_no].isnull().sum()))
  if total_null_values > 0:
    print( "Total null values = ", total_null_values, " \tfor column-name: ", column)
  column_no += 1

Total null values =  7  	for column-name:  city
Total null values =  62  	for column-name:  umpire1
Total null values =  62  	for column-name:  umpire2
Total null values =  694  	for column-name:  umpire3


 **Output:**

Total null values =  7    	for column-name:  city

Total null values =  62   	for column-name:  umpire1

Total null values =  62   	for column-name:  umpire2

Total null values =  694  	for column-name:  umpire3

**Inference:**

=> Find some way to insert values in the column city where value is null

=> Drop the feature umpire1, umpire2, and umpire3 since there are lots of null values. 


In [20]:
# Dropping the columns 'umpire1', 'umpire2', and 'umpire3'
df=df.drop(['umpire1', 'umpire2', 'umpire3'], axis=1)
display(df.head())

Unnamed: 0,id,season,city,date,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,Avg 1st Innings Score,Avg 2nd Innings Score,Pitch Type
0,1,2017,Hyderabad,4/5/2017,Sunrisers Hyderabad,59,1,Royal Challengers Bangalore,54,1,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",185,175,Batting
1,2,2017,Pune,4/6/2017,Mumbai Indians,61,1,Rising Pune Supergiant,59,1,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,160,148,Batting & Spinner Friendly
2,3,2017,Rajkot,4/7/2017,Gujarat Lions,52,1,Kolkata Knight Riders,73,0,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,183,170,Batting
3,4,2017,Indore,4/8/2017,Rising Pune Supergiant,35,1,Kings XI Punjab,56,2,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,160,158,Batting
4,5,2017,Bangalore,4/8/2017,Royal Challengers Bangalore,41,2,Delhi Daredevils,43,2,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,170,150,Batting


**Filling the null values in column city**

Using Imputation method to fill the null values in **city** column.

Imputation is a way to fill the missing values statistically. 



In [21]:
# getting series True for NaN values
rowidx = pd.isnull(df["city"])
 
# filtering data where city is NaN or null or empty
df[rowidx][['venue', 'city']]
 

Unnamed: 0,venue,city
460,Dubai International Cricket Stadium,
461,Dubai International Cricket Stadium,
465,Dubai International Cricket Stadium,
467,Dubai International Cricket Stadium,
468,Dubai International Cricket Stadium,
473,Dubai International Cricket Stadium,
475,Dubai International Cricket Stadium,


In [22]:
# replacing the null values with "Dubai" 
# since all the stadium values with null venues
# is "Dubai International Cricket Stadium"
df.loc[rowidx,'city'] = "Dubai"


**Renaming the team names**
 
 Deccan Chargers (DC) was renamed to Sunrisers Hyderabad (SRH) in 2012 [[1]](https://en.wikipedia.org/wiki/Deccan_Chargers)

 Delhi Daredevils (DD) was renamed to Delhi Capitals (DC)  [[2]](https://en.wikipedia.org/wiki/Delhi_Capitals)

Kings XI Punjab was renamed to Punjab Kings (but still KXIP) in 2021
[[3]]( https://en.wikipedia.org/wiki/Punjab_Kings)

In [23]:
IPL_teams=list(set(df['team1']).union(set(df['team2'])))
IPL_teams.sort()
print("IPL TEAMS")
for team_name in IPL_teams:
  print(team_name)


IPL TEAMS
Chennai Super Kings
Deccan Chargers
Delhi Capitals
Delhi Daredevils
Gujarat Lions
Kings XI Punjab
Kochi Tuskers Kerala
Kolkata Knight Riders
Mumbai Indians
Pune Warriors
Rajasthan Royals
Rising Pune Supergiant
Royal Challengers Bangalore
Sunrisers Hyderabad


In [24]:
# Replacing all the team names with its abbrevations 
# to reduce space occupied by unwanted lengthy team names
df.replace("Chennai Super Kings", "CSK", inplace=True)
df.replace("Deccan Chargers", "SRH", inplace=True)
df.replace("Delhi Capitals", "DC", inplace=True)
df.replace("Delhi Daredevils", "DC", inplace=True)
df.replace("Gujarat Lions", "GL", inplace=True)
df.replace("Kings XI Punjab", "KXIP", inplace=True)
df.replace("Kochi Tuskers Kerala", "KTK", inplace=True)
df.replace("Kolkata Knight Riders", "KKR", inplace=True)
df.replace("Mumbai Indians", "MI", inplace=True)
df.replace("Pune Warriors", "PW", inplace=True)
df.replace("Rajasthan Royals", "RR", inplace=True)
df.replace("Rising Pune Supergiant", "RPS", inplace=True)
df.replace("Royal Challengers Bangalore", "RCB", inplace=True)
df.replace("Sunrisers Hyderabad", "SRH", inplace=True)
display(df.head())

Unnamed: 0,id,season,city,date,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,Avg 1st Innings Score,Avg 2nd Innings Score,Pitch Type
0,1,2017,Hyderabad,4/5/2017,SRH,59,1,RCB,54,1,RCB,field,normal,0,SRH,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",185,175,Batting
1,2,2017,Pune,4/6/2017,MI,61,1,RPS,59,1,RPS,field,normal,0,RPS,0,7,SPD Smith,Maharashtra Cricket Association Stadium,160,148,Batting & Spinner Friendly
2,3,2017,Rajkot,4/7/2017,GL,52,1,KKR,73,0,KKR,field,normal,0,KKR,0,10,CA Lynn,Saurashtra Cricket Association Stadium,183,170,Batting
3,4,2017,Indore,4/8/2017,RPS,35,1,KXIP,56,2,KXIP,field,normal,0,KXIP,0,6,GJ Maxwell,Holkar Cricket Stadium,160,158,Batting
4,5,2017,Bangalore,4/8/2017,RCB,41,2,DC,43,2,RCB,bat,normal,0,RCB,15,0,KM Jadhav,M Chinnaswamy Stadium,170,150,Batting


In [25]:
IPL_teams=list(set(df['team1']).union(set(df['team2'])))
IPL_teams.sort()
print("IPL TEAMS")
for team_name in IPL_teams:
  print(team_name)

IPL TEAMS
CSK
DC
GL
KKR
KTK
KXIP
MI
PW
RCB
RPS
RR
SRH


**Checking for the same venue, different venue names**

In [26]:
# df[['venue', 'city']].drop_duplicates(subset=['venue', 'city'], keep='last').sort_values(by='city')
df[['venue', 'city']].drop_duplicates(subset=['venue', 'city'], keep='last').sort_values(by='venue')
# print(df[['venue', 'city']].drop_duplicates(subset=['venue', 'city'], keep='last').sort_values(by='venue').values.tolist())

Unnamed: 0,venue,city
750,ACA-VDCA Stadium,Visakhapatnam
495,Barabati Stadium,Cuttack
567,Brabourne Stadium,Mumbai
149,Buffalo Park,East London
154,De Beers Diamond Oval,Kimberley
297,Dr DY Patil Sports Academy,Mumbai
625,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,Visakhapatnam
811,Dubai International Cricket Stadium,Dubai
739,Eden Gardens,Kolkata
687,Feroz Shah Kotla,Delhi


**Output:**

Redundant values found manually are as follows:

________________________________________________________________________________

**Location 1:** Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium **Visakhapatnam**

ACA-VDCA Stadium	**Visakhapatnam**

Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium **Visakhapatnam**

________________________________________________________________________________

**Location 2:** Arun Jaitley Stadium	**Delhi**

Feroz Shah Kotla	**Delhi**

Feroz Shah Kotla Ground	**Delhi**
________________________________________________________________________________

**Location 3:** M. Chinnaswamy Stadium **Bangalore**

M Chinnaswamy Stadium	**Bengaluru**

M Chinnaswamy Stadium	**Bangalore**

M. Chinnaswamy Stadium	**Bengaluru**

________________________________________________________________________________

**Location 4:** M. A. Chidambaram Stadium **Chennai**

M. A. Chidambaram Stadium	**Chennai**

MA Chidambaram Stadium, Chepauk	**Chennai**

________________________________________________________________________________

**Location 5:** Punjab Cricket Association IS Bindra Stadium **Mohali**

Punjab Cricket Association IS Bindra Stadium, Mohali  **Chandigarh**

Punjab Cricket Association IS Bindra Stadium, Mohali **Mohali**

Punjab Cricket Association Stadium, Mohali **Chandigarh**

IS Bindra Stadium	**Mohali**

________________________________________________________________________________

**Location 6:** Rajiv Gandhi International Cricket Stadium **Hyderabad**

Rajiv Gandhi International Stadium, Uppal	**Hyderabad**

Rajiv Gandhi Intl. Cricket Stadium **Hyderabad**

________________________________________________________________________________


**Replacing the same venue, different venue names**



In [27]:
# Location 1
df.replace("ACA-VDCA Stadium", "Dr. Y.S.R. ACA VDCA Cricket Stadium", inplace=True)

# Location 2
df.replace("Feroz Shah Kotla", "Arun Jaitley Stadium", inplace=True)
df.replace("Feroz Shah Kotla Ground", "Arun Jaitley Stadium", inplace=True)

# Location 3
df.replace("M Chinnaswamy Stadium", "M. Chinnaswamy Stadium", inplace=True)
df.replace("Bengaluru", "Bangalore", inplace=True)

# Location 4
df.replace("MA Chidambaram Stadium, Chepauk", "M. A. Chidambaram Stadium", inplace=True)

# Location 5
df.replace("Punjab Cricket Association IS Bindra Stadium, Mohali", "Punjab Cricket Association IS Bindra Stadium", inplace=True)
df.replace("Punjab Cricket Association IS Bindra Stadium, Mohali", "Punjab Cricket Association IS Bindra Stadium", inplace=True)
df.replace("Punjab Cricket Association Stadium, Mohali", "Punjab Cricket Association IS Bindra Stadium", inplace=True)
df.replace("IS Bindra Stadium", "Punjab Cricket Association IS Bindra Stadium", inplace=True)
df.replace("Chandigarh", "Mohali", inplace=True)

# Location 6
df.replace("Rajiv Gandhi International Stadium, Uppal", "Rajiv Gandhi International Cricket Stadium", inplace=True)
df.replace("Rajiv Gandhi Intl. Cricket Stadium", "Rajiv Gandhi International Cricket Stadium", inplace=True)

# Display the new dataframe after the replacement
# display(df.head())

Phew!! The data is clean now. We can finally start on analyzing the features (columns).

# **Step 4 — Feature Engineering**

The column values should make some sense to the computers. 
Since the computer don’t have the ability to understand and draw inference from the text, we need to encode the strings to **numeric categorical values**. 
Encoding ways:
1. Manually
2. Using *LabelEncoder()* from the Scikit-learn library


In [None]:
features=list(df.columns) 
print(features)
df.head(10)

['id', 'season', 'city', 'date', 'team1', 'Pplay T1', 'pplay twick1', 'team2', 'Pplay T2', 'pplay twick2', 'toss_winner', 'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match', 'venue', 'Avg 1st Innings Score', 'Avg 2nd Innings Score', 'Pitch Type']


Unnamed: 0,id,season,city,date,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,Avg 1st Innings Score,Avg 2nd Innings Score,Pitch Type
0,1,2017,Hyderabad,4/5/2017,SRH,59,1,RCB,54,1,RCB,field,normal,0,SRH,35,0,Yuvraj Singh,Rajiv Gandhi International Cricket Stadium,185,175,Batting
1,2,2017,Pune,4/6/2017,MI,61,1,RPS,59,1,RPS,field,normal,0,RPS,0,7,SPD Smith,Maharashtra Cricket Association Stadium,160,148,Batting & Spinner Friendly
2,3,2017,Rajkot,4/7/2017,GL,52,1,KKR,73,0,KKR,field,normal,0,KKR,0,10,CA Lynn,Saurashtra Cricket Association Stadium,183,170,Batting
3,4,2017,Indore,4/8/2017,RPS,35,1,KXIP,56,2,KXIP,field,normal,0,KXIP,0,6,GJ Maxwell,Holkar Cricket Stadium,160,158,Batting
4,5,2017,Bangalore,4/8/2017,RCB,41,2,DC,43,2,RCB,bat,normal,0,RCB,15,0,KM Jadhav,M. Chinnaswamy Stadium,170,150,Batting
5,6,2017,Hyderabad,4/9/2017,GL,40,2,SRH,59,1,SRH,field,normal,0,SRH,0,9,Rashid Khan,Rajiv Gandhi International Cricket Stadium,185,175,Batting
6,7,2017,Mumbai,4/9/2017,KKR,59,2,MI,49,0,MI,field,normal,0,MI,0,4,N Rana,Wankhede Stadium,194,182,Both
7,8,2017,Indore,4/10/2017,RCB,23,3,KXIP,62,1,RCB,bat,normal,0,KXIP,0,8,AR Patel,Holkar Cricket Stadium,160,158,Batting
8,9,2017,Pune,4/11/2017,DC,62,1,RPS,50,3,RPS,field,normal,0,DC,97,0,SV Samson,Maharashtra Cricket Association Stadium,160,148,Batting & Spinner Friendly
9,10,2017,Mumbai,4/12/2017,SRH,34,0,MI,61,2,MI,field,normal,0,MI,0,4,JJ Bumrah,Wankhede Stadium,194,182,Both


**Remove:**

'id', 'season', 'date', 'player_of_match'

**Encode: numbers**

'team1', 'team2', 'toss_winner' , 'winner', 'city', 'venue',

**Encode: boolean**

'Pitch Type'

**New field: boolean**

'winner' => 'team1_win' : team1 wins the match (1) ? else (0)

*'toss_winner'* => 'team1_toss_win' : team1 wins the toss (1) ? else (0)

 'toss_decision' => 'team1_bat' : team1 bats first (1) ? else (0)
 
 'result' => 'is_tied' : is the matched tied (1) ? or normal (0) ?

**Redundant fields:** drop first one

*'city', 'venue'*

*'toss_winner'*, 'team1_toss_win'

'toss_decision', 'team1_bat'
  
'result', 'is_tied'

**No change:**

'win_by_runs', 'win_by_wickets', 'Avg 1st Innings Score', 'Avg 2nd Innings Score', 'Pplay T1', 'pplay twick1', 'team2', 'Pplay T2', 'pplay twick2', 'dl_applied'


In [None]:
print(set(df['Pitch Type']))
print(set(df['result']))

{'Bowling', 'Batting & Spinner Friendly', 'Both', 'Batting'}
{'tie', 'normal'}


{'Bowling' => -1, 'Batting & Spinner Friendly' => -2, 'Both' => 2, 'Batting' => 1}


{'tie' => 1, 'normal' => 0}

In [None]:
# ***********************  ENCODE   ***********************  
# encode teams
IPL_teams=list(set(df['team1']).union(set(df['team2'])))
count=1  
encoding_key={}
for team_name in IPL_teams:
  encoding_key[team_name]=count
  df.replace(team_name, count, inplace=True)
  count += 1
print("Encoding for teams: ", encoding_key)

# encode city
IPL_city=list(set(df['city']))
count=1  
encoding_key={}
for city in IPL_city:
  encoding_key[city]=count
  df.replace(city, count, inplace=True)
  count += 1
print("Encoding for city: ", encoding_key)

# encode city
IPL_venue=list(set(df['venue']))
count=1  
encoding_key={}
for venue in IPL_venue:
  encoding_key[venue]=count
  df.replace(venue, count, inplace=True)
  count += 1
print("Encoding for venue: ", encoding_key)


# ***********************  ENCODE : BOOLEAN  ***********************  

# encode pitch type
df.replace("Batting & Spinner Friendly", -2, inplace=True)
df.replace("Bowling", -1, inplace=True)
df.replace("Batting", 1, inplace=True)
df.replace("Both", 2, inplace=True)

# ***********************  NEW FIELD : BOOLEAN  ***********************  

# 'winner' => 'team1_win' : team1 wins the match (1) ? else (0)
# outcome variable team1_win as a probability of team1 winning the match
df.loc[df["winner"]==df["team1"],"team1_win"]=1
df.loc[df["winner"]!=df["team1"],"team1_win"]=0

# 'toss_winner' => 'team1_toss_win' : team1 wins the toss (1) ? else (0)
#outcome variable team1_toss_win as a value of team1 winning the toss
df.loc[df["toss_winner"]==df["team1"],"team1_toss_win"]=1
df.loc[df["toss_winner"]!=df["team1"],"team1_toss_win"]=0

# 'toss_decision' => 'team1_bat' : team1 bats first (1) ? else (0)
#outcome variable team1_bat to depict if team1 bats first
df["team1_bat"]=0
df.loc[(df["team1_toss_win"]==1) & (df["toss_decision"]=="bat"),"team1_bat"]=1

# 'result' => 'is_tied' : is the matched tied (1) ? or normal (0) ?
# encode result
df.loc[df["result"]=="tie","is_tied"]=1
df.loc[df["result"]=="normal","is_tied"]=0

"""
  #encoding the numeric values
  encoder= preprocessing.LabelEncoder()

  df["city"]=encoder.fit_transform(df["city"])
  df["team1"]=encoder.fit_transform(df["team1"])
  df["team2"]=encoder.fit_transform(df["team2"])
  df["winner"]=encoder.fit_transform(df["winner"].astype(str))
  df["toss_winner"]=encoder.fit_transform(df["toss_winner"])
  df["venue"]=encoder.fit_transform(df["venue"])
"""

Encoding for teams:  {'SRH': 1, 'RR': 2, 'KKR': 3, 'MI': 4, 'RPS': 5, 'CSK': 6, 'DC': 7, 'GL': 8, 'KXIP': 9, 'PW': 10, 'RCB': 11, 'KTK': 12}
Encoding for city:  {'Mohali': 1, 'Abu Dhabi': 2, 'Kolkata': 3, 'Bangalore': 4, 'Rajkot': 5, 'East London': 6, 'Dharamsala': 7, 'Mumbai': 8, 'Delhi': 9, 'Dubai': 10, 'Pune': 11, 'Kochi': 12, 'Durban': 13, 'Hyderabad': 14, 'Cape Town': 15, 'Cuttack': 16, 'Visakhapatnam': 17, 'Kanpur': 18, 'Raipur': 19, 'Indore': 20, 'Port Elizabeth': 21, 'Kimberley': 22, 'Ahmedabad': 23, 'Centurion': 24, 'Ranchi': 25, 'Nagpur': 26, 'Johannesburg': 27, 'Jaipur': 28, 'Bloemfontein': 29, 'Chennai': 30, 'Sharjah': 31}
Encoding for venue:  {'Sheikh Zayed Stadium': 1, 'Barabati Stadium': 2, 'Green Park': 3, 'M. A. Chidambaram Stadium': 4, 'SuperSport Park': 5, 'Dr DY Patil Sports Academy': 6, 'Nehru Stadium': 7, 'Wankhede Stadium': 8, 'Arun Jaitley Stadium': 9, 'M. Chinnaswamy Stadium': 10, 'Rajiv Gandhi International Cricket Stadium': 11, 'Sawai Mansingh Stadium': 12, '

'\n  #encoding the numeric values\n  encoder= preprocessing.LabelEncoder()\n\n  df["city"]=encoder.fit_transform(df["city"])\n  df["team1"]=encoder.fit_transform(df["team1"])\n  df["team2"]=encoder.fit_transform(df["team2"])\n  df["winner"]=encoder.fit_transform(df["winner"].astype(str))\n  df["toss_winner"]=encoder.fit_transform(df["toss_winner"])\n  df["venue"]=encoder.fit_transform(df["venue"])\n'

In [30]:
print(df.venue.unique())

['Rajiv Gandhi International Cricket Stadium'
 'Maharashtra Cricket Association Stadium'
 'Saurashtra Cricket Association Stadium' 'Holkar Cricket Stadium'
 'M. Chinnaswamy Stadium' 'Wankhede Stadium' 'Eden Gardens'
 'Arun Jaitley Stadium' 'Punjab Cricket Association IS Bindra Stadium'
 'Green Park' 'Sawai Mansingh Stadium' 'M. A. Chidambaram Stadium'
 'Dr DY Patil Sports Academy' 'Newlands' "St George's Park" 'Kingsmead'
 'SuperSport Park' 'Buffalo Park' 'New Wanderers Stadium'
 'De Beers Diamond Oval' 'OUTsurance Oval' 'Brabourne Stadium'
 'Sardar Patel Stadium, Motera' 'Barabati Stadium'
 'Vidarbha Cricket Association Stadium, Jamtha'
 'Himachal Pradesh Cricket Association Stadium' 'Nehru Stadium'
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium'
 'Subrata Roy Sahara Stadium'
 'Shaheed Veer Narayan Singh International Stadium'
 'JSCA International Stadium Complex' 'Sheikh Zayed Stadium'
 'Sharjah Cricket Stadium' 'Dubai International Cricket Stadium'
 'Dr. Y.S.R. ACA VDCA Crick

In [None]:
# HOME GROUND ADVANTAGE COLUMN
# 0: NO TEAM HAS ADVANTAGE
# 1: TEAM 1 HAS ADVANTAGE
# 2: TEAM 2 HAS ADVANTAGE

# **Step 5 — Feature Selection**

**Drop redundant fields**

In [None]:
# ***********************  DROP REDUNDANT FIELDS *********************** 
# Redundant fields: drop first one
df=df.drop(columns=['city', 'toss_winner', 'winner', 'toss_decision', 'result'], axis=1)

# ***********************  DROP UNNECESSARY FIELDS *********************** 
df=df.drop(columns=['id', 'season', 'date', 'player_of_match'], axis=1)

df.head(10)


Unnamed: 0,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,dl_applied,winner,win_by_runs,win_by_wickets,venue,Avg 1st Innings Score,Avg 2nd Innings Score,Pitch Type,team1_win,team1_toss_win,team1_bat,is_tied
0,1,59,1,11,54,1,0,1,35,0,11,185,175,1,1.0,0.0,0,0.0
1,4,61,1,5,59,1,0,5,0,7,24,160,148,-2,0.0,0.0,0,0.0
2,8,52,1,3,73,0,0,3,0,10,23,183,170,1,0.0,0.0,0,0.0
3,5,35,1,9,56,2,0,9,0,6,15,160,158,1,0.0,0.0,0,0.0
4,11,41,2,7,43,2,0,11,15,0,10,170,150,1,1.0,1.0,1,0.0
5,8,40,2,1,59,1,0,1,0,9,11,185,175,1,0.0,0.0,0,0.0
6,3,59,2,4,49,0,0,4,0,4,8,194,182,2,0.0,0.0,0,0.0
7,11,23,3,9,62,1,0,9,0,8,15,160,158,1,0.0,1.0,1,0.0
8,7,62,1,5,50,3,0,7,97,0,24,160,148,-2,1.0,0.0,0,0.0
9,1,34,0,4,61,2,0,4,0,4,8,194,182,2,0.0,0.0,0,0.0


# **Step 6 - Building, Training and Testing the Model**

In [None]:
df.head()

Unnamed: 0,team1,Pplay T1,pplay twick1,team2,Pplay T2,pplay twick2,dl_applied,winner,win_by_runs,win_by_wickets,venue,Avg 1st Innings Score,Avg 2nd Innings Score,Pitch Type,team1_win,team1_toss_win,team1_bat,is_tied
0,1,59,1,11,54,1,0,1,35,0,11,185,175,1,1.0,0.0,0,0.0
1,4,61,1,5,59,1,0,5,0,7,24,160,148,-2,0.0,0.0,0,0.0
2,8,52,1,3,73,0,0,3,0,10,23,183,170,1,0.0,0.0,0,0.0
3,5,35,1,9,56,2,0,9,0,6,15,160,158,1,0.0,0.0,0,0.0
4,11,41,2,7,43,2,0,11,15,0,10,170,150,1,1.0,1.0,1,0.0


**Splitting the data-set into training and testing**

In [None]:
#Splitting the data into training and testing data and scaling it
X=df.drop(columns=['team1_win'], axis=1)
target=df[["team1_win"]]
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=0,shuffle=True)

Applying various models

In [None]:
#Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of Logistic Regression Classifier on test set: {:.4f}'.format(logreg.score(X_test, y_test)))

#Decision Tree Classifier
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(dtree.score(X_test, y_test)))

#SVM
svm=SVC()
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
print('Accuracy of SVM Classifier on test set: {:.4f}'.format(svm.score(X_test, y_test)))

#Random Forest Classifier
randomForest= RandomForestClassifier(n_estimators=100)
randomForest.fit(X_train,y_train)
y_pred = randomForest.predict(X_test)
print('Accuracy of Random Forest Classifier on test set: {:.4f}'.format(randomForest.score(X_test, y_test)))

**Overfitting:**

Index(['team1', 'Pplay T1', 'pplay twick1', 'team2', 'Pplay T2',
       'pplay twick2', 'dl_applied', 'winner', 'win_by_runs', 'win_by_wickets',
       'venue', 'Avg 1st Innings Score', 'Avg 2nd Innings Score', 'Pitch Type',
       'team1_win', 'team1_toss_win', 'team1_bat', 'is_tied'],
      dtype='object')


Accuracy of Logistic Regression Classifier on test set: 0.9632

Accuracy of Decision Tree Classifier on test set: 0.9509

Accuracy of SVM Classifier on test set: 0.8712

Accuracy of Random Forest Classifier on test set: 0.9571



# **Using K-fold Cross-Validation**:
to maximize the use of the available data for training and then testing a model.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean

# prepare the cross-validation procedure
cv = KFold(n_splits=6, random_state=1, shuffle=True)

# Logistic Regression
logreg = LogisticRegression()
scores = cross_val_score(logreg, X, target, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of Logistic Regression Classifier on test set: {:.4f}'.format(mean(scores)))

#Decision Tree Classifier
dtree=DecisionTreeClassifier()
scores = cross_val_score(dtree, X, target, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(mean(scores)))

#SVM
svm=SVC()
scores = cross_val_score(svm, X, target, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of SVM Classifier on test set: {:.4f}'.format(mean(scores)))

#Random Forest Classifier
randomForest= RandomForestClassifier(n_estimators=100)
scores = cross_val_score(randomForest, X, target, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy of Random Forest Classifier on test set: {:.4f}'.format(mean(scores)))

Accuracy of Logistic Regression Classifier on test set: 0.9396
Accuracy of Decision Tree Classifier on test set: 0.9397
Accuracy of SVM Classifier on test set: 0.8547
Accuracy of Random Forest Classifier on test set: 0.9507


**Using K-fold:** K=5

Accuracy of Logistic Regression Classifier on test set: 0.9359

Accuracy of Decision Tree Classifier on test set: 0.9335

Accuracy of SVM Classifier on test set: 0.8546

Accuracy of Random Forest Classifier on test set: 0.9507

**Using K-fold:** K=6

Accuracy of Logistic Regression Classifier on test set: 0.9396

Accuracy of Decision Tree Classifier on test set: 0.9397

Accuracy of SVM Classifier on test set: 0.8547

Accuracy of Random Forest Classifier on test set: 0.9507

**Using K-fold:** K=8

Accuracy of Logistic Regression Classifier on test set: 0.9397

Accuracy of Decision Tree Classifier on test set: 0.9299

Accuracy of SVM Classifier on test set: 0.8559

Accuracy of Random Forest Classifier on test set: 0.9557

**Using K-fold:** K=9

Accuracy of Logistic Regression Classifier on test set: 0.9408

Accuracy of Decision Tree Classifier on test set: 0.9421

Accuracy of SVM Classifier on test set: 0.8572

Accuracy of Random Forest Classifier on test set: 0.9557


**Using K-fold:** K=10

Accuracy of Logistic Regression Classifier on test set: 0.9396

Accuracy of Decision Tree Classifier on test set: 0.9298

Accuracy of SVM Classifier on test set: 0.8584

Accuracy of Random Forest Classifier on test set: 0.9544


# Extra 

In [None]:
matches=pd.read_csv("finalest_ipl.csv")

dict = {"Punjab Cricket Association Stadium, Mohali":["KXIP"], "MA Chidambaram Stadium, Chepauk":["CSK"], "Rajiv Gandhi International Stadium, Uppal": ["SRH","DC"]
        , "Feroz Shah Kotla":["DD","DC"], "M Chinnaswamy Stadium":["RCB"], "Wankhede Stadium":["MI"], "Dr DY Patil Sports Academy":["MI","DC","PWI"]
        , "Eden Gardens":["KKR"], "Sawai Mansingh Stadium":["RR"], "Newlands":["None"], "St George's Park":["None"], "De Beers Diamond Oval":["None"]
        , "Buffalo Park":["None"], "SuperSport Park":["None"], "Kingsmead":["None"], "New Wanderers Stadium":["None"], "Brabourne Stadium":["None"]
        , "Himachal Pradesh Cricket Association Stadium":["KXIP"], "Barabati Stadium": ["DC","KXIP","KKR"], "Sardar Patel Stadium, Motera": ["RR"]
        , "Vidarbha Cricket Association Stadium, Jamtha":["None"], "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium":["SRH"]
        , "Subrata Roy Sahara Stadium":["None"], "JSCA International Stadium Complex":["KKR","CSK"], "Shaheed Veer Narayan Singh International Stadium":["None"]
        , "Sheikh Zayed Stadium":["None"], "Dubai International Cricket Stadium":["None"], "Sharjah Cricket Stadium":["None"]
        , "Maharashtra Cricket Association Stadium":["KXIP","RPS","CSK"], "Punjab Cricket Association IS Bindra Stadium, Mohali":["KXIP"]
        , "Holkar Cricket Stadium":["KXIP"], "M. A. Chidambaram Stadium":["CSK"], "M. Chinnaswamy Stadium":["RCB"]
        , "Feroz Shah Kotla Ground":["DD","DC"], "ACA-VDCA Stadium":["DC","SRH","MI","RPS"], "Rajiv Gandhi Intl. Cricket Stadium":["DC","SRH"]
        , "IS Bindra Stadium":["KXIP"]
}

team1_home_advantage = []

for index, row in matches.iterrows():
    dict_team_names = dict.get(row['venue'])
    
    if(row['team1'] in dict_team_names):
        if(row['team2'] in dict_team_names):
            team1_home_advantage.append(0)
        else:
            team1_home_advantage.append(1)
    elif(row['team2'] in dict_team_names):
        team1_home_advantage.append(-1)
    else:
        team1_home_advantage.append(0)

print(len(team1_home_advantage))
from csv import writer
from csv import reader
# Open the input_file in read mode and output_file in write mode
with open('finalest_ipl.csv', 'r') as read_obj, \
        open('finalest_ipl1.csv', 'w', newline='') as write_obj:
    # Create a csv.reader object from the input file object
    csv_reader = reader(read_obj)
    # Create a csv.writer object from the output file object
    csv_writer = writer(write_obj)
    # Read each row of the input csv file as list
    i=-1
    for row in csv_reader:
        # Append the default text in the row / list
        if(i>-1):
            row.append(team1_home_advantage[i])
        
        i+=1
        # Add the updated row / list to the output file
        csv_writer.writerow(row)