In [9]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Print the installed scikit-learn version so it is recorded in the notebook output.
import sklearn
print(sklearn.__version__)

1.3.2


In [11]:
import pandas as pd
import pickle

### Importing Match Data from 2008 - 2024

In [12]:
# Load the per-match table and print its column/dtype summary.
matches_csv = '/content/drive/My Drive/hackathon/matches.csv'
matches = pd.read_csv(matches_csv)
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   object 
 17  method        

In [13]:
matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


### Choosing relevant columns

In [14]:
# Keep only the columns used by the rest of the notebook.
relevant_columns = [
    'id', 'season', 'date', 'venue',
    'team1', 'team2', 'toss_winner', 'winner', 'city',
]
matches = matches[relevant_columns]

### Identifying Columns with NULL values

In [15]:
# Missing values per column.
matches.isna().sum()

Unnamed: 0,0
id,0
season,0
date,0
venue,0
team1,0
team2,0
toss_winner,0
winner,5
city,51


### Analysing when winner is NULL



In [16]:
# Rows where no winner was recorded.
matches.loc[matches['winner'].isna()]

Unnamed: 0,id,season,date,venue,team1,team2,toss_winner,winner,city
241,501265,2011,2011-05-21,Feroz Shah Kotla,Delhi Daredevils,Pune Warriors,Delhi Daredevils,,Delhi
485,829763,2015,2015-04-29,M Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,,Bangalore
511,829813,2015,2015-05-17,M Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,,Bangalore
744,1178424,2019,2019-04-30,M.Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,,Bengaluru
994,1359519,2023,2023-05-03,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Lucknow Super Giants,Chennai Super Kings,Chennai Super Kings,,Lucknow


### Removing matches where winner is NULL

In [17]:
# Keep only matches with a recorded winner.
# `.copy()` makes the result an independent frame: later cells assign to its
# columns (`matches['city'] = ...`, `matches['team1'] = ...`), which on a
# filtered slice raises pandas' SettingWithCopyWarning.
matches = matches[matches['winner'].notnull()].copy()

### Analysing when city is NULL

City is missing only for the following two venues:

In [18]:
# Count, per venue, the matches whose city is missing.
matches.loc[matches['city'].isna(), 'venue'].value_counts()

Unnamed: 0_level_0,count
venue,Unnamed: 1_level_1
Dubai International Cricket Stadium,33
Sharjah Cricket Stadium,18


### Filling city values

In [19]:
def fill_venue(row):
    """Return the city for one match row, inferring it from the venue when missing.

    Parameters
    ----------
    row : mapping with 'city' and 'venue' keys (e.g. a DataFrame row).

    Returns
    -------
    The existing city when it is not null. When it is null, 'Dubai' or
    'Sharjah' for the two known stadiums; otherwise the original (null) city.
    """
    current_city = row['city']
    # A city that is already present is never overridden.
    if not pd.isna(current_city):
        return current_city

    known_venue_cities = {
        'Dubai International Cricket Stadium': 'Dubai',
        'Sharjah Cricket Stadium': 'Sharjah',
    }
    # Unknown venues fall back to the (null) city unchanged.
    return known_venue_cities.get(row['venue'], current_city)
# Apply fill_venue to every row and store the result as the city column.
matches['city'] = matches.apply(fill_venue, axis='columns')

### Created a dataframe without NULL values

In [20]:
# Missing values per column after cleaning.
matches.isna().sum(axis=0)

Unnamed: 0,0
id,0
season,0
date,0
venue,0
team1,0
team2,0
toss_winner,0
winner,0
city,0


In [21]:
# Distinct team names appearing in the team1 column.
teams = matches['team1'].unique()
teams

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [22]:
# Wins per team.
total_win = matches.winner.value_counts()
# Matches played per team = appearances as team1 plus appearances as team2.
# `add(..., fill_value=0)` keeps a team that appears in only one of the two
# columns; plain `+` would turn its total into NaN through index alignment.
total_matches = matches.team1.value_counts().add(matches.team2.value_counts(), fill_value=0)
# A team with no wins is absent from total_win; reindex so it gets 0 instead of NaN.
win_percentage = total_win.reindex(total_matches.index, fill_value=0) / total_matches
win_percentage.sort_values(ascending=False)

Unnamed: 0,count
Rising Pune Supergiant,0.625
Gujarat Titans,0.622222
Chennai Super Kings,0.582278
Lucknow Super Giants,0.55814
Mumbai Indians,0.551724
Delhi Capitals,0.527473
Kolkata Knight Riders,0.521912
Rajasthan Royals,0.511416
Royal Challengers Bangalore,0.489451
Sunrisers Hyderabad,0.483516


We can see that Royal Challengers Bangalore and Kings XI Punjab only had a name change, and there is not much difference in their win percentage (less than 2%). We therefore replace the old names with the new ones in the dataframe.

Other teams, such as Delhi Daredevils and Delhi Capitals, had a management change and there is a big difference in their performance, so it would not be right to treat them as the same team.

In [23]:
# Old franchise name -> current franchise name.
team_replacements = {
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Kings XI Punjab': 'Punjab Kings'
}

# Apply the renames to every column that holds a team name.
for team_column in ('team1', 'team2', 'winner'):
    matches[team_column] = matches[team_column].replace(team_replacements)

### Checking if there is any wrong entry for the winner column

Check condition : Winner must either be TEAM 1 or TEAM 2

In [24]:
# Number of rows whose winner is neither team1 nor team2.
winner_is_participant = (matches.winner == matches.team1) | (matches.winner == matches.team2)
len(matches[~winner_is_participant])

0

### Calculating team strength

### Mapping teams and their respective home ground

In [25]:
# Distinct (city, venue) combinations, keeping the first occurrence of each.
city_venue_pairs = matches.drop_duplicates(subset=['city', 'venue'])[['city', 'venue']]

In [26]:
city_venue_pairs

Unnamed: 0,city,venue
0,Bangalore,M Chinnaswamy Stadium
1,Chandigarh,"Punjab Cricket Association Stadium, Mohali"
2,Delhi,Feroz Shah Kotla
3,Mumbai,Wankhede Stadium
4,Kolkata,Eden Gardens
5,Jaipur,Sawai Mansingh Stadium
6,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
7,Chennai,"MA Chidambaram Stadium, Chepauk"
12,Mumbai,Dr DY Patil Sports Academy
58,Cape Town,Newlands


In [27]:
# Distinct team names in team1 after the renames.
teams = matches['team1'].unique()

In [28]:
teams

array(['Royal Challengers Bengaluru', 'Punjab Kings', 'Delhi Daredevils',
       'Mumbai Indians', 'Kolkata Knight Riders', 'Rajasthan Royals',
       'Deccan Chargers', 'Chennai Super Kings', 'Kochi Tuskers Kerala',
       'Pune Warriors', 'Sunrisers Hyderabad', 'Gujarat Lions',
       'Rising Pune Supergiants', 'Rising Pune Supergiant',
       'Delhi Capitals', 'Lucknow Super Giants', 'Gujarat Titans'],
      dtype=object)

In [29]:
# Map each team to the [city, venue] pairs whose city name occurs in the team
# name. 'Bangalore' is also tried as 'Bengaluru' so the renamed franchise matches.
home_stadium = {}
for team_name in teams:
    home_stadium[team_name] = [
        [city, venue]
        for city, venue in city_venue_pairs.values
        if city in team_name or city.replace('Bangalore', 'Bengaluru') in team_name
    ]

In [30]:
home_stadium

{'Royal Challengers Bengaluru': [['Bangalore', 'M Chinnaswamy Stadium'],
  ['Bengaluru', 'M.Chinnaswamy Stadium'],
  ['Bengaluru', 'M Chinnaswamy Stadium, Bengaluru']],
 'Punjab Kings': [],
 'Delhi Daredevils': [['Delhi', 'Feroz Shah Kotla'],
  ['Delhi', 'Arun Jaitley Stadium'],
  ['Delhi', 'Arun Jaitley Stadium, Delhi']],
 'Mumbai Indians': [['Mumbai', 'Wankhede Stadium'],
  ['Mumbai', 'Dr DY Patil Sports Academy'],
  ['Mumbai', 'Brabourne Stadium'],
  ['Mumbai', 'Brabourne Stadium, Mumbai'],
  ['Mumbai', 'Wankhede Stadium, Mumbai'],
  ['Mumbai', 'Dr DY Patil Sports Academy, Mumbai']],
 'Kolkata Knight Riders': [['Kolkata', 'Eden Gardens'],
  ['Kolkata', 'Eden Gardens, Kolkata']],
 'Rajasthan Royals': [],
 'Deccan Chargers': [],
 'Chennai Super Kings': [['Chennai', 'MA Chidambaram Stadium, Chepauk'],
  ['Chennai', 'MA Chidambaram Stadium'],
  ['Chennai', 'MA Chidambaram Stadium, Chepauk, Chennai']],
 'Kochi Tuskers Kerala': [['Kochi', 'Nehru Stadium']],
 'Pune Warriors': [['Pune', 'Su

In [31]:
# Home grounds set by hand for teams whose city name is not part of the team name.
home_stadium.update({
    'Rajasthan Royals': [
        ['Jaipur', 'Sawai Mansingh Stadium'],
        ['Jaipur', 'Sawai Mansingh Stadium, Jaipur'],
    ],
    'Deccan Chargers': [
        ['Hyderabad', 'Rajiv Gandhi International Stadium, Uppal'],
    ],
    'Gujarat Lions': [
        ['Rajkot', 'Saurashtra Cricket Association Stadium'],
        ['Ahmedabad', 'Sardar Patel Stadium, Motera'],
    ],
    'Punjab Kings': [
        ['Chandigarh', 'Punjab Cricket Association Stadium, Mohali'],
        ['Chandigarh', 'Punjab Cricket Association IS Bindra Stadium, Mohali'],
        ['Mohali', 'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur'],
    ],
    'Gujarat Titans': [
        ['Ahmedabad', 'Narendra Modi Stadium, Ahmedabad'],
    ],
})

In [32]:
home_stadium

{'Royal Challengers Bengaluru': [['Bangalore', 'M Chinnaswamy Stadium'],
  ['Bengaluru', 'M.Chinnaswamy Stadium'],
  ['Bengaluru', 'M Chinnaswamy Stadium, Bengaluru']],
 'Punjab Kings': [['Chandigarh', 'Punjab Cricket Association Stadium, Mohali'],
  ['Chandigarh', 'Punjab Cricket Association IS Bindra Stadium, Mohali'],
  ['Mohali',
   'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur']],
 'Delhi Daredevils': [['Delhi', 'Feroz Shah Kotla'],
  ['Delhi', 'Arun Jaitley Stadium'],
  ['Delhi', 'Arun Jaitley Stadium, Delhi']],
 'Mumbai Indians': [['Mumbai', 'Wankhede Stadium'],
  ['Mumbai', 'Dr DY Patil Sports Academy'],
  ['Mumbai', 'Brabourne Stadium'],
  ['Mumbai', 'Brabourne Stadium, Mumbai'],
  ['Mumbai', 'Wankhede Stadium, Mumbai'],
  ['Mumbai', 'Dr DY Patil Sports Academy, Mumbai']],
 'Kolkata Knight Riders': [['Kolkata', 'Eden Gardens'],
  ['Kolkata', 'Eden Gardens, Kolkata']],
 'Rajasthan Royals': [['Jaipur', 'Sawai Mansingh Stadium'],
  ['Jaipur', 'Sawai Mansingh

In [33]:
# Persist the team -> home grounds mapping as a pickle file on Drive.
home_stadium_path = '/content/drive/My Drive/hackathon/home_stadium.pkl'
with open(home_stadium_path, 'wb') as pickle_out:
    pickle.dump(home_stadium, pickle_out)

## Importing Deliveries dataset

Contains ball by ball data

In [34]:
# Load the ball-by-ball table and print its column/dtype summary.
deliveries_csv = '/content/drive/My Drive/hackathon/deliveries.csv'
deliveres = pd.read_csv(deliveries_csv)
deliveres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder           9354 non-null    obj

In [35]:
deliveres.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [36]:
# Frequency of each dismissal type.
deliveres['dismissal_kind'].value_counts()

Unnamed: 0_level_0,count
dismissal_kind,Unnamed: 1_level_1
caught,8063
bowled,2212
run out,1114
lbw,800
caught and bowled,367
stumped,358
retired hurt,15
hit wicket,15
obstructing the field,3
retired out,3


In [None]:
# NOTE(review): `data` is not referenced by any other cell visible in this part
# of the notebook — confirm whether this load is still needed here.
data = pd.read_csv('/content/drive/My Drive/hackathon/streamlit_x_test_batsmen.csv')


## Choosing relevant columns

In [None]:
# Keep only the columns used downstream.
# `.copy()` makes the result an independent frame: a later cell assigns to
# `deliveres['batting_team']` / `deliveres['bowling_team']`, which on a column
# slice raises pandas' SettingWithCopyWarning.
deliveres = deliveres[['match_id', 'batter','batting_team','bowling_team','batsman_runs','bowler','total_runs','is_wicket']].copy()

## Checking NULL Values

In [None]:
# Missing values per column in the trimmed deliveries table.
deliveres.isna().sum()

Unnamed: 0,0
match_id,0
batter,0
batting_team,0
bowling_team,0
batsman_runs,0
bowler,0
total_runs,0
is_wicket,0


In [None]:
# Same franchise renames as applied to the matches table, so team names agree
# across both tables.
team_replacements = {
    'Royal Challengers Bangalore': 'Royal Challengers Bengaluru',
    'Kings XI Punjab': 'Punjab Kings'
}

for team_column in ('batting_team', 'bowling_team'):
    deliveres[team_column] = deliveres[team_column].replace(team_replacements)

In [None]:
# Distinct batter and bowler names found in the deliveries table.
batsmen = deliveres['batter'].unique()
bowler = deliveres['bowler'].unique()

## Joining match dataframe with deliveries dataframe

In [None]:
# Attach match-level columns to every delivery.
# how='inner' (the default, now explicit) drops deliveries whose match is not
# in `matches`, e.g. the matches removed earlier for having no winner.
# validate='many_to_one' raises a MergeError if `matches.id` is not unique,
# instead of silently duplicating deliveries.
raw_data = deliveres.merge(
    matches,
    how='inner',
    left_on='match_id',
    right_on='id',
    validate='many_to_one',
)

In [None]:
# Write the merged table to Drive. index=True is the pandas default, so the
# row index is still written as the first (unnamed) column.
raw_data_csv = '/content/drive/My Drive/hackathon/raw_data.csv'
raw_data.to_csv(raw_data_csv, index=True)