In [136]:
# Data Preprocessing
import pandas as pd
import pickle

# Read the per-delivery records and the per-match metadata from the working directory.
scoring_data = pd.read_csv('deliveries.csv')

# matches.csv identifies each match by `id`; rename it to `match_id` in order
# to give both frames the same key for the merge that follows.
matches_data = pd.read_csv('matches.csv').rename(columns={'id': 'match_id'})

# Preview the first rows of the match metadata.
matches_data.head()

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,130.0,20.0,N,,Aleem Dar,GA Pratapkumar
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,166.0,20.0,N,,SJ Davis,DJ Harper
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,,BF Bowden,K Hariharan


In [137]:
# Attach the match-level columns to every delivery row.
# A left join keeps all rows of `scoring_data`, matched on `match_id`.
cricket_data = scoring_data.merge(matches_data, how='left', on='match_id')

In [138]:
# Rolling "recent form" features, computed independently for every innings.
ROLLING_WINDOW_BALLS = 30   # number of most recent deliveries summed per feature
EARLY_OVERS_CUTOFF = 5      # rows with over < 5 use a plain running total instead
innings_keys = ['match_id', 'inning']


def _trailing_sum(values):
    """Sum of the last ROLLING_WINDOW_BALLS entries (fewer at the start of a group)."""
    return values.rolling(window=ROLLING_WINDOW_BALLS, min_periods=1).sum()


# Convert 'over' and 'ball' to a single count for easier calculation.
# The column is kept because a later cell drops it by name.
cricket_data['ball_count'] = cricket_data['over'] * 6 + cricket_data['ball']

# Order deliveries chronologically within each innings. Sort on (over, ball)
# rather than on ball_count: over * 6 + ball is not a unique key whenever
# `ball` exceeds 6 (presumably extra deliveries are numbered inside the over --
# TODO confirm), so e.g. over 0 / ball 7 would tie with over 1 / ball 1.
# Reassigning (instead of inplace=True) keeps the cell free of hidden mutation,
# and resetting the index gives the rolling step a clean 0..n-1 index.
cricket_data = (
    cricket_data
    .sort_values(by=innings_keys + ['over', 'ball'])
    .reset_index(drop=True)
)

# Calculate rolling totals for both runs and wickets, using the last 30 balls
cricket_data['score_last_30_balls'] = (
    cricket_data.groupby(innings_keys)['total_runs'].transform(_trailing_sum)
)
cricket_data['wickets_last_30_balls'] = (
    cricket_data.groupby(innings_keys)['is_wicket'].transform(_trailing_sum)
)

# Calculate the cumulative score
cricket_data['cumulative_score'] = cricket_data.groupby(innings_keys)['total_runs'].cumsum()

# For the early overs, replace the windowed value with the running total up to
# that delivery (the `over < 5` test assumes 0-indexed overs -- TODO confirm).
mask = cricket_data['over'] < EARLY_OVERS_CUTOFF
early_overs = cricket_data.loc[mask].groupby(innings_keys)
cricket_data.loc[mask, 'score_last_30_balls'] = early_overs['total_runs'].cumsum()
cricket_data.loc[mask, 'wickets_last_30_balls'] = early_overs['is_wicket'].cumsum()



In [139]:
# Running total of runs after each delivery, per (match, innings).
innings_keys = ['match_id', 'inning']
cricket_data['cumulative_score'] = cricket_data.groupby(innings_keys)['total_runs'].cumsum()

# The largest running total in an innings is that innings' final score.
# Build a one-row-per-innings table holding it under the name 'total_score'.
final_scores = (
    cricket_data.groupby(innings_keys)['cumulative_score']
    .max()
    .reset_index()
    .rename(columns={'cumulative_score': 'total_score'})
)

# Broadcast the final score back onto every delivery of the innings.
cricket_data = cricket_data.merge(final_scores, how='left', on=innings_keys)


In [140]:
# Inspect which columns the frame holds at this point.
cricket_data.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder', 'season', 'city', 'date', 'match_type',
       'player_of_match', 'venue', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'winner', 'result', 'result_margin', 'target_runs',
       'target_overs', 'super_over', 'method', 'umpire1', 'umpire2',
       'ball_count', 'score_last_30_balls', 'wickets_last_30_balls',
       'cumulative_score', 'total_score'],
      dtype='object')

In [141]:
# Remove the columns that are not used from here on. Each label is listed
# once, grouped by where it comes from.
columns = [
    # delivery-level details
    'match_id', 'batter', 'bowler', 'non_striker',
    'batsman_runs', 'extra_runs', 'total_runs', 'extras_type',
    'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder',
    # match-level metadata
    'season', 'match_type', 'player_of_match', 'venue', 'team1', 'team2',
    'toss_winner', 'result', 'result_margin', 'target_runs', 'target_overs',
    'super_over', 'method', 'umpire1', 'umpire2',
    # helper column created earlier in the notebook
    'ball_count',
]
cricket_data = cricket_data.drop(columns=columns)

In [142]:
# Check the frame's dimensions as (rows, columns).
cricket_data.shape

(260920, 13)

In [143]:
# List the columns currently present in the frame.
cricket_data.columns

Index(['inning', 'batting_team', 'bowling_team', 'over', 'ball', 'city',
       'date', 'toss_decision', 'winner', 'score_last_30_balls',
       'wickets_last_30_balls', 'cumulative_score', 'total_score'],
      dtype='object')

In [144]:
# Count the missing values in each column.
cricket_data.isna().sum()

inning                       0
batting_team                 0
bowling_team                 0
over                         0
ball                         0
city                     12397
date                         0
toss_decision                0
winner                     490
score_last_30_balls          0
wickets_last_30_balls        0
cumulative_score             0
total_score                  0
dtype: int64

In [145]:
# Discard every row that has a missing value in any column.
cricket_data = cricket_data.dropna()

In [146]:
# List the distinct team names that appear in `batting_team`.
cricket_data['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [147]:
# Franchise renames: Kings XI Punjab became Punjab Kings, and Delhi Daredevils
# became Delhi Capitals. Fold the old labels into the new ones in `batting_team`.
# NOTE(review): 'Royal Challengers Bengaluru' also occurs in this column and is
# left unmapped here; it presumably is the renamed 'Royal Challengers
# Bangalore' -- confirm whether it should be folded in as well.
batting_team_renames = {
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
}
cricket_data['batting_team'] = cricket_data['batting_team'].replace(batting_team_renames)

In [148]:
# Fold the old franchise labels into their current names in `bowling_team`.
bowling_team_renames = {
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals',
}
cricket_data['bowling_team'] = cricket_data['bowling_team'].replace(bowling_team_renames)


In [149]:
# Updating the winner column
# Fold the old franchise labels into their current names in `winner`.
# The values must be read from `winner` itself: reading them from
# `bowling_team` would replace every match result with the name of the
# fielding side and silently discard who actually won.
cricket_data['winner'] = cricket_data['winner'].replace({
    'Kings XI Punjab': 'Punjab Kings',
    'Delhi Daredevils': 'Delhi Capitals'
})

In [150]:
# Teams to keep; rows involving any other team name are filtered out later.
consistent_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Punjab Kings',
    'Royal Challengers Bangalore',
    'Delhi Capitals',
    'Sunrisers Hyderabad',
    'Gujarat Titans',
    'Lucknow Super Giants',
]

In [151]:
# List the distinct host cities that appear in `city`.
cricket_data['city'].unique()

array(['Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Kochi', 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
       'Abu Dhabi', 'Rajkot', 'Kanpur', 'Bengaluru', 'Sharjah', 'Dubai',
       'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali'], dtype=object)

In [152]:
# Host cities to keep; rows played in any other city are filtered out later.
# NOTE(review): both 'Bangalore' and 'Bengaluru' are listed -- presumably two
# spellings of one city; confirm whether they should be merged.
consistent_cities = [
    'Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata',
    'Jaipur', 'Hyderabad', 'Chennai', 'Ahmedabad', 'Cuttack',
    'Nagpur', 'Dharamsala', 'Kochi', 'Indore', 'Visakhapatnam',
    'Pune', 'Raipur', 'Ranchi', 'Rajkot', 'Kanpur',
    'Bengaluru', 'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali',
]

In [153]:
# Keep only deliveries where the batting side, the bowling side and the winner
# are all in `consistent_teams`, and the match was played in one of
# `consistent_cities`.
teams_ok = (
    cricket_data['batting_team'].isin(consistent_teams)
    & cricket_data['bowling_team'].isin(consistent_teams)
    & cricket_data['winner'].isin(consistent_teams)
)
city_ok = cricket_data['city'].isin(consistent_cities)

# .copy() makes the result an independent frame, so later column assignments
# on `cricket_data` do not raise SettingWithCopyWarning or write to a
# temporary slice of the unfiltered data.
cricket_data = cricket_data[teams_ok & city_ok].copy()

In [154]:
# Converting the column 'date' from string into datetime object
from datetime import datetime  # retained: cells outside this view may still use the name

# Parse the whole column in one vectorised call instead of a per-row strptime.
# The explicit format keeps the parsing strict ('YYYY-MM-DD').
cricket_data['date'] = pd.to_datetime(cricket_data['date'], format='%Y-%m-%d')

In [155]:
# Feature Engineering
# One-hot encode the categorical columns into integer indicator columns;
# every other column is carried over unchanged.
categorical_columns = ['batting_team', 'bowling_team', 'winner', 'toss_decision', 'city']
ipl_data = pd.get_dummies(cricket_data, columns=categorical_columns, dtype=int)

In [158]:
# Pin the column layout explicitly: innings and date first, then the one-hot
# indicator groups, then the numeric match-state columns.
# NOTE(review): the winner_* indicators encode the match result; if
# `total_score` is the prediction target they would not be known at
# prediction time -- confirm they are meant to be features.
_teams = [
    'Chennai Super Kings', 'Delhi Capitals', 'Gujarat Titans',
    'Kolkata Knight Riders', 'Lucknow Super Giants', 'Mumbai Indians',
    'Punjab Kings', 'Rajasthan Royals', 'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]
_cities = [
    'Ahmedabad', 'Bangalore', 'Bengaluru', 'Chandigarh', 'Chennai',
    'Cuttack', 'Delhi', 'Dharamsala', 'Guwahati', 'Hyderabad', 'Indore',
    'Jaipur', 'Kolkata', 'Lucknow', 'Mohali', 'Mumbai', 'Navi Mumbai',
    'Pune', 'Raipur', 'Ranchi', 'Visakhapatnam',
]
_selected_columns = (
    ['inning', 'date']
    + [f'batting_team_{team}' for team in _teams]
    + [f'bowling_team_{team}' for team in _teams]
    + [f'winner_{team}' for team in _teams]
    + ['toss_decision_bat', 'toss_decision_field']
    + [f'city_{city}' for city in _cities]
    + ['over', 'ball', 'score_last_30_balls', 'wickets_last_30_balls',
       'cumulative_score', 'total_score']
)
ipl_data = ipl_data[_selected_columns]