In [34]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

ball_by_ball_data = pd.read_csv("data/deliveries.csv")
matches_data = pd.read_csv("data/matches.csv")

In [35]:
matches_data['match_id'] = matches_data['id']
matches_data.drop('id', axis = 1, inplace=True)

In [36]:
data = pd.merge(ball_by_ball_data, matches_data, on='match_id', how='left')
# data

In [37]:
# data.columns

In [38]:
#ensuring that wides, noballs, and penalties don't count as a ball
for i in range(len(data)):
    if data.extras_type[i] in ['wides', 'noballs', 'penalty']:
        k = True
        j = i+1
        # Ensure j does not exceed DataFrame length
        while k and j < len(data):
            if data.ball[j] > data.ball[i]:
                data.loc[j, 'ball'] = data.ball[j] - 1
                j += 1
            else:
                k = False
        data.loc[i, 'ball'] = data.ball[i] - 1


In [39]:
#get the cumulative innings total runs
innings_total_list = []
i = 0
x = True
while i < len(data):
    innings_total = data.total_runs[i]
    innings_total_list.append(innings_total)
    i = i+1
    while i < len(data) and data.match_id[i] == data.match_id[i-1] and data.inning[i] == data.inning[i-1]:
        innings_total = innings_total + data.total_runs[i]
        innings_total_list.append(innings_total)
        i = i+1

In [40]:
data['cumulative_innings_total'] = innings_total_list 
data['cumulative_balls'] = data['over']*6 + data['ball']

In [41]:
#get the total runs scored in that innings
data['innings_total_runs'] = data.groupby(['match_id', 'inning'])['cumulative_innings_total'].transform('max')

#get the total balls bowled in that innings
data['innings_total_balls'] = data.groupby(['match_id', 'inning'])['cumulative_balls'].transform('max')

In [42]:
data['match_total_runs'] = data['target_runs'] - 1 + data['target_runs'] - 1 - data['result_margin']

In [43]:
#get the first row in each innings and then add the total balls in each innings to get the total balls bowled in the match
data_first_rows = data.groupby(['match_id', 'inning']).first().reset_index()
data_first_rows['match_total_balls'] = data_first_rows.groupby('match_id')['innings_total_balls'].transform('sum')
to_merge_df = data_first_rows[['match_id', 'match_total_balls']].drop_duplicates()

In [44]:
data = pd.merge(data, to_merge_df, on='match_id', how='left')

In [45]:
# data[0:10]

In [46]:
data['batter_total_runs'] = data.groupby(['match_id','batter'])['batsman_runs'].cumsum()
data['batter_total_balls'] = data[(data['extras_type'] != 'wides') & (data['extras_type'] != 'penalty')].groupby(['match_id','batter']).cumcount() + 1
#will forward fill the last valid value to the nan value
data['batter_total_balls'] = data.groupby(['match_id', 'batter'])['batter_total_balls'].transform(lambda x: x.ffill())
#if the first ball that the batter faced was a wide then there is nothing to ffill so we fill these with 0
data['batter_total_balls'] = data['batter_total_balls'].fillna(0)

In [47]:
data['bowler_total_runs'] = data[(data['extras_type'] != 'byes') & (data['extras_type'] != 'legbyes') & (data['extras_type'] != 'penalty')].groupby(['match_id', 'bowler'])['total_runs'].cumsum()
data['bowler_total_balls'] = data[(data['extras_type'] != 'wides') & (data['extras_type'] != 'noballs') & (data['extras_type'] != 'penalty')].groupby(['match_id','bowler']).cumcount() + 1
#will forward fill the last valid value to the nan value
data['bowler_total_runs'] = data.groupby(['match_id', 'bowler'])['bowler_total_runs'].transform(lambda x: x.ffill())
#if the first ball bowled by a bowler is a bye then there is nothing to ffill so we fill these with 0
data['bowler_total_runs'] = data['bowler_total_runs'].fillna(0)
#will forward fill the last valid value to the nan value
data['bowler_total_balls'] = data.groupby(['match_id', 'bowler'])['bowler_total_balls'].transform(lambda x: x.ffill())
#if the first ball bowled by a bowler is a noball then there is nothing to ffill so we fill these with 0
data['bowler_total_balls'] = data['bowler_total_balls'].fillna(0)

In [48]:
data['bowler_economy'] = data['bowler_total_runs']/data['bowler_total_balls'] * 6

In [49]:
data['batter_SR'] = data['batter_total_runs']/data['batter_total_balls'] * 100

In [50]:
# data.columns

In [51]:
# data.season.unique()

In [52]:
seasons = ['2007/08', '2009', '2009/10', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020/21', '2021', '2022', '2023']
data_by_season = {}

for season in seasons:
  data_by_season[season] = data[data['season'] == season]


In [53]:
xR_data = data

In [54]:
#xR_data.to_csv("xR_data.csv", index=False)

In [55]:
unnecessary_cols = [
    'batting_team', 'batsman_runs', 'over', 'ball', 'total_runs',
    'non_striker', 'extra_runs', 'extras_type', 'is_wicket',
    'player_dismissed', 'fielder', 'season', 'city',
    'player_of_match', 'team1', 'team2', 'toss_winner',
    'toss_decision', 'winner', 'result', 'result_margin', 'target_runs',
    'target_overs', 'super_over', 'method', 'umpire1', 'umpire2',
    'cumulative_innings_total', 'innings_total_runs',
    'innings_total_balls', 'match_total_runs', 'match_total_balls',
    'bowler_total_runs', 'bowler_total_balls', 'bowler_economy',
]

In [56]:
# Drop the unnecessary columns
df_reduced = data.drop(columns=unnecessary_cols)

# Display the new DataFrame
# df_reduced

In [57]:
# ball_by_ball_data[0:10]
df_reduced['batter_score'] = df_reduced.groupby(['match_id', 'batter'])['batter_total_runs'].transform('max')
# df_reduced

In [58]:
dismissal_kinds = df_reduced['dismissal_kind'].unique()
# dismissal_kinds

In [59]:
import pandas as pd

# Read the player.csv file into a DataFrame
players = pd.read_csv("data/Player.csv")

# players

new_players = pd.DataFrame()
new_players[['Player_Name', 'Bowling_Skill']] = players[['Player_Name', 'Bowling_Skill']]

new_players.dropna(inplace=True)

new_players

df_reduced['Bowling_Skill'] = df_reduced['bowler'].map(new_players.set_index('Player_Name')['Bowling_Skill'])
# df_reduced['Bowling_Skill'] = df_reduced['Bowling_Skill'].fillna('nan')
df_reduced[20:40]

df_reduced['Bowling_Skill'] = df_reduced['Bowling_Skill'].replace({
  'Right-arm medium': 'Right-arm pace',
  'Right-arm fast-medium': 'Right-arm pace',
  'Right-arm medium-fast': 'Right-arm pace',
  'Right-arm fast': 'Right-arm pace',
  'Left-arm medium': 'Left-arm pace',
  'Left-arm fast-medium': 'Left-arm pace',
  'Left-arm medium-fast': 'Left-arm pace',
  'Left-arm fast': 'Left-arm pace',
  'Right-arm offbreak': 'Right-arm offspin',
  'Legbreak googly': 'Right-arm legspin',
  'Legbreak': 'Right-arm legspin',
  'Slow left-arm orthodox': 'Left-arm offspin',
  'Slow left-arm chinaman': 'Left-arm legspin',
})

# unique_bowling_types = df_reduced['Bowling_Skill'].unique()
# unique_bowling_types

new_players

Unnamed: 0,Player_Name,Bowling_Skill
0,SC Ganguly,Right-arm medium
1,BB McCullum,Right-arm medium
2,RT Ponting,Right-arm medium
3,DJ Hussey,Right-arm offbreak
4,Mohammad Hafeez,Right-arm offbreak
...,...,...
463,SM Boland,Right-arm fast-medium
464,DL Chahar,Right-arm medium
466,RV Pawar,Slow left-arm orthodox
468,T Mishra,Right-arm fast-medium


In [60]:
import pandas as pd

# Read the CSV file into a DataFrame
players_info = pd.read_csv("data/Players_Info_2024.csv")

# Drop the "about" column
players_info = players_info.drop(columns=["About"])

# Display the updated DataFrame
players_info

Unnamed: 0,Player Name,Team Name,Player Nationality,Date of Birth,Player Role,IPL Debut,Batting Style,Bowling Style,Player Salary
0,MS Dhoni,CSK,Indian,7-Jul-81,Wicketkeeper Batter,2008.0,Right hand Bat,Right arm Medium,₹12crore(US$1.5 million)
1,Devon Conway,CSK,South African,8-Jul-91,Wicketkeeper Batter,2022.0,Left hand Bat,Right arm Medium,"₹1crore(US$130,000)"
2,Ruturaj Gaikwad,CSK,Indian,31-Jan-97,Batter,2020.0,Right hand Bat,Right arm Offbreak,"₹6crore(US$750,000)"
3,Ajinkya Rahane,CSK,Indian,6-Jun-88,Top order Batter,2008.0,Right hand Bat,Right arm Medium,"₹50lakh(US$63,000)"
4,Shaik Rasheed,CSK,Indian,24-Sep-04,Batter,,Right hand Bat,Legbreak,"₹20lakh(US$25,000)"
...,...,...,...,...,...,...,...,...,...
240,Mujeeb Ur Rahman,KKR,Afghan,28-Mar-01,Bowler,2018.0,Right hand Bat,Right arm Offbreak,Salary not available
241,Dushmantha Chameera,KKR,Sri Lankan,11-Jan-92,Bowler,,Right hand Bat,Right arm Fast,Salary not available
242,Sakib Hussain,KKR,Indian,14-Dec-04,Bowler,,Right hand Bat,Right arm Medium,Salary not available
243,Mitchell Starc,KKR,Australian,30-Jan-90,Bowler,2014.0,Left hand Bat,Left arm Fast,2475


In [61]:
import numpy as np

# Function to preprocess names
def preprocess_name(name):
    return ' '.join(name.split()).lower()

# Preprocess player names in both dataframes
df_reduced['bowler'] = df_reduced['bowler'].apply(preprocess_name)
players_info['Player Name'] = players_info['Player Name'].apply(preprocess_name)

# Create a dictionary mapping from players_info to df_reduced
name_to_skill = dict(zip(players_info['Player Name'], players_info['Bowling Style']))

# Function to fill NaN values using the mapping dictionary
def fill_bowling_skill(row):
    if pd.isna(row['Bowling_Skill']):
        return name_to_skill.get(row['bowler'], row['Bowling_Skill'])
    return row['Bowling_Skill']

# Apply the function to fill NaN values in df_reduced
df_reduced['Bowling_Skill'] = df_reduced.apply(fill_bowling_skill, axis=1)

df_reduced['Bowling_Skill'].unique()


array(['Right-arm pace', 'Left-arm pace', 'Left-arm offspin',
       'Right-arm legspin', 'Right-arm offspin', 'Left-arm legspin', nan,
       'Legbreak Googly', 'Left arm Medium', 'Right arm Offbreak',
       'Right arm Fast', 'Right arm Fast medium',
       'Slow Left arm Orthodox', 'Left arm Medium fast', 'Legbreak',
       'Right arm Medium, Right arm Offbreak', 'Right arm Medium fast',
       'Left arm Fast medium', 'Right arm Medium', 'Left arm Wrist spin'],
      dtype=object)

In [64]:
# Filter the DataFrame to get rows with NaN in Bowling_Skill
nan_bowling_skill_df = df_reduced[df_reduced['Bowling_Skill'].isna()]

# Drop duplicates to get unique values
unique_nan_bowling_skill_df = nan_bowling_skill_df[['bowler']].drop_duplicates()

# Add an empty Bowling_Skill column
unique_nan_bowling_skill_df['Bowling_Skill'] = np.nan

unique_nan_bowling_skill_df.to_csv("data/unique_nan_bowling_skill.csv", index=False)

In [None]:
from tqdm import tqdm

df_reduced['no_of_right_arm_pacers_faced'] = 0
df_reduced['no_of_left_arm_pacers_faced'] = 0
df_reduced['no_of_right_arm_offspinners_faced'] = 0
df_reduced['no_of_left_arm_offspinners_faced'] = 0
df_reduced['no_of_right_arm_legspinners_faced'] = 0
df_reduced['no_of_left_arm_legspinners_faced'] = 0

# Dictionary to keep track of bowlers faced by each batter in each match
bowlers_faced = {}

# Iterate through the DataFrame
for idx, row in tqdm(df_reduced.iterrows(), total=len(df_reduced)):
    match_id = row['match_id']
    batter = row['batter']
    bowler = row['bowler']
    skill = row['Bowling_Skill']
    
    if match_id not in bowlers_faced:
        bowlers_faced[match_id] = {}
    
    if batter not in bowlers_faced[match_id]:
        bowlers_faced[match_id][batter] = {'Right-arm pace': set(), 'Left-arm pace': set(), 'Right-arm offspin': set(), 'Left-arm offspin': set(), 'Right-arm legspin': set(), 'Left-arm legspin': set(), 'nan': set()}
    
    #if bowler not in bowlers_faced[match_id][batter][skill]:
    bowlers_faced[match_id][batter][skill].add(bowler)
    
    df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_pacers_faced'] = len(bowlers_faced[match_id][batter]['Right-arm pace'])
    df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_pacers_faced'] = len(bowlers_faced[match_id][batter]['Left-arm pace'])
    df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_offspinners_faced'] = len(bowlers_faced[match_id][batter]['Right-arm offspin'])
    df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_offspinners_faced'] = len(bowlers_faced[match_id][batter]['Left-arm offspin'])
    df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_legspinners_faced'] = len(bowlers_faced[match_id][batter]['Right-arm legspin'])
    df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_legspinners_faced'] = len(bowlers_faced[match_id][batter]['Left-arm legspin'])
    # df_reduced.loc[df_reduced.index == idx, 'no_of_nans'] = len(bowlers_faced[match_id][batter]['nan'])

    # if skill == 'Right-arm pace':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_pacers_faced'] = len(bowlers_faced[match_id][batter]['Right-arm pace'])
    # if skill == 'Left-arm pace':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_pacers_faced'] = len(bowlers_faced[match_id][batter]['Left-arm pace'])
    # if skill == 'Right-arm offspin':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_offspinners_faced'] = len(bowlers_faced[match_id][batter]['Right-arm offspin'])
    # if skill == 'Left-arm offspin':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_offspinners_faced'] = len(bowlers_faced[match_id][batter]['Left-arm offspin'])
    # if skill == 'Right-arm legspin':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_right_arm_legspinners_faced'] = len(bowlers_faced[match_id][batter]['Right-arm legspin'])
    # if skill == 'Left-arm legspin':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_left_arm_legspinners_faced'] = len(bowlers_faced[match_id][batter]['Left-arm legspin'])
    # if skill == 'nan':
    #     df_reduced.loc[df_reduced.index == idx, 'no_of_nans'] = len(bowlers_faced[match_id][batter]['nan'])


In [None]:
aggregated_df = df_reduced.groupby(['match_id', 'inning', 'bowling_team', 'batter']).agg({
    'batter_total_balls': 'max',
    'batter_SR': 'max',
    'batter_score': 'max',
    'dismissal_kind': 'first',
    'date': 'first',
    'match_type': 'first',
    'venue': 'first',
    'no_of_right_arm_pacers_faced': 'max',
    'no_of_left_arm_pacers_faced': 'max',
    'no_of_right_arm_offspinners_faced': 'max',
    'no_of_right_arm_legspinners_faced': 'max',
    'no_of_left_arm_legspinners_faced': 'max',
    'no_of_left_arm_offspinners_faced': 'max'
}).reset_index()

aggregated_df

nan_bowlers = df_reduced[df_reduced['Bowling_Skill'] == 'nan']['bowler'].unique()
print(len(nan_bowlers))

print(len(new_players))

all_bowlers = new_players._append(pd.DataFrame({'Player_Name': nan_bowlers, 'Bowling_Skill': 'nan'}), ignore_index=True)
len(all_bowlers)

In [None]:
aggregated_df

In [None]:
# Ensure the dataframe is sorted by 'batter' and 'date'
# aggregated_df = aggregated_df.sort_values(by=['batter', 'date'])

# Group by 'batter' and calculate the rolling sums for the last 5 matches
aggregated_df['score_last_5'] = aggregated_df.groupby('batter')['batter_score'].transform(lambda x: x.rolling(window=5, min_periods=1).sum())
aggregated_df['balls_last_5'] = aggregated_df.groupby('batter')['batter_total_balls'].transform(lambda x: x.rolling(window=5, min_periods=1).sum())

# Create new columns for each of the last 5 matches
for i in range(1, 6):
    aggregated_df[f'score_last_{i}'] = aggregated_df.groupby('batter')['batter_score'].transform(lambda x: x.shift(i))
    aggregated_df[f'balls_last_{i}'] = aggregated_df.groupby('batter')['batter_total_balls'].transform(lambda x: x.shift(i))

aggregated_df.columns

In [None]:
aggregated_df.to_csv("data/aggregated_df.csv", index=False)

In [None]:
dataframe = pd.read_csv("data/aggregated_df.csv")
dataframe.columns

In [None]:
aggregated_df.drop(columns=['no_of_left_arm_offpacers_faced'], inplace=True)

In [None]:
df_reduced
bowlers_per_inning = df_reduced.groupby(['match_id', 'inning', 'Bowling_Skill'])['bowler'].nunique()
bowlers_per_inning