In [1]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def clean_name(name):
    """Return ``name`` without scorecard decorations.

    Removes every occurrence of the ``'(c)'`` and ``'†'`` markers and then
    trims leading/trailing whitespace.

    Args:
        name: Player name string as it appears in the scorecard.

    Returns:
        The undecorated, stripped name.
    """
    for marker in ('(c)', '†'):
        name = name.replace(marker, '')
    return name.strip()

def find_wicket_c(wicket):
    """Extract the catcher's name from a dismissal description.

    The ``'†'`` marker is removed and the text stripped before parsing.

    Recognised forms:
        * ``'c & b <name>'``        -> ``<name>`` (caught by the bowler)
        * ``'c <name> b <bowler>'`` -> ``<name>``
        * ``'c <name>'``            -> ``<name>`` (no bowler part present)

    Args:
        wicket: Dismissal text for one batsman.

    Returns:
        The catcher's name as written in the dismissal text, or ``'NA'``
        when the dismissal is not a catch.
    """
    wicket = wicket.replace('†', '').strip()
    if wicket.startswith('c & b '):
        return wicket[len('c & b '):]
    if wicket.startswith('c '):
        bowler_at = wicket.find(' b ')
        if bowler_at == -1:
            # str.find returns -1 when ' b ' is absent; slicing with it would
            # silently drop the last character of the catcher's name.
            return wicket[len('c '):]
        return wicket[len('c '):bowler_at]
    return 'NA'

def find_wicket_b(wicket):
    """Extract the bowler's name from a dismissal description.

    The ``'†'`` marker is removed and the text stripped before parsing.

    Recognised forms:
        * ``'c & b <name>'``
        * ``'b <name>'``
        * ``'lbw b <name>'``
        * ``'c <catcher> b <name>'``

    Any other text (including a ``'c ...'`` entry with no ``' b '`` part)
    yields ``'NA'``.

    Args:
        wicket: Dismissal text for one batsman.

    Returns:
        The bowler's name as written in the dismissal text, or ``'NA'``.
    """
    # NOTE(review): other dismissal kinds that name a bowler (e.g. text
    # starting with 'st ' or 'hit wicket b ') fall through to 'NA' here,
    # exactly as before - confirm whether that is intended.
    wicket = wicket.replace('†', '').strip()
    for prefix in ('c & b ', 'b ', 'lbw b '):
        if wicket.startswith(prefix):
            return wicket[len(prefix):]
    if wicket.startswith('c '):
        bowler_at = wicket.find(' b ')
        # str.find returns -1 when ' b ' is absent; without this check the
        # catcher's name would be returned as if it were the bowler.
        if bowler_at != -1:
            return wicket[bowler_at + len(' b '):]
    return 'NA'

def match_player_caught(row, players_1, players_2):
    """Resolve the catcher named in ``row['wicket_c']`` to a full player name.

    The candidate list is the *other* inning's batting line-up:
    ``players_2`` when ``row['inning'] == 1``, otherwise ``players_1``.
    The first candidate whose name contains ``row['wicket_c']`` as a
    substring is returned.

    Args:
        row: Mapping/Series with at least ``'inning'`` and ``'wicket_c'``.
        players_1: Iterable of player names that batted in inning 1.
        players_2: Iterable of player names that batted in inning 2.

    Returns:
        The matched player name, ``'NA'`` when ``wicket_c`` is the ``'NA'``
        sentinel, or ``'Not Matched'`` when no candidate contains the name.
    """
    # The catcher lives in 'wicket_c'; 'wicket_b' holds the bowler.
    catcher = row['wicket_c']
    # Check the sentinel before scanning: 'NA' is a plain substring test and
    # would otherwise match any player whose name happens to contain "NA".
    if catcher == 'NA':
        return 'NA'
    candidates = players_2 if row['inning'] == 1 else players_1
    # An empty string is a substring of every name, so never scan with it.
    if catcher:
        for player in candidates:
            if catcher in player:
                return player
    return 'Not Matched'
        
def match_player_bowled(row, players_1, players_2):
    """Resolve the bowler named in ``row['wicket_b']`` to a full player name.

    The candidate list is the *other* inning's batting line-up:
    ``players_2`` when ``row['inning'] == 1``, otherwise ``players_1``.
    The first candidate whose name contains ``row['wicket_b']`` as a
    substring is returned.

    Args:
        row: Mapping/Series with at least ``'inning'`` and ``'wicket_b'``.
        players_1: Iterable of player names that batted in inning 1.
        players_2: Iterable of player names that batted in inning 2.

    Returns:
        The matched player name, ``'NA'`` when ``wicket_b`` is the ``'NA'``
        sentinel, or ``'Not Matched'`` when no candidate contains the name.
    """
    # The bowler lives in 'wicket_b'; 'wicket_c' holds the catcher.
    bowler = row['wicket_b']
    # Check the sentinel before scanning: 'NA' is a plain substring test and
    # would otherwise match any player whose name happens to contain "NA".
    if bowler == 'NA':
        return 'NA'
    candidates = players_2 if row['inning'] == 1 else players_1
    # An empty string is a substring of every name, so never scan with it.
    if bowler:
        for player in candidates:
            if bowler in player:
                return player
    return 'Not Matched'

In [3]:
def create_player_df(match_dir):
    """Build the per-player table for one match directory.

    Reads ``batsman_df.csv``, ``bowler_df.csv`` and ``meta_data.json`` from
    ``match_dir``. Batting rows (minus the ``'Extras'`` row) are enriched with
    captain / wicket-keeper flags, a cleaned name, the parsed ``wicket_c`` /
    ``wicket_b`` fields, the resolved ``player_caught`` / ``player_bowled``
    names and, per player, how often their name occurs in each of those two
    columns. The result is outer-merged with the bowling table on ``name``
    and every missing value is replaced by the string ``'NA'``.

    Args:
        match_dir: Directory that holds the three input files.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        Whatever ``pd.read_csv`` / ``open`` / ``json.load`` raise when an
        input file is missing or unreadable.
    """
    batting = pd.read_csv(os.path.join(match_dir, 'batsman_df.csv'))

    bowling = pd.read_csv(os.path.join(match_dir, 'bowler_df.csv'))
    # Bowling figures get a '_given' suffix (runs / fours / sixes conceded).
    bowling = bowling.rename(columns={
        'runs': 'runs_given',
        'fours': 'fours_given',
        'sixes': 'sixes_given',
    })
    if 'inning' in bowling.columns:
        bowling = bowling.drop(columns=['inning'])

    # The metadata is loaded but not used below; the read is kept so a missing
    # or malformed file still raises here.
    with open(os.path.join(match_dir, 'meta_data.json'), 'r') as file:
        meta_data = json.load(file)

    player_df = batting.loc[batting['name'] != 'Extras'].copy()

    # Flags must be derived from the raw names, before the markers are removed.
    player_df['is_captain'] = player_df['name'].str.contains('(c)', regex=False, case=True)
    player_df['is_wicket_keeper'] = player_df['name'].str.contains('†', regex=False, case=True)
    player_df['name'] = player_df['name'].apply(clean_name)

    dismissals = player_df['wicket']
    player_df['wicket_c'] = dismissals.apply(find_wicket_c)
    player_df['wicket_b'] = dismissals.apply(find_wicket_b)

    # Batting line-ups per inning, used as candidate lists by the matchers.
    lineups = dict(
        players_1=player_df.loc[player_df['inning'] == 1, 'name'],
        players_2=player_df.loc[player_df['inning'] == 2, 'name'],
    )
    player_df['player_caught'] = player_df.apply(match_player_caught, axis=1, **lineups)
    player_df['player_bowled'] = player_df.apply(match_player_bowled, axis=1, **lineups)

    # Occurrences of each player's name in the resolved columns (0 if absent).
    caught_tally = player_df['player_caught'].value_counts()
    player_df['caught_count'] = player_df['name'].map(caught_tally).fillna(0).astype('int')
    bowled_tally = player_df['player_bowled'].value_counts()
    player_df['bowled_count'] = player_df['name'].map(bowled_tally).fillna(0).astype('int')

    return player_df.merge(bowling, on='name', how='outer').fillna('NA')

In [5]:
# Build and save player_df.csv for every match directory under
# Data/<season>/<match>/ for seasons 2010 through 2020.
data_dir = 'Data'
seasons = range(2010, 2021)

for season in tqdm(seasons, desc='seasons'):
    season_dir = os.path.join(data_dir, str(season))
    # os.listdir returns entries in arbitrary order; sort so runs are repeatable.
    matches = sorted(os.listdir(season_dir))
    for match in tqdm(matches, desc='matches', leave=False):
        match_dir = os.path.join(season_dir, match)
        try:
            player_df = create_player_df(match_dir)
            player_df.to_csv(os.path.join(match_dir, 'player_df.csv'), index=False)
        except Exception as exc:
            # Best-effort: matches that cannot be processed (e.g. abandoned
            # matches) are reported and skipped. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt / SystemExit
            # stop the run, and printing the error keeps real bugs visible.
            print(match_dir, repr(exc))

HBox(children=(FloatProgress(value=0.0, description='seasons', max=11.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=74.0, style=ProgressStyle(description_width…

Data/2011/20th match (N)


HBox(children=(FloatProgress(value=0.0, description='matches', max=76.0, style=ProgressStyle(description_width…

Data/2012/32nd match (N)
Data/2012/34th match (N)


HBox(children=(FloatProgress(value=0.0, description='matches', max=76.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

Data/2015/25th match (D and N)


HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

Data/2017/29th match (N)


HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=60.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='matches', max=6.0, style=ProgressStyle(description_width=…


