In [1]:
import pandas as pd
import numpy as np
# Load the raw ball-by-ball deliveries and the per-match table.
ball_by_ball = pd.read_csv('../resources/train/ball_by_ball_data.csv')
match_data = pd.read_csv('../resources/train/match_data.csv')

# Join the match_data columns onto every delivery row via match_id (inner join).
bbb_venue = ball_by_ball.merge(match_data, on='match_id')

# One row per (match, inning, batting team, bowling team, batsman):
# total batsman_runs and the number of delivery rows in the group.
# String aggregators are used instead of np.sum / np.size: passing NumPy
# callables to .agg is deprecated in recent pandas, and 'size' yields the
# same row count without a per-group Python call.
# NOTE(review): rows with extra_runs != 0 are dropped entirely, so any
# batsman_runs recorded on those rows are not counted here — confirm intended.
mbm = (
    ball_by_ball[ball_by_ball['extra_runs'] == 0]
    .groupby(['match_id', 'inning', 'batting_team', 'bowling_team', 'batsman_id'])
    .agg({'batsman_runs': 'sum', 'bowler_id': 'size'})
    .rename(columns={'bowler_id': 'deliveries'})
    .reset_index()
)

# Same per-innings rows, enriched with the match_data columns.
mbm_venue = mbm.merge(match_data, on='match_id')

In [2]:

# Projection of the delivery table down to the columns needed for
# batsman-level aggregation.
bat_sat = bbb_venue[['batsman_id', 'match_id', 'season', 'bowling_team', 'venue_id', 'batsman_runs']]

# Total batsman_runs per (batsman, season, bowling team, venue).
# Grouping the projection (rather than the full bbb_venue frame again) gives
# the same result because only the key columns and batsman_runs are used.
# 'sum' replaces np.sum, which is deprecated as an .agg argument in recent pandas.
bat_sat_agg = (
    bat_sat
    .groupby(['batsman_id', 'season', 'bowling_team', 'venue_id'])
    .agg({'batsman_runs': 'sum'})
    .reset_index()
)
bat_sat_agg.head(n=2)

Unnamed: 0,batsman_id,season,bowling_team,venue_id,batsman_runs
0,1,2008,DC,4,37
1,1,2008,DC,5,23


In [None]:
# Count delivery rows per (batsman, season, bowling team, venue, batsman_runs),
# considering only rows with extra_runs == 0.
# The count is stored under the 'match_id' column name because that is the
# column being aggregated; the next cells rely on that name.
# 'size' replaces np.size: same row count, computed by pandas' built-in
# group-size path instead of a per-group Python call.
bat_run_group = (
    bbb_venue[bbb_venue['extra_runs'] == 0]
    .groupby(['batsman_id', 'season', 'bowling_team', 'venue_id', 'batsman_runs'])
    .agg({'match_id': 'size'})
    .reset_index()
)
bat_run_group.head(n=2)

Unnamed: 0,batsman_id,season,bowling_team,venue_id,batsman_runs,match_id
0,1,2008,DC,4,0,7
1,1,2008,DC,4,1,9


In [4]:
# Spread the per-run-value counts into one column per run value
# ("0_runs", "1_runs", ...): each new column holds the row's count (stored in
# 'match_id') when the row's batsman_runs equals that value, and 0 otherwise.
# Series.where does this in one vectorized pass per column instead of a
# Python-level lambda evaluated for every row.
# NOTE(review): only the values 0, 1, 2, 3, 4 and 6 get a column; rows with any
# other batsman_runs value contribute 0 to all of them — confirm intended.
for runs_scored in [0, 1, 2, 3, 4, 6]:
    column_name = str(runs_scored) + "_runs"
    bat_run_group[column_name] = bat_run_group['match_id'].where(
        bat_run_group['batsman_runs'] == runs_scored, 0
    )

bat_run_group.head(n=2)

Unnamed: 0,batsman_id,season,bowling_team,venue_id,batsman_runs,match_id,0_runs,1_runs,2_runs,3_runs,4_runs,6_runs
0,1,2008,DC,4,0,7,7,0,0,0,0,0
1,1,2008,DC,4,1,9,0,9,0,0,0,0


In [5]:
# Collapse the per-run-value rows into a single row per
# (batsman, season, bowling team, venue) by summing each "<n>_runs" column.
# The column list is spelled out once and reused, and 'sum' replaces np.sum,
# which is deprecated as an .agg argument in recent pandas.
run_count_columns = ['0_runs', '1_runs', '2_runs', '3_runs', '4_runs', '6_runs']
bat_run_class = (
    bat_run_group
    .groupby(['batsman_id', 'season', 'bowling_team', 'venue_id'])
    .agg({column: 'sum' for column in run_count_columns})
    .reset_index()
)
bat_run_class.head(n=2)

Unnamed: 0,batsman_id,season,bowling_team,venue_id,0_runs,1_runs,2_runs,3_runs,4_runs,6_runs
0,1,2008,DC,4,7,9,3,0,4,1
1,1,2008,DC,5,18,7,4,0,2,0


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Delivery rows on which player_dismissed_id is set, reduced to the match,
# the dismissed player and the dismissal kind.
dis_deliveries = bbb_venue[bbb_venue['player_dismissed_id'].notnull()][['match_id', 'player_dismissed_id', 'dismissal_kind']]

# One indicator column per distinct dismissal_kind. Each value is wrapped in a
# one-element set because MultiLabelBinarizer takes an iterable of labels per
# sample; mapping the single column avoids a row-wise DataFrame.apply.
dis_sparse = mlb.fit_transform(dis_deliveries['dismissal_kind'].map(lambda kind: {kind}))
dis_classes = list(mlb.classes_)

# Put the indicator columns next to the dismissal rows (indexes are identical).
dis_del_byplayer = dis_deliveries.merge(
    pd.DataFrame(dis_sparse, columns=dis_classes, index=dis_deliveries.index),
    left_index=True, right_index=True)

# Milestone flags per innings row: 50-99 runs -> '50s', 100 or more -> '100s'.
# Vectorized comparisons replace the per-row lambdas.
mbm_venue['50s'] = ((mbm_venue['batsman_runs'] // 50) == 1).astype('int64')
mbm_venue['100s'] = ((mbm_venue['batsman_runs'] // 50) > 1).astype('int64')

# Left-join the dismissal indicators onto the innings rows; innings with no
# matching dismissal get NaN, which fillna(0) turns into 0 everywhere
# (including player_dismissed_id, used below as the "not dismissed" marker).
# NOTE(review): the join key is (player, match_id) without inning, so a player
# with more than one dismissal row or more than one innings row in the same
# match would produce duplicated/mis-attributed rows — confirm this cannot occur.
mbm_stat = mbm_venue.merge(
    dis_del_byplayer,
    left_on=['batsman_id', 'match_id'],
    right_on=['player_dismissed_id', 'match_id'],
    how='left',
)[['batsman_id', 'season', 'bowling_team', 'venue_id',
   'match_id', 'player_dismissed_id', 'deliveries', 'batsman_runs', '50s', '100s'] + dis_classes].fillna(0)

# Per-row counters that are summed later: dismissed flag, constant 1 per
# innings row, and the complementary not-out flag.
was_not_dismissed = mbm_stat['player_dismissed_id'] == 0
mbm_stat['is_dismissed'] = (~was_not_dismissed).astype('int64')
mbm_stat['innings'] = 1
mbm_stat['not_outs'] = was_not_dismissed.astype('int64')
mbm_stat.head(n=2)

Unnamed: 0,batsman_id,season,bowling_team,venue_id,match_id,player_dismissed_id,deliveries,batsman_runs,50s,100s,...,caught and bowled,hit wicket,lbw,obstructing the field,retired hurt,run out,stumped,is_dismissed,innings,not_outs
0,66,2008,RCB,7,2008_1,66.0,10,10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0
1,73,2008,RCB,7,2008_1,73.0,19,20,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0


In [9]:
# Sum every per-innings counter within each (batsman, season, bowling team,
# venue) group; batsman_runs additionally gets its per-group maximum.
# String aggregators replace np.sum / np.max, which are deprecated as .agg
# arguments in recent pandas.
agg_dict = {x: 'sum' for x in dis_classes + ['is_dismissed', 'innings', 'deliveries', 'batsman_runs', '50s', '100s', 'not_outs']}
agg_dict['batsman_runs'] = ['sum', 'max']
vbv_stat = mbm_stat.groupby(['batsman_id', 'season', 'bowling_team', 'venue_id']).agg(agg_dict).reset_index()

# Derived rates, computed column-wise instead of with a row-wise apply.
# Columns are still two-level here, so inputs are addressed as (name, 'sum');
# the new single-name columns get an empty second level.
vbv_stat['average_runs'] = vbv_stat[('batsman_runs', 'sum')] / vbv_stat[('innings', 'sum')]
vbv_stat['strike_rate'] = vbv_stat[('batsman_runs', 'sum')] * 100 / vbv_stat[('deliveries', 'sum')]

# Flatten the two-level column index to "<name>_<aggregation>"; columns with an
# empty second level (group keys and the derived rates) end in a bare "_".
vbv_stat.columns = (vbv_stat.columns.get_level_values(0) + "_" + vbv_stat.columns.get_level_values(1))

# Keep the column name this notebook's recorded output uses for the maximum.
# With np.max as the aggregator the label was taken from the callable's name,
# which is 'amax' or 'max' depending on the installed NumPy/pandas versions;
# pinning it keeps the exported schema stable.
vbv_stat = vbv_stat.rename(columns={'batsman_runs_max': 'batsman_runs_amax'})
vbv_stat.head(n=2)

Unnamed: 0,batsman_id_,season_,bowling_team_,venue_id_,bowled_sum,caught_sum,caught and bowled_sum,hit wicket_sum,lbw_sum,obstructing the field_sum,...,is_dismissed_sum,innings_sum,deliveries_sum,batsman_runs_sum,batsman_runs_amax,50s_sum,100s_sum,not_outs_sum,average_runs_,strike_rate_
0,1,2008,DC,4,0.0,1.0,0.0,0.0,0.0,0.0,...,1,1,24,37,37,0,0,0,37.0,154.166667
1,1,2008,DC,5,0.0,1.0,0.0,0.0,0.0,0.0,...,1,1,31,23,23,0,0,0,23.0,74.193548


In [10]:
# Combine the aggregated batting stats with the per-run-value counts on the
# (batsman, season, bowling team, venue) key and write the result to disk.
# The left-hand key names carry the trailing "_" left by the column flattening
# of vbv_stat, so the two key lists are given separately.
pd.merge(
    vbv_stat,
    bat_run_class,
    left_on=['batsman_id_', 'season_', 'bowling_team_', 'venue_id_'],
    right_on=['batsman_id', 'season', 'bowling_team', 'venue_id'],
).to_csv('../resources/features/bat_stat.csv', index=False)