In [8]:
import os
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

In [101]:
clean_data = pd.read_csv('../Inputs/clean_data.csv', index_col=0)

In [11]:
def get_stats(df, till_date, typ="bowler"):
    # Get the data till the given date
    df["start_date"] = pd.to_datetime(df["start_date"])
    df = df[df["start_date"] < till_date]

    # Get the number of runs scored by each batsman
    runs = df.groupby(typ)["runs_off_bat"].value_counts().unstack(level=1)
    runs = runs.dropna(axis=1, thresh=runs.shape[0] * 0.1).fillna(0)
    runs.columns = ["0_runs", "1_runs", "2_runs", "3_runs", "4_runs", "6_runs"]

    # Get the number of balls faced by each batsman
    balls = pd.DataFrame(df.groupby(typ)["ball"].count())

    # Get the number of wides faced by each batsman
    df.loc[:, "wides"] = (
        df["wides"].fillna(0).apply(lambda x: 0 if x == 0 else 1).tolist()
    )
    wides = pd.DataFrame(df.groupby(typ)["wides"].sum())

    # Get the number of dismissals faced by each batsman
    dismissals = pd.DataFrame(
        df.groupby(typ)["wicket_type"].apply(lambda x: x.notna().sum())
    ).rename(columns={"wicket_type": "num_dismissals"})

    # Add the number of balls faced and the number of wides faced to get the total number of balls faced
    balls["total_balls"] = balls["ball"] + wides["wides"]
    runs["0_runs"] = runs["0_runs"].subtract(dismissals["num_dismissals"])

    # Merge all the dataframes
    stats = pd.concat([runs, dismissals, wides, balls["total_balls"]], axis=1).fillna(0)
    return stats.reset_index()

def divide_data(cleandf):
    # Divide the data into history, train, and test
    temp = cleandf.sort_values(by=['start_date', 'innings', 'overs', 'balls']).copy()
    train = temp.query('"2016-01-01" <= start_date < "2022-01-01"')
    test = temp.query('start_date >= "2022-01-01"')
    return train, test


def merge_feats(train):
    temp = train.copy()
    temp = temp.sort_values(by=['start_date', 'innings', 'overs', 'balls'])
    
    unique_dates = temp['start_date'].unique().tolist()

    train_dfs = [
        temp[temp['start_date'] == date].merge(
            get_stats(clean_data, date, 'striker').add_prefix('bat_'),
            left_on='striker',
            right_on='bat_striker',
            how='left'
        ).merge(
            get_stats(clean_data, date, 'bowler').add_prefix('bowl_'),
            left_on='bowler',
            right_on='bowl_bowler',
            how='left'
        )
        for date in unique_dates
    ]
    
    newtrain = pd.concat(train_dfs)
    newtrain = newtrain.sort_values(by=['start_date', 'innings', 'overs', 'balls'])
    newtrain = newtrain.drop(['start_date', 'ball', 'extras','noballs','byes','legbyes', \
                            'player_dismissed', 'bat_striker', 'bowl_bowler'], axis=1)
    
    return newtrain

def make_target(df):
    cleandf = df.copy()
    create_y = cleandf[['runs_off_bat', 'wides', 'wicket_type']].fillna('none')
    
    def create_labels(row):
        if row['wicket_type'] != 'none':
            return "Wicket"
        elif row['runs_off_bat'] == 0:
            return "0_runs"
        elif row['runs_off_bat'] == 1:
            return "1_runs"
        elif row['runs_off_bat'] == 2:
            return "2_runs"
        elif row['runs_off_bat'] == 3:
            return "3_runs"
        elif row['runs_off_bat'] == 4 or row['runs_off_bat'] == 5:
            return "4_runs"
        elif row['runs_off_bat'] == 6 or row['runs_off_bat'] == 7:
            return "6_runs"
        elif row['wides'] != 'none':
            return "Wide"
        else:
            raise ValueError('Error in creating y, check the data')
    
    cleandf['target'] = create_y.apply(create_labels, axis=1)
    cleandf = cleandf.drop(['runs_off_bat', 'wides', 'wicket_type'], axis=1)
    return cleandf.fillna(0.0)

#slow takes 13 mins
def make_features(clean_data):
    train1, test1 = divide_data(clean_data)
    train2, test2 = merge_feats(train1), merge_feats(test1)
    del train1, test1
    train, test = make_target(train2), make_target(test2)
    del train2, test2
    return train, test

In [12]:
%time
train, test = make_features(clean_data)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 1.91 µs


In [13]:
train.to_csv('../Inputs/train.csv', index=False)
test.to_csv('../Inputs/test.csv', index=False)

In [14]:
train.head()

Unnamed: 0,venue,innings,batting_team,bowling_team,striker,non_striker,bowler,overs,balls,bat_0_runs,...,bowl_0_runs,bowl_1_runs,bowl_2_runs,bowl_3_runs,bowl_4_runs,bowl_6_runs,bowl_num_dismissals,bowl_wides,bowl_total_balls,target
0,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,1,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs
1,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,2,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,4_runs
2,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,3,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs
3,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,4,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,4_runs
4,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,5,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs


In [15]:
len(train)

276285

In [17]:
minitrain = train[:1000].copy()

In [18]:
minitrain

Unnamed: 0,venue,innings,batting_team,bowling_team,striker,non_striker,bowler,overs,balls,bat_0_runs,...,bowl_0_runs,bowl_1_runs,bowl_2_runs,bowl_3_runs,bowl_4_runs,bowl_6_runs,bowl_num_dismissals,bowl_wides,bowl_total_balls,target
0,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,1,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs
1,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,2,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,4_runs
2,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,3,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs
3,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,4,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,4_runs
4,Bay Oval,1,New Zealand,Sri Lanka,2be41edb,d027ba9f,2f28dc94,1,5,468.0,...,33.0,36.0,4.0,0.0,11.0,5.0,5.0,4.0,98.0,0_runs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Sheikh Abu Naser Stadium,1,Zimbabwe,Bangladesh,31fbf891,0fdabe05,0a8fce53,6,3,345.0,...,62.0,35.0,10.0,1.0,7.0,3.0,6.0,5.0,129.0,0_runs
69,Eden Park,1,Pakistan,New Zealand,9ab63e7b,b98b5e1c,8abdf100,6,4,558.0,...,130.0,131.0,33.0,1.0,55.0,22.0,14.0,14.0,400.0,6_runs
70,Sheikh Abu Naser Stadium,1,Zimbabwe,Bangladesh,31fbf891,0fdabe05,0a8fce53,6,4,345.0,...,62.0,35.0,10.0,1.0,7.0,3.0,6.0,5.0,129.0,1_runs
71,Eden Park,1,Pakistan,New Zealand,9ab63e7b,b98b5e1c,8abdf100,6,5,558.0,...,130.0,131.0,33.0,1.0,55.0,22.0,14.0,14.0,400.0,0_runs


In [103]:
clean_data['match_id'] = clean_data.index.str.split('_').str[0]

In [104]:
clean_data.sort_values(by=['start_date', 'innings', 'overs', 'balls'], inplace=True)
# matchid = clean_data.loc[0, 'match_id']

In [144]:
#if innings == 1, over = 1, ball = 1, then it is the start of the innings, add a column for the score at the start of the innings
clean_data['current_score_cumsum'] = clean_data.groupby(['match_id', 'innings'])['runs_off_bat'].cumsum()
# clean_data['wicket_type'] = clean_data['player_dismissed'].apply(lambda x: 1 if isinstance(x,str) else 0)
# clean_data['wickets_cumsum'] = (10-clean_data.groupby(['match_id', 'innings'])['wicket_type'].cumsum())
# clean_data['wickets_remaining'] = clean_data.groupby(['match_id', 'innings'])['wickets_cumsum'].shift(1).fillna(10).astype(int)
# clean_data.drop('wickets_cumsum', axis=1, inplace=True)


In [148]:
clean_data['current_score'] = clean_data.groupby(['match_id', 'innings'])['current_score_cumsum'].shift(1).fillna(0).astype(int)

In [152]:
clean_data[['wickets_remaining', 'current_score']].iloc[80:140]

Unnamed: 0_level_0,wickets_remaining,current_score
delivery_id,Unnamed: 1_level_1,Unnamed: 2_level_1
211048_1_12.6,6.0,110
211048_1_13.1,6.0,111
211048_1_13.2,6.0,113
211048_1_13.3,6.0,113
211048_1_13.4,6.0,117
211048_1_13.5,6.0,117
211048_1_13.6,6.0,123
211048_1_14.1,6.0,124
211048_1_14.2,6.0,125
211048_1_14.3,6.0,127


In [147]:
clean_data['runs_off_bat'].iloc[80:140]

delivery_id
211048_1_12.6    1
211048_1_13.1    2
211048_1_13.2    0
211048_1_13.3    4
211048_1_13.4    0
211048_1_13.5    6
211048_1_13.6    1
211048_1_14.1    1
211048_1_14.2    2
211048_1_14.3    0
211048_1_14.4    1
211048_1_14.5    2
211048_1_14.6    1
211048_1_15.1    1
211048_1_15.2    2
211048_1_15.3    0
211048_1_15.4    1
211048_1_15.5    1
211048_1_15.6    1
211048_1_16.1    1
211048_1_16.2    1
211048_1_16.3    1
211048_1_16.4    0
211048_1_16.5    1
211048_1_16.6    4
211048_1_17.1    1
211048_1_17.2    1
211048_1_17.3    1
211048_1_17.4    0
211048_1_17.5    6
211048_1_17.6    6
211048_1_18.1    6
211048_1_18.2    2
211048_1_18.3    6
211048_1_18.4    6
211048_1_18.5    4
211048_1_18.6    6
211048_1_19.1    1
211048_1_19.2    4
211048_1_19.3    0
211048_1_19.4    1
211048_1_19.5    6
211048_1_19.6    0
211048_2_0.1     0
211048_2_0.2     0
211048_2_0.3     1
211048_2_0.4     0
211048_2_0.5     1
211048_2_0.6     0
211048_2_1.1     4
211048_2_1.2     1
211048_2_1.3     1
