In [34]:
import os
import wandb
import pandas as pd
import numpy as np
import json
import warnings
import utility as ut
warnings.filterwarnings('ignore')

In [35]:
run = wandb.init(project=ut.PROJECT_NAME, entity=ut.ENTITY, job_type="Upload")

VBox(children=(Label(value='0.001 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.078239…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752438883334737, max=1.0…

In [45]:
path = '../Inputs/ball-by-ball prediction/'
wandb.log_artifact(path+'clean_data.csv', name='clean_data', type='ball_by_ball_prediction')
run.finish()

In [14]:
clean_data = pd.read_csv(path+'clean_data.csv')

In [15]:
def get_stats(df, till_date, typ="bowler"):
    # Get the data till the given date
    df["start_date"] = pd.to_datetime(df["start_date"])
    df = df[df["start_date"] < till_date]

    # Get the number of runs scored by each batsman
    runs = df.groupby(typ)["runs_off_bat"].value_counts().unstack(level=1)
    runs = runs.dropna(axis=1, thresh=runs.shape[0] * 0.1).fillna(0)
    runs.columns = ["0_runs", "1_runs", "2_runs", "3_runs", "4_runs", "6_runs"]

    # Get the number of balls faced by each batsman
    balls = pd.DataFrame(df.groupby(typ)["ball"].count())

    # Get the number of wides faced by each batsman
    df.loc[:, "wides"] = (
        df["wides"].fillna(0).apply(lambda x: 0 if x == 0 else 1).tolist()
    )
    wides = pd.DataFrame(df.groupby(typ)["wides"].sum())

    # Get the number of dismissals faced by each batsman
    dismissals = pd.DataFrame(
        df.groupby(typ)["wicket_type"].apply(lambda x: x.notna().sum())
    ).rename(columns={"wicket_type": "num_dismissals"})

    # Add the number of balls faced and the number of wides faced to get the total number of balls faced
    balls["total_balls"] = balls["ball"] + wides["wides"]
    runs["0_runs"] = runs["0_runs"].subtract(dismissals["num_dismissals"])

    # Merge all the dataframes
    stats = pd.concat([runs, dismissals, wides, balls["total_balls"]], axis=1).fillna(0)
    return stats.reset_index()

def divide_data(cleandf):
    # Divide the data into history, train, and test
    temp = cleandf.sort_values(by=['start_date', 'innings', 'overs', 'balls']).copy()
    train = temp.query('"2016-01-01" <= start_date < "2022-01-01"')
    test = temp.query('start_date >= "2022-01-01"')
    return train, test


def merge_feats(train):
    temp = train.copy()
    temp = temp.sort_values(by=['start_date', 'innings', 'overs', 'balls'])
    
    unique_dates = temp['start_date'].unique().tolist()

    train_dfs = [
        temp[temp['start_date'] == date].merge(
            get_stats(clean_data, date, 'striker').add_prefix('bat_'),
            left_on='striker',
            right_on='bat_striker',
            how='left'
        ).merge(
            get_stats(clean_data, date, 'bowler').add_prefix('bowl_'),
            left_on='bowler',
            right_on='bowl_bowler',
            how='left'
        )
        for date in unique_dates
    ]
    
    newtrain = pd.concat(train_dfs)
    newtrain = newtrain.sort_values(by=['start_date', 'innings', 'overs', 'balls'])
    newtrain = newtrain.drop(['start_date', 'ball', 'extras','noballs','byes','legbyes', \
                            'player_dismissed', 'bat_striker', 'bowl_bowler'], axis=1)
    
    return newtrain

def make_target(df):
    cleandf = df.copy()
    create_y = cleandf[['runs_off_bat', 'wides', 'wicket_type']].fillna('none')
    
    def create_labels(row):
        if row['wicket_type'] != 'none':
            return "Wicket"
        elif row['runs_off_bat'] == 0:
            return "0_runs"
        elif row['runs_off_bat'] == 1:
            return "1_runs"
        elif row['runs_off_bat'] == 2:
            return "2_runs"
        elif row['runs_off_bat'] == 3:
            return "3_runs"
        elif row['runs_off_bat'] == 4 or row['runs_off_bat'] == 5:
            return "4_runs"
        elif row['runs_off_bat'] == 6 or row['runs_off_bat'] == 7:
            return "6_runs"
        elif row['wides'] != 'none':
            return "Wide"
        else:
            raise ValueError('Error in creating y, check the data')
    
    cleandf['target'] = create_y.apply(create_labels, axis=1)
    cleandf = cleandf.drop(['runs_off_bat', 'wides', 'wicket_type'], axis=1)
    return cleandf.fillna(0.0)

#slow takes 13 mins
def make_features(clean_data):
    train1, test1 = divide_data(clean_data)
    train2, test2 = merge_feats(train1), merge_feats(test1)
    del train1, test1
    train, test = make_target(train2), make_target(test2)
    del train2, test2
    return train, test

In [53]:
run = wandb.init(project=ut.PROJECT_NAME, entity=ut.ENTITY, job_type="upload")

In [18]:
clean_data.sort_values(by=['start_date', 'innings', 'overs', 'balls'], inplace=True)

In [23]:
def get_player_dict():
    player_dict = {}
    kinds = ["ipl", "t20s"]
    for kind in kinds:
        path_json = f"../Inputs/{kind}_json"
        json_files = [
            pos_json for pos_json in os.listdir(path_json) if pos_json.endswith(".json")
        ]

        for file in json_files:
            with open(os.path.join(path_json, file)) as json_file:
                json_data = json.load(json_file)
                for key, val in json_data["info"]["registry"]["people"].items():
                    player_dict[val] = key
    return player_dict

player_dict = get_player_dict()

In [28]:
player_dict

{'5b040b81': 'A Singh',
 '883e3818': 'AM Saheba',
 'c69a7b5c': 'AS Raut',
 'cb4b3ab0': 'GAV Baxter',
 '72861603': 'GC Smith',
 '323e4c16': 'HDPK Dharmasena',
 'bad31fac': 'J Srinath',
 'c3d35165': 'JA Morkel',
 '3eac9d95': 'JDP Oram',
 'bda8cca8': 'KH Hurter',
 'b2b50355': 'L Balaji',
 'd872f52a': 'LA Carseldine',
 '4ba44e19': 'M Muralitharan',
 '4b57e452': 'M Vijay',
 'd8699ab7': 'ML Hayden',
 '4a8a2e3b': 'MS Dhoni',
 '890946a0': 'NV Ojha',
 'fe93fd9d': 'RA Jadeja',
 '3576e47e': 'S Badrinath',
 'c24a2c5d': 'S Tyagi',
 'ae091d39': 'SA Asnodkar',
 'c8179c68': 'SB Jakati',
 '1dc12ab9': 'SK Raina',
 '3d7e087f': 'SK Trivedi',
 'bb18be76': 'SK Warne',
 '63bff7f9': 'SM Harwood',
 '3c6ffae8': 'YK Pathan',
 'acdc62f5': 'A Nortje',
 'fdcc6236': 'AK Chaudhary',
 '2e171977': 'AR Patel',
 'eef2536f': 'Avesh Khan',
 'dbe50b21': 'HH Pandya',
 '81049310': 'J Yadav',
 '462411b3': 'JJ Bumrah',
 'e62dd25d': 'K Rabada',
 'a757b0d8': 'KA Pollard',
 '5b8c830e': 'KH Pandya',
 '76752ac8': 'MA Gough',
 '56ab4

In [47]:
artifact = wandb.Artifact(name='player_stats', type='ball_by_ball_prediction')

In [32]:
batter_stats = get_stats(clean_data, "2023-03-14", "striker")
batter_stats = batter_stats.applymap(lambda x: player_dict.get(x, x))

In [33]:
bowler_stats = get_stats(clean_data, "2023-03-14", "bowler")
bowler_stats = batter_stats.applymap(lambda x: player_dict.get(x, x))

In [48]:
bat_stats = wandb.Table(dataframe=batter_stats)
bowl_stats = wandb.Table(dataframe=bowler_stats)

In [49]:
artifact.add(bat_stats, "bat_stats")
artifact.add(bowl_stats, "bowl_stats")

ArtifactManifestEntry(path='bowl_stats.table.json', digest='iOsEboVTawBkGfVGxxMwSA==', ref=None, birth_artifact_id=None, size=181311, extra={}, local_path='/Users/sparshgupta/Library/Application Support/wandb/artifacts/staging/tmpv6rl_ga_')

In [50]:
run.log_artifact(artifact)
run.finish()

In [55]:
wandb.log_artifact(path+'train.csv', name='train_split_2016to2022', type='ball_by_ball_prediction')
wandb.log_artifact(path+'test.csv', name='test_split_2022on', type='ball_by_ball_prediction')
run.finish()

In [12]:
# train, test = make_features(clean_data)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 1.91 µs


In [13]:
# train.to_csv('../Inputs/train.csv', index=False)
# test.to_csv('../Inputs/test.csv', index=False)

In [6]:
clean_data['match_id'] = clean_data.index.str.split('_').str[0]

In [7]:
clean_data.sort_values(by=['start_date', 'innings', 'overs', 'balls'], inplace=True)
# matchid = clean_data.loc[0, 'match_id']

In [8]:
#if innings == 1, over = 1, ball = 1, then it is the start of the innings, add a column for the score at the start of the innings
clean_data['current_score_cumsum'] = clean_data.groupby(['match_id', 'innings'])['runs_off_bat'].cumsum()
clean_data['current_score'] = clean_data.groupby(['match_id', 'innings'])['current_score_cumsum'].shift(1).fillna(0).astype(int)
clean_data['wicket_type'] = clean_data['player_dismissed'].apply(lambda x: 1 if isinstance(x,str) else 0)
clean_data['wickets_cumsum'] = (10-clean_data.groupby(['match_id', 'innings'])['wicket_type'].cumsum())
clean_data['wickets_remaining'] = clean_data.groupby(['match_id', 'innings'])['wickets_cumsum'].shift(1).fillna(10).astype(int)
clean_data.drop('wickets_cumsum', axis=1, inplace=True)


In [28]:
train.columns

Index(['venue', 'innings', 'batting_team', 'bowling_team', 'striker',
       'non_striker', 'bowler', 'overs', 'balls', 'bat_0_runs', 'bat_1_runs',
       'bat_2_runs', 'bat_3_runs', 'bat_4_runs', 'bat_6_runs',
       'bat_num_dismissals', 'bat_wides', 'bat_total_balls', 'bowl_0_runs',
       'bowl_1_runs', 'bowl_2_runs', 'bowl_3_runs', 'bowl_4_runs',
       'bowl_6_runs', 'bowl_num_dismissals', 'bowl_wides', 'bowl_total_balls',
       'target'],
      dtype='object')

In [1]:
# pd.concat([train, train_feats], ignore_index=True, axis=1)

In [25]:
train_feats = clean_data[['start_date','wickets_remaining', 'current_score']].query('"2016-01-01" <= start_date < "2022-01-01"')

In [15]:
#create a feature called 1st innings score. This variable should be 0 if innings==1, otherwise it should be the final score of the first innings of the match
clean_data.groupby(['match_id'])['current_score'].transform('last').iloc[80:140]

delivery_id
211048_1_12.6    168
211048_1_13.1    168
211048_1_13.2    168
211048_1_13.3    168
211048_1_13.4    168
211048_1_13.5    168
211048_1_13.6    168
211048_1_14.1    168
211048_1_14.2    168
211048_1_14.3    168
211048_1_14.4    168
211048_1_14.5    168
211048_1_14.6    168
211048_1_15.1    168
211048_1_15.2    168
211048_1_15.3    168
211048_1_15.4    168
211048_1_15.5    168
211048_1_15.6    168
211048_1_16.1    168
211048_1_16.2    168
211048_1_16.3    168
211048_1_16.4    168
211048_1_16.5    168
211048_1_16.6    168
211048_1_17.1    168
211048_1_17.2    168
211048_1_17.3    168
211048_1_17.4    168
211048_1_17.5    168
211048_1_17.6    168
211048_1_18.1    168
211048_1_18.2    168
211048_1_18.3    168
211048_1_18.4    168
211048_1_18.5    168
211048_1_18.6    168
211048_1_19.1    168
211048_1_19.2    168
211048_1_19.3    168
211048_1_19.4    168
211048_1_19.5    168
211048_1_19.6    168
211048_2_0.1     168
211048_2_0.2     168
211048_2_0.3     168
211048_2_0.4     168
2

In [16]:
clean_data['1st_innings_score'] = clean_data.groupby(['match_id'])['current_score'].shift(1).fillna(0).astype(int)

In [18]:
"""
Create a featue called 'target_score', this variable should be 0 if innings=1, otherwise it should be the target score for the team batting second
which is the 1st innings score + 1
"""


delivery_id
211048_1_12.6    109
211048_1_13.1    110
211048_1_13.2    111
211048_1_13.3    113
211048_1_13.4    113
211048_1_13.5    117
211048_1_13.6    117
211048_1_14.1    123
211048_1_14.2    124
211048_1_14.3    125
211048_1_14.4    127
211048_1_14.5    127
211048_1_14.6    128
211048_1_15.1    130
211048_1_15.2    131
211048_1_15.3    132
211048_1_15.4    134
211048_1_15.5    134
211048_1_15.6    135
211048_1_16.1    136
211048_1_16.2    137
211048_1_16.3    138
211048_1_16.4    139
211048_1_16.5    140
211048_1_16.6    140
211048_1_17.1    141
211048_1_17.2    145
211048_1_17.3    146
211048_1_17.4    147
211048_1_17.5    148
211048_1_17.6    148
211048_1_18.1    154
211048_1_18.2    160
211048_1_18.3    166
211048_1_18.4    168
211048_1_18.5    174
211048_1_18.6    180
211048_1_19.1    184
211048_1_19.2    190
211048_1_19.3    191
211048_1_19.4    195
211048_1_19.5    195
211048_1_19.6    196
211048_2_0.1     202
211048_2_0.2       0
211048_2_0.3       0
211048_2_0.4       0
2

In [20]:
clean_data.groupby(['match_id'])['1st_innings_score'].transform('last').shift(-1).fillna(0).astype(int).iloc[80:140]

delivery_id
211048_1_12.6    167
211048_1_13.1    167
211048_1_13.2    167
211048_1_13.3    167
211048_1_13.4    167
211048_1_13.5    167
211048_1_13.6    167
211048_1_14.1    167
211048_1_14.2    167
211048_1_14.3    167
211048_1_14.4    167
211048_1_14.5    167
211048_1_14.6    167
211048_1_15.1    167
211048_1_15.2    167
211048_1_15.3    167
211048_1_15.4    167
211048_1_15.5    167
211048_1_15.6    167
211048_1_16.1    167
211048_1_16.2    167
211048_1_16.3    167
211048_1_16.4    167
211048_1_16.5    167
211048_1_16.6    167
211048_1_17.1    167
211048_1_17.2    167
211048_1_17.3    167
211048_1_17.4    167
211048_1_17.5    167
211048_1_17.6    167
211048_1_18.1    167
211048_1_18.2    167
211048_1_18.3    167
211048_1_18.4    167
211048_1_18.5    167
211048_1_18.6    167
211048_1_19.1    167
211048_1_19.2    167
211048_1_19.3    167
211048_1_19.4    167
211048_1_19.5    167
211048_1_19.6    167
211048_2_0.1     167
211048_2_0.2     167
211048_2_0.3     167
211048_2_0.4     167
2