In [82]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# import tiktoken as tt
# from fastai.tabular import *
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections.abc import Iterable
import glob
from functools import lru_cache, cache
import os
import json
import warnings

warnings.filterwarnings('ignore')

In [42]:
# a function to loop over folder ipl_csv2 and read all csv files
kind = 't20s'
path_csv = f'../Inputs/{kind}_csv'
path_json = f'../Inputs/{kind}_json'

def get_json_info(jsondata):
    keys = ['event', 'match_type', 'officials', 'outcome', 'season', 'teams', 'players', 'toss', 'venue']
    match_info, player_info = {}, []
    for key in keys:
        # print(f"{key}: {jsondata['info'][key]}")
        if key == 'event':
            try:
                # print(f"{key}: {jsondata['info'][key]}")
                match_info[key] = jsondata['info'][key]['name']
            except:
                match_info[key] = 'NA'
        elif key == 'match_type':
            try:
                # print(f"{key}: {jsondata['info'][key]}")
                match_info[key] = jsondata['info'][key]
            except:
                match_info[key] = 'NA'
        elif key == 'officials':
            try:
                # print(f"{key}: {jsondata['info'][key]}")
                match_info['umpire_1'] = jsondata['info'][key]['umpires'][0]
                match_info['umpire_2'] = jsondata['info'][key]['umpires'][1]
            except:
                match_info['umpire_1'] = 'NA'
                match_info['umpire_2'] = 'NA'
        elif key == 'outcome':
            try:
                match_info['winner'] = jsondata['info'][key]['winner']
                try:
                    match_info['win_by_runs'] = jsondata['info'][key]['by']['runs']
                except:
                    match_info['win_by_wickets'] = jsondata['info'][key]['by']['wickets']
            except:
                match_info['winner'] = 'Draw' #change this later
        elif key == 'teams':
            match_info['team_1'] = jsondata['info'][key][0]
            match_info['team_2'] = jsondata['info'][key][1]
        elif key == 'players':
            try:
                player_info = pd.DataFrame(jsondata['info'][key])
            except:
                player_info = pd.DataFrame()
        elif key == 'toss':
            try:
                match_info['toss_winner'] = jsondata['info'][key]['winner']
                match_info['toss_decision'] = jsondata['info'][key]['decision']
            except:
                match_info['toss_winner'] = 'NA'
                match_info['toss_decision'] = 'NA'
        elif key == 'venue':
            try:
                match_info[key] = jsondata['info'][key]
            except:
                match_info[key] = 'NA'
    return (match_info, player_info)

@cache
def read_data(path_csv, path_json):
    # read all csv files in the folder
    #seperate files with '_info' in the name
    all_files = glob.glob(path_csv + "/*.csv")
    data_files = [file for file in all_files if '_info' not in file]
    if path_csv+'/all_matches.csv' in data_files: data_files.remove(path_csv+'/all_matches.csv')
    json_files = [pos_json for pos_json in os.listdir(path_json) if pos_json.endswith('.json')]
    match_dict = {file.split('/')[-1].split('_')[0][:-4]: [pd.read_csv(file, index_col=None, header=0,low_memory=False)] for file in data_files}
    
    for file in json_files:
        if file[:-5] in match_dict:
            with open(os.path.join(path_json, file)) as json_file:
                json_data = json.load(json_file)
                matchdf, palyerdf = get_json_info(json_data)
                match_dict[file[:-5]].append(matchdf)
                match_dict[file[:-5]].append(palyerdf)
            # break;
    return match_dict


def collate_datasets(kinds):
    dfs = []
    for kind in kinds:
        path_csv = f'../Inputs/{kind}_csv'
        path_json = f'../Inputs/{kind}_json'
        match_dict = read_data(path_csv, path_json)
        dfs.append(pd.concat([match_dict[match][0] for match in match_dict]))
    return pd.concat(dfs, axis=0, ignore_index=True)

# match_dict = read_data(path_csv, path_json)
# full_df = collate_datasets(['t20s', 'ipl'])

In [85]:
def get_stats(df, till_date, typ='bowler'):
    # Get the data till the given date
    df['start_date'] = pd.to_datetime(df['start_date'])
    df = df[df['start_date'] < till_date]
    # print(df.shape)

    # Get the number of runs scored by each batsman
    runs = df.groupby(typ)['runs_off_bat'].value_counts()
    runs = runs.unstack(level=1)
    runs = runs.dropna(axis=1, thresh=runs.shape[0]*0.1)
    runs = runs.fillna(0)
    runs.columns = ['0_runs', '1_runs', '2_runs', '3_runs', '4_runs', '6_runs']
    # print(runs.head(2))

    # Get the number of balls faced by each batsman
    balls = df.groupby(typ)['ball'].count()
    balls = pd.DataFrame(balls)
    # print(balls.head(2))

    # Get the number of wides faced by each batsman
    df.loc[:,'wides'] = df['wides'].fillna(0).tolist()
    df['wides'] = [0 if x == 0 else 1 for x in df['wides']]
    wides = df.groupby(typ)['wides'].sum()
    wides = pd.DataFrame(wides)

    # Get the number of dismissals faced by each batsman
    dismissal = df['wicket_type'].value_counts().index.tolist()
    df.loc[:,'is_wicket'] = np.where(df['wicket_type'].isin(dismissal), 1, 0).tolist()
    dismissals = df.groupby(typ)['is_wicket'].sum()
    dismissals = pd.DataFrame(dismissals)
    dismissals = dismissals.rename(columns={'is_wicket':'num_dismissals'})

    # Add the number of balls faced and the number of wides faced to get the total number of balls faced
    balls['total_balls'] = balls['ball']+wides['wides']
    runs['0_runs'] = runs['0_runs'].subtract(dismissals['num_dismissals'])

    # Merge all the dataframes
    stats = pd.concat([runs, dismissals, wides], axis=1)

    # Divide all the columns by the number of balls faced by each batsman
    stats = stats.div(balls['total_balls'], axis=0)
    
    return stats

1. Match State (innings, over, number of runs, number of wickets, first innings score when relevant)
2. Bowler Statistics (historical distribution of ball results across all balls that bowler has bowled)
3. Batter Statistics (historical distribution of ball results across all balls that batsman has faced)

In [312]:
df = pd.read_csv('../Inputs/all_matches.csv', index_col=None, header=0,low_memory=False)

In [303]:
df.describe(exclude='object').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
match_id,630668.0,979041.320776,347858.388412,211028.0,598061.0,1144162.0,1275047.0,1362242.0
innings,630668.0,1.475077,0.50166,1.0,1.0,1.0,2.0,6.0
ball,630668.0,9.403794,5.658149,0.1,4.4,9.3,14.3,19.9
runs_off_bat,630668.0,1.179968,1.573854,0.0,0.0,1.0,1.0,7.0
extras,630668.0,0.07176,0.352025,0.0,0.0,0.0,0.0,7.0
wides,21270.0,1.207757,0.777127,1.0,1.0,1.0,1.0,5.0
noballs,2689.0,1.035701,0.335506,1.0,1.0,1.0,1.0,5.0
byes,2136.0,1.835674,1.269959,1.0,1.0,1.0,3.0,5.0
legbyes,9941.0,1.289307,0.814973,1.0,1.0,1.0,1.0,5.0
penalty,9.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0


In [304]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
season,630668,37,2022,77100
start_date,630668,1716,2022-07-15,1848
venue,630668,252,Dubai International Cricket Stadium,31155
batting_team,630668,113,Mumbai Indians,27826
bowling_team,630668,113,Mumbai Indians,27854
striker,630668,3014,V Kohli,8259
non_striker,630668,2971,V Kohli,8018
bowler,630668,2242,R Ashwin,5534
wicket_type,33589,11,caught,19510
player_dismissed,33589,2800,RG Sharma,319


In [305]:
#A function to clean the data
def clean_data(df):
    #remove the unwanted columns
    df = df.drop(['penalty', 'other_wicket_type', 'other_player_dismissed', 'season'],axis=1)
    #replace the null values with 0
    #replace NaN with not out in wicket_type, and with none in player_dismissed
    df['player_dismissed'] = df['player_dismissed'].fillna('none')
    df['wicket_type'] = df['wicket_type'].fillna('none')
    df = df.fillna(0)
    #create a new unique id for each delivery
    df['delivery_id'] = df['match_id'].astype(str) + '_' + df['innings'].astype(str) + '_' + df['ball'].astype(str)
    #drop the unwanted columns
    df = df.set_index('delivery_id')
    #convert object type to category type
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    cat_cols.remove('start_date')
    df[cat_cols] = df[cat_cols].astype('category')
    #calculate runs scored in each delivery
    # df['runs_delivery'] = df['runs_off_bat'] + df['extras'] + df['wides'] + df['noballs'] + df['byes'] + df['legbyes']
    #drop the unwanted columns
    # df = df.drop(['runs_off_bat','extras','wides','noballs','byes','legbyes'],axis=1)
    return df

In [322]:
#clean venue column
def clean_venues(df):
    print(df['venue'].nunique())
    df['venue'] = df['venue'].replace('M Chinnaswamy Stadium, Bangalore', 'M Chinnaswamy Stadium', regex=True)
    df['venue'] = df['venue'].replace('Vidarbha Cricket Association Stadium, Jamtha, Nagpur', 'Vidarbha Cricket Association Stadium, Jamtha', regex=True)
    df['venue'] = df['venue'].replace('Punjab Cricket Association IS Bindra Stadium, Mohali', 'Punjab Cricket Association IS Bindra Stadium', regex=True)
    df['venue'] = df['venue'].replace('Punjab Cricket Association Stadium, Mohali', 'Punjab Cricket Association IS Bindra Stadium', regex=True)
    df['venue'] = df['venue'].replace('Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh', 'Punjab Cricket Association IS Bindra Stadium', regex=True)
    df['venue'] = df['venue'].replace('Punjab Cricket Association IS Bindra Stadium, Chandigarh', 'Punjab Cricket Association IS Bindra Stadium', regex=True)
    df['venue'] = df['venue'].replace('Sawai Mansingh Stadium, Jaipur', 'Sawai Mansingh Stadium', regex=True)
    df['venue'] = df['venue'].replace('MA Chidambaram Stadium, Chepauk, Chennai', 'MA Chidambaram Stadium', regex=True)
    df['venue'] = df['venue'].replace('MA Chidambaram Stadium, Chepauk', 'MA Chidambaram Stadium', regex=True)
    df['venue'] = df['venue'].replace('Sardar Patel Stadium, Motera', 'Narendra Modi Stadium', regex=True)
    df['venue'] = df['venue'].replace('Eden Gardens, Kolkata', 'Eden Gardens', regex=True)
    df['venue'] = df['venue'].replace('Eden Park, Auckland', 'Eden Park', regex=True)
    df['venue'] = df['venue'].replace('Rajiv Gandhi International Stadium, Uppal', 'Rajiv Gandhi International Stadium', regex=True)
    df['venue'] = df['venue'].replace('Rajiv Gandhi International Stadium, Uppal, Hyderabad', 'Rajiv Gandhi International Stadium', regex=True)
    df['venue'] = df['venue'].replace('Rajiv Gandhi International Stadium, Hyderabad', 'Rajiv Gandhi International Stadium', regex=True)
    df['venue'] = df['venue'].replace('Rajiv Gandhi International Cricket Stadium, Dehradun', 'Rajiv Gandhi International Cricket Stadium', regex=True)
    df['venue'] = df['venue'].replace('Wankhede Stadium, Mumbai', 'Wankhede Stadium', regex=True)
    df['venue'] = df['venue'].replace('McLean Park, Napier', 'McLean Park', regex=True)
    df['venue'] = df['venue'].replace('Arun Jaitley Stadium, Delhi', 'Arun Jaitley Stadium', regex=True)
    df['venue'] = df['venue'].replace('Feroz Shah Kotla', 'Arun Jaitley Stadium', regex=True)
    df['venue'] = df['venue'].replace('New Wanderers Stadium', 'Wanderers', regex=True)
    df['venue'] = df['venue'].replace('The Wanderers Stadium, Johannesburg', 'Wanderers', regex=True)
    df['venue'] = df['venue'].replace('The Wanderers Stadium', 'Wanderers', regex=True)
    df['venue'] = df['venue'].replace('Wanderers Cricket Ground, Windhoek', 'Wanderers Cricket Ground', regex=True)
    df['venue'] = df['venue'].replace('M.Chinnaswamy Stadium', 'M Chinnaswamy Stadium', regex=True)
    df['venue'] = df['venue'].replace('Ministry Turf 2', 'Ministry Turf 1', regex=True)
    df['venue'] = df['venue'].replace('Zayed Cricket Stadium, Abu Dhabi', 'Sheikh Zayed Stadium', regex=True)
    df['venue'] = df['venue'].replace('Gahanga International Cricket Stadium. Rwanda', 'Gahanga International Cricket Stadium, Rwanda', regex=True)
    df['venue'] = df['venue'].replace('Shere Bangla National Stadium, Mirpur', 'Shere Bangla National Stadium', regex=True)
    df['venue'] = df['venue'].replace('Desert Springs Cricket Ground, Almeria', 'Desert Springs Cricket Ground', regex=True)
    df['venue'] = df['venue'].replace('R.Premadasa Stadium, Khettarama', 'R Premadasa Stadium', regex=True)
    df['venue'] = df['venue'].replace('R Premadasa Stadium, Colombo', 'R Premadasa Stadium', regex=True)
    df['venue'] = df['venue'].replace('Maharashtra Cricket Association Stadium, Pune', 'Maharashtra Cricket Association Stadium', regex=True)
    df['venue'] = df['venue'].replace('SuperSport Park, Centurion', 'SuperSport Park', regex=True)
    df['venue'] = df['venue'].replace('Narendra Modi Stadium, Ahmedabad', 'Narendra Modi Stadium', regex=True)
    df['venue'] = df['venue'].replace('Dr DY Patil Sports Academy, Mumbai', 'Dr DY Patil Sports Academy', regex=True)
    df['venue'] = df['venue'].replace('Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam', 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium', regex=True)
    df['venue'] = df['venue'].replace('ICC Academy, Dubai', 'ICC Academy', regex=True)
    df['venue'] = df['venue'].replace('ICC Academy Ground No 2', 'ICC Academy', regex=True)
    df['venue'] = df['venue'].replace('ICC Global Cricket Academy', 'ICC Academy', regex=True)
    df['venue'] = df['venue'].replace('Zahur Ahmed Chowdhury Stadium, Chattogram', 'Zahur Ahmed Chowdhury Stadium', regex=True)
    df['venue'] = df['venue'].replace('Brabourne Stadium, Mumbai', 'Brabourne Stadium', regex=True)
    df['venue'] = df['venue'].replace('Kensington Oval, Bridgetown, Barbados', 'Kensington Oval, Bridgetown', regex=True)
    df['venue'] = df['venue'].replace('Queens Sports Club, Bulawayo', 'Queens Sports Club', regex=True)
    df['venue'] = df['venue'].replace('Himachal Pradesh Cricket Association Stadium, Dharamsala', 'Himachal Pradesh Cricket Association Stadium', regex=True)
    df['venue'] = df['venue'].replace('Gaddafi Stadium, Lahore', 'Gaddafi Stadium', regex=True)
    df['venue'] = df['venue'].replace('Tribhuvan University International Cricket Ground, Kirtipur', 'Tribhuvan University International Cricket Ground', regex=True)
    df['venue'] = df['venue'].replace('Indian Association Ground, Singapore', 'Indian Association Ground', regex=True)
    df['venue'] = df['venue'].replace('Saurashtra Cricket Association Stadium, Rajkot', 'Saurashtra Cricket Association Stadium', regex=True)
    df['venue'] = df['venue'].replace('The Village, Malahide, Dublin', 'The Village, Malahide', regex=True)
    df['venue'] = df['venue'].replace('Barsapara Cricket Stadium, Guwahati', 'Barsapara Cricket Stadium', regex=True)
    df['venue'] = df['venue'].replace('Sportpark Westvliet, The Hague', 'Sportpark Westvliet', regex=True)
    df['venue'] = df['venue'].replace('Bready Cricket Club, Magheramason, Bready', 'Bready', regex=True)
    df['venue'] = df['venue'].replace('Bready Cricket Club, Magheramason', 'Bready', regex=True)
    df['venue'] = df['venue'].replace('Greenfield International Stadium, Thiruvananthapuram', 'Greenfield International Stadium', regex=True)
    df['venue'] = df['venue'].replace('Edgbaston, Birmingham', 'Edgbaston', regex=True)
    df['venue'] = df['venue'].replace('Barabati Stadium, Cuttack', 'Barabati Stadium', regex=True)
    df['venue'] = df['venue'].replace('Bay Oval, Mount Maunganui', 'Bay Oval', regex=True)
    df['venue'] = df['venue'].replace('College Field, St Peter Port', 'College Field', regex=True)
    df['venue'] = df['venue'].replace('Holkar Cricket Stadium, Indore', 'Holkar Cricket Stadium', regex=True)
    df['venue'] = df['venue'].replace("National Cricket Stadium, St George's, Grenada", 'National Cricket Stadium, Grenada', regex=True)
    df['venue'] = df['venue'].replace('Daren Sammy National Cricket Stadium, Gros Islet, St Lucia', 'Darren Sammy National Cricket Stadium, St Lucia', regex=True)
    df['venue'] = df['venue'].replace('Windsor Park, Roseau, Dominica', 'Windsor Park, Roseau', regex=True)
    df['venue'] = df['venue'].replace('Manuka Oval, Canberra', 'Manuka Oval', regex=True)
    df['venue'] = df['venue'].replace('Brisbane Cricket Ground, Woolloongabba, Brisbane', 'Brisbane Cricket Ground, Woolloongabba', regex=True)
    df['venue'] = df['venue'].replace('Gymkhana Club Ground, Nairobi', 'Gymkhana Club Ground', regex=True)
    df['venue'] = df['venue'].replace('Trent Bridge, Nottingham', 'Trent Bridge', regex=True)
    df['venue'] = df['venue'].replace('County Ground, Bristol', 'County Ground', regex=True)
    df['venue'] = df['venue'].replace('Sophia Gardens, Cardiff', 'Sophia Gardens', regex=True)
    df['venue'] = df['venue'].replace('Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow', 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium', regex=True)
    df['venue'] = df['venue'].replace('Moara Vlasiei Cricket Ground, Ilfov County', 'Moara Vlasiei Cricket Ground', regex=True)
    df['venue'] = df['venue'].replace('National Stadium, Karachi', 'National Stadium', regex=True)
    print(df['venue'].nunique())
    return df

def clean_data(df):
    #clean venue column
    df = clean_venues(df)
    #remove the unwanted columns
    df = df.drop(['penalty', 'other_wicket_type', 'other_player_dismissed','season'],axis=1)
    df['delivery_id'] = df['match_id'].astype(str) + '_' + df['innings'].astype(str) + '_' + df['ball'].astype(str)
    df = df.set_index('delivery_id')
    df = df.drop(['match_id', 'innings'],axis=1)
    return df

In [323]:
clean_data(df)

Unnamed: 0_level_0,start_date,venue,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,wicket_type,player_dismissed
delivery_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1217744_1_0.1,2020-03-04,Terdthai Cricket Ground,0.1,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,
1217744_1_0.2,2020-03-04,Terdthai Cricket Ground,0.2,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,
1217744_1_0.3,2020-03-04,Terdthai Cricket Ground,0.3,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,
1217744_1_0.4,2020-03-04,Terdthai Cricket Ground,0.4,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,
1217744_1_0.5,2020-03-04,Terdthai Cricket Ground,0.5,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598065_2_19.2,2013-05-17,"Rajiv Gandhi International Stadium, Uppal",19.2,Rajasthan Royals,Sunrisers Hyderabad,KK Cooper,PV Tambe,DW Steyn,4,0,,,,,,
598065_2_19.3,2013-05-17,"Rajiv Gandhi International Stadium, Uppal",19.3,Rajasthan Royals,Sunrisers Hyderabad,KK Cooper,PV Tambe,DW Steyn,4,0,,,,,,
598065_2_19.4,2013-05-17,"Rajiv Gandhi International Stadium, Uppal",19.4,Rajasthan Royals,Sunrisers Hyderabad,KK Cooper,PV Tambe,DW Steyn,0,0,,,,,caught,KK Cooper
598065_2_19.5,2013-05-17,"Rajiv Gandhi International Stadium, Uppal",19.5,Rajasthan Royals,Sunrisers Hyderabad,PV Tambe,SK Trivedi,DW Steyn,1,0,,,,,,


In [282]:
#search for similar strings in venue column
df = clean_venues(df)

252
182


In [294]:
# df = df.drop(['penalty', 'other_wicket_type', 'other_player_dismissed', 'season'],axis=1)
# df['delivery_id'] = df['match_id'].astype(str) + '_' + df['innings'].astype(str) + '_' + df['ball'].astype(str)
# df = df.set_index('delivery_id')
# df = df.drop(['match_id', 'innings'],axis=1)
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols.remove('start_date')
df[cat_cols] = df[cat_cols].astype('category')

ValueError: list.remove(x): x not in list

In [300]:
df['byes'].value_counts()

1.0    1407
4.0     513
2.0     190
3.0      24
5.0       2
Name: byes, dtype: int64

In [348]:
get_stats(clean_data(df), '2019-12-30', 'striker').loc['V Kohli']

0_runs            0.300902
1_runs            0.412118
2_runs            0.072140
3_runs            0.002373
4_runs            0.115330
6_runs            0.041449
num_dismissals    0.031957
wides             0.023730
Name: V Kohli, dtype: float64