In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load datasets
match_lvl_data = pd.read_csv('match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('bowler_level_scorecard.csv')
train_data = pd.read_csv('train_data_with_samplefeatures.csv')
test_data = pd.read_csv('test_data_with_samplefeatures.csv')
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    
    Output-None
    
    Returns- dataframe having bowling/batting stats from last n games of a player before an input date. 
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

def get_top_n_players_stats(player_list, date, n, stat, bat_or_bowl, top_n=3):
    """
    Function to get the top N players' statistics in the last n games.
    
    Parameters:
    - player_list: ':' separated list of player ids in the roster of a team.
    - date: match date of the game to calculate this feature.
    - n: Number of games to look-back and create this feature.
    - stat: Statistic to calculate ('runs', 'wickets', 'strike_rate', etc.)
    - bat_or_bowl: 'bat' for batsmen stats, 'bowl' for bowler stats.
    - top_n: Number of top players to consider for the calculation.
    
    Returns:
    - Sum of the statistic for the top N players.
    """
    player_list = str(player_list).split(':')
    stats = []
    
    for player in player_list:
        if bat_or_bowl == 'bat':
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
            stats.append(df_rel[stat].sum())
        else:
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
            stats.append(df_rel[stat].sum())
    
    top_n_stats = sorted(stats, reverse=True)[:top_n]
    return np.nansum(top_n_stats)

def add_new_features(data):
    data['team1_top3_batsmen_runs_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    data['team2_top3_batsmen_runs_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    
    data['team1_top3_bowlers_wickets_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    data['team2_top3_bowlers_wickets_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    
    data['team1_centuries_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='centuries', bat_or_bowl='bat'), axis=1)
    data['team2_centuries_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='centuries', bat_or_bowl='bat'), axis=1)
    
    data['team1_top3_bowlers_economy_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl'), axis=1)
    data['team2_top3_bowlers_economy_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl'), axis=1)
    
    data['team1_top3_batsmen_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat'), axis=1)
    data['team2_top3_batsmen_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat'), axis=1)
    
    data['team1_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team1_id'], x['match_dt'], 10), axis=1)
    data['team2_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team2_id'], x['match_dt'], 10), axis=1)
    
    data['team1_half_centuries_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='half_centuries', bat_or_bowl='bat'), axis=1)
    data['team2_half_centuries_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='half_centuries', bat_or_bowl='bat'), axis=1)
    
    data['team1_top3_bowlers_maidens_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='maiden', bat_or_bowl='bowl'), axis=1)
    data['team2_top3_bowlers_maidens_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='maiden', bat_or_bowl='bowl'), axis=1)
    
    data['team1_sixes_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    data['team2_sixes_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    
    data['team1_top3_bowlers_runs_conceded_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bowl'), axis=1)
    data['team2_top3_bowlers_runs_conceded_last10'] = data.progress_apply(lambda x: \
        get_top_n_players_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bowl'), axis=1)
    
    return data

# Adding new features to train and test data
train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

# # Display the updated train and test data
# train_data.shape, train_data.head( &#8203;:citation[oaicite:0]{index=0}&#8203;


C:\Users\riori\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\riori\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:18<00:00, 51.47it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:18<00:00, 51.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:15<00:00, 60.48it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:15<00:00, 60.19it/s]
  0%|                                                                                        | 1/948 [00:00<00:03, 250.45it/s]


KeyError: 'centuries'

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load datasets
match_lvl_data = pd.read_csv('match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('bowler_level_scorecard.csv')
train_data = pd.read_csv('train_data_with_samplefeatures.csv')
test_data = pd.read_csv('test_data_with_samplefeatures.csv')

def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    
    Output-None
    
    Returns- dataframe having bowling/batting stats from last n games of a player before an input date. 
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = match_lvl_data[(match_lvl_data['match_dt']<date)&\
                      ((match_lvl_data['team1_id']==team_id)|(match_lvl_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

def get_team_stats(player_list, date, n, stat, bat_or_bowl):
    """
    Function to get the team's statistics in the last n games.
    
    Parameters:
    - player_list: ':' separated list of player ids in the roster of a team.
    - date: match date of the game to calculate this feature.
    - n: Number of games to look-back and create this feature.
    - stat: Statistic to calculate ('runs', 'wickets', 'strike_rate', etc.)
    - bat_or_bowl: 'bat' for batsmen stats, 'bowl' for bowler stats.
    
    Returns:
    - Sum of the statistic for the team.
    """
    player_list = str(player_list).split(':')
    stats = []
    
    for player in player_list:
        if bat_or_bowl == 'bat':
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
            stats.append(df_rel[stat].sum())
        else:
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
            stats.append(df_rel[stat].sum())
    
    return np.nansum(stats)

def add_new_features(data):
    data['team1_total_runs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    data['team2_total_runs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_wickets_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    data['team2_total_wickets_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    
    data['team1_avg_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat') / 10, axis=1)
    data['team2_avg_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat') / 10, axis=1)
    
    data['team1_avg_economy_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl') / 10, axis=1)
    data['team2_avg_economy_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl') / 10, axis=1)
    
    data['team1_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team1_id'], x['match_dt'], 10), axis=1)
    data['team2_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team2_id'], x['match_dt'], 10), axis=1)
    
    data['team1_total_fours_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='Fours', bat_or_bowl='bat'), axis=1)
    data['team2_total_fours_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='Fours', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_sixes_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    data['team2_total_sixes_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_dots_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='dots', bat_or_bowl='bowl'), axis=1)
    data['team2_total_dots_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='dots', bat_or_bowl='bowl'), axis=1)
    
    data['team1_total_wides_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='wides', bat_or_bowl='bowl'), axis=1)
    data['team2_total_wides_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='wides', bat_or_bowl='bowl'), axis=1)
    
    data['team1_total_noballs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='noballs', bat_or_bowl='bowl'), axis=1)
    data['team2_total_noballs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='noballs', bat_or_bowl='bowl'), axis=1)
    
    return data

#
train_data = add_new_features(train_data)
test_data = add_new_features(test_data)


100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 49.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 48.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 56.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 57.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:20<00:00, 47.26it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:21<00:00, 45.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:17<00:00, 54.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:18<00

In [4]:
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team1_total_fours_last10,team2_total_fours_last10,team1_total_sixes_last10,team2_total_sixes_last10,team1_total_dots_last10,team2_total_dots_last10,team1_total_wides_last10,team2_total_wides_last10,team1_total_noballs_last10,team2_total_noballs_last10
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,49.0,40.0,19.0,12.0,306.0,176.0,11,9,3,0
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,118.0,120.0,45.0,92.0,464.0,522.0,23,76,4,2
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,117.0,149.0,48.0,69.0,334.0,267.0,29,21,2,0
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,141.0,92.0,108.0,62.0,467.0,491.0,66,69,7,8
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,130.0,169.0,62.0,59.0,529.0,506.0,43,32,6,3


In [5]:
train_data.to_csv('train_data_trial_4.csv', index=False)
test_data.to_csv('test_data_trial_4.csv', index=False)

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load datasets
match_lvl_data = pd.read_csv('match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv('batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv('bowler_level_scorecard.csv')
train_data = pd.read_csv('train_data_with_samplefeatures.csv')
test_data = pd.read_csv('test_data_with_samplefeatures.csv')
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    
    Output-None
    
    Returns- dataframe having bowling/batting stats from last n games of a player before an input date. 
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_lvl_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_lvl_data
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

def get_team_stats(player_list, date, n, stat, bat_or_bowl):
    """
    Function to get the team's statistics in the last n games.
    
    Parameters:
    - player_list: ':' separated list of player ids in the roster of a team.
    - date: match date of the game to calculate this feature.
    - n: Number of games to look-back and create this feature.
    - stat: Statistic to calculate ('runs', 'wickets', 'strike_rate', etc.)
    - bat_or_bowl: 'bat' for batsmen stats, 'bowl' for bowler stats.
    
    Returns:
    - Sum of the statistic for the team.
    """
    player_list = str(player_list).split(':')
    stats = []
    
    for player in player_list:
        if bat_or_bowl == 'bat':
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
            stats.append(df_rel[stat].sum())
        else:
            df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')
            stats.append(df_rel[stat].sum())
    
    return np.nansum(stats)

def add_new_features(data):
    data['team1_total_runs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    data['team2_total_runs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='runs', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_wickets_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    data['team2_total_wickets_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='wicket_count', bat_or_bowl='bowl'), axis=1)
    
    data['team1_avg_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat') / 10, axis=1)
    data['team2_avg_strike_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='strike_rate', bat_or_bowl='bat') / 10, axis=1)
    
    data['team1_avg_economy_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl') / 10, axis=1)
    data['team2_avg_economy_rate_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='economy', bat_or_bowl='bowl') / 10, axis=1)
    
    data['team1_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team1_id'], x['match_dt'], 10), axis=1)
    data['team2_wins_last10'] = data.progress_apply(lambda x: \
        winpLastn(x['team2_id'], x['match_dt'], 10), axis=1)
    
    data['team1_total_fours_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='Fours', bat_or_bowl='bat'), axis=1)
    data['team2_total_fours_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='Fours', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_sixes_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    data['team2_total_sixes_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='Sixes', bat_or_bowl='bat'), axis=1)
    
    data['team1_total_dots_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='dots', bat_or_bowl='bowl'), axis=1)
    data['team2_total_dots_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='dots', bat_or_bowl='bowl'), axis=1)
    
    data['team1_total_wides_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='wides', bat_or_bowl='bowl'), axis=1)
    data['team2_total_wides_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='wides', bat_or_bowl='bowl'), axis=1)
    
    data['team1_total_noballs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team1_roster_ids'], date=x['match_dt'], n=10, stat='noballs', bat_or_bowl='bowl'), axis=1)
    data['team2_total_noballs_last10'] = data.progress_apply(lambda x: \
        get_team_stats(player_list=x['team2_roster_ids'], date=x['match_dt'], n=10, stat='noballs', bat_or_bowl='bowl'), axis=1)
    
    # Calculate ratios and drop original columns
    data['total_runs_last10_ratio'] = (data['team1_total_runs_last10'] + 1) / (data['team2_total_runs_last10'] + 1)
    data['total_wickets_last10_ratio'] = (data['team1_total_wickets_last10'] + 1) / (data['team2_total_wickets_last10'] + 1)
    data['avg_strike_rate_last10_ratio'] = (data['team1_avg_strike_rate_last10'] + 1) / (data['team2_avg_strike_rate_last10'] + 1)
    data['avg_economy_rate_last10_ratio'] = (data['team1_avg_economy_rate_last10'] + 1) / (data['team2_avg_economy_rate_last10'] + 1)
    data['wins_last10_ratio'] = (data['team1_wins_last10'] + 1) / (data['team2_wins_last10'] + 1)
    data['total_fours_last10_ratio'] = (data['team1_total_fours_last10'] + 1) / (data['team2_total_fours_last10'] + 1)
    data['total_sixes_last10_ratio'] = (data['team1_total_sixes_last10'] + 1) / (data['team2_total_sixes_last10'] + 1)
    data['total_dots_last10_ratio'] = (data['team1_total_dots_last10'] + 1) / (data['team2_total_dots_last10'] + 1)
    data['total_wides_last10_ratio'] = (data['team1_total_wides_last10'] + 1) / (data['team2_total_wides_last10'] + 1)
    data['total_noballs_last10_ratio'] = (data['team1_total_noballs_last10'] + 1) / (data['team2_total_noballs_last10'] + 1)

    # Drop original columns
    data.drop(columns=['team1_total_runs_last10', 'team2_total_runs_last10', 'team1_total_wickets_last10', 'team2_total_wickets_last10',
                       'team1_avg_strike_rate_last10', 'team2_avg_strike_rate_last10', 'team1_avg_economy_rate_last10', 'team2_avg_economy_rate_last10',
                       'team1_wins_last10', 'team2_wins_last10', 'team1_total_fours_last10', 'team2_total_fours_last10', 'team1_total_sixes_last10',
                       'team2_total_sixes_last10', 'team1_total_dots_last10', 'team2_total_dots_last10', 'team1_total_wides_last10', 'team2_total_wides_last10',
                       'team1_total_noballs_last10', 'team2_total_noballs_last10'], inplace=True)

    return data

# Add new features to train and test data
train_data = add_new_features(train_data)
test_data = add_new_features(test_data)

# Prepare X and y
# X, y = train_data[['toss_winner_01', 'toss_decision_01', 'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 
#                    'team1_winp_team2_last15', 'ground_avg_runs_last15', 'total_runs_last10_ratio', 'total_wickets_last10_ratio', 
#                    'avg_strike_rate_last10_ratio', 'avg_economy_rate_last10_ratio', 'wins_last10_ratio', 'total_fours_last10_ratio', 
#                    'total_sixes_last10_ratio', 'total_dots_last10_ratio', 'total_wides_last10_ratio', 'total_noballs_last10_ratio']], train_data['winner_01']



100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:18<00:00, 51.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:18<00:00, 50.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 58.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 58.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 49.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 49.03it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 57.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00

In [9]:
train_data.head()

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,total_runs_last10_ratio,total_wickets_last10_ratio,avg_strike_rate_last10_ratio,avg_economy_rate_last10_ratio,wins_last10_ratio,total_fours_last10_ratio,total_sixes_last10_ratio,total_dots_last10_ratio,total_wides_last10_ratio,total_noballs_last10_ratio
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,1.192744,1.652174,1.318388,1.943999,0.672131,1.219512,1.538462,1.734463,1.2,4.0
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,0.792676,0.746835,0.815299,0.804954,2.904762,0.983471,0.494624,0.889101,0.311688,1.666667
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,0.82042,1.22449,0.92916,0.868069,0.756098,0.786667,0.7,1.25,1.363636,3.0
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,1.610184,0.886076,1.18794,0.759255,1.196078,1.526882,1.730159,0.95122,0.957143,0.888889
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,0.91115,1.027778,1.28173,1.191714,0.859155,0.770588,1.05,1.045365,1.333333,1.75


In [10]:
train_data.to_csv('train_data_trial_4.csv', index=False)
test_data.to_csv('test_data_trial_4.csv', index=False)