# In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Enable DataFrame.progress_apply (used by the feature computation further down)
tqdm.pandas()

# Pandas display options: no limit on the number of columns or on column width
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Load the three scorecards and the training set
match_lvl_data = pd.read_csv(filepath_or_buffer='/mnt/data/match_level_scorecard.csv')
batsman_lvl_data = pd.read_csv(filepath_or_buffer='/mnt/data/batsman_level_scorecard.csv')
bowler_lvl_data = pd.read_csv(filepath_or_buffer='/mnt/data/bowler_level_scorecard.csv')
train_data = pd.read_csv(filepath_or_buffer='/mnt/data/train_data_with_samplefeatures.csv')

# Cache of previous lookups, keyed by (player_id, date, n, bat_or_bowl).
# Created before the function so the lookup never depends on later code having run.
cache = {}


# Function to get the last n games stats of a player before an input date using caching
def get_last_n_games_stats(player_id, date, n, bat_or_bowl):
    """Return a player's n most recent scorecard rows dated strictly before `date`.

    Args:
        player_id: Player identifier. It is converted with float() before being
            compared against the id column.
        date: Cut-off; only rows whose 'match_dt' is < date are kept.
            NOTE(review): 'match_dt' is compared as loaded from the CSV, so this
            presumably relies on a sortable date format -- confirm.
        n: Maximum number of rows to return.
        bat_or_bowl: 'bat' reads batsman_lvl_data (id column 'batsman_id'); any
            other value reads bowler_lvl_data (id column 'bowler_id').

    Returns:
        A DataFrame of at most n rows, sorted by 'match_dt' descending. Repeated
        calls with the same arguments return the same cached object, so callers
        should treat it as read-only.

    Raises:
        ValueError: if player_id cannot be converted by float().
    """
    if bat_or_bowl == 'bat':
        df, id_col = batsman_lvl_data, 'batsman_id'
    else:
        df, id_col = bowler_lvl_data, 'bowler_id'

    # n must be part of the key: otherwise a request for a different window size
    # would be answered with whichever slice happened to be cached first.
    cache_key = (player_id, date, n, bat_or_bowl)
    if cache_key not in cache:
        earlier_games = df[(df['match_dt'] < date) & (df[id_col] == float(player_id))]
        cache[cache_key] = earlier_games.sort_values(by='match_dt', ascending=False).head(n)
    return cache[cache_key]

# Function to get the total number of 4s scored by players in the roster of a team in last n games
def no4sLastn(player_list, date, n):
    """Sum the 'Fours' column over each roster member's last n batting rows.

    Args:
        player_list: Roster as a ':'-separated string of player ids (the value
            is passed through str() before splitting).
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        The total of 'Fours' across all players in the roster.
    """
    roster = str(player_list).split(':')
    return sum(
        get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')['Fours'].sum()
        for pid in roster
    )

# Function to get the total number of 6s scored by players in the roster of a team in last n games
def no6sLastn(player_list, date, n):
    """Sum the 'Sixes' column over each roster member's last n batting rows.

    Args:
        player_list: Roster as a ':'-separated string of player ids (the value
            is passed through str() before splitting).
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        The total of 'Sixes' across all players in the roster.
    """
    sixes_per_player = [
        get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')['Sixes'].sum()
        for pid in str(player_list).split(':')
    ]
    return sum(sixes_per_player)

# Function to get the average strike rate of players in the roster of a team in last n games
def avg_strike_rate_last_n_games(player_list, date, n):
    """Average 'strike_rate' over the pooled last-n batting rows of a roster.

    All rows of all players are pooled before dividing, so a player with more
    rows in the window contributes more to the average.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        Sum of 'strike_rate' divided by the number of pooled rows, or 0 when
        no player has any row before the cut-off.
    """
    strike_rate_sum = 0
    rows_seen = 0
    for pid in str(player_list).split(':'):
        recent = get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        rows_seen += len(recent)
        strike_rate_sum += recent['strike_rate'].sum()

    if rows_seen > 0:
        return strike_rate_sum / rows_seen
    return 0

# Function to get the average runs scored by players in the roster of a team in last n games
def avg_runs_last_n_games(player_list, date, n):
    """Average 'runs' over the pooled last-n batting rows of a roster.

    All rows of all players are pooled before dividing, so a player with more
    rows in the window contributes more to the average.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        Sum of 'runs' divided by the number of pooled rows, or 0 when no player
        has any row before the cut-off.
    """
    frames = [
        get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        for pid in str(player_list).split(':')
    ]
    runs_sum = sum(frame['runs'].sum() for frame in frames)
    rows_seen = sum(len(frame) for frame in frames)
    return runs_sum / rows_seen if rows_seen > 0 else 0

# Function to get the total number of balls faced by players in the roster of a team in last n games
def avg_balls_faced_last_n_games(player_list, date, n):
    """Sum the 'balls_faced' column over each roster member's last n batting rows.

    Despite the "avg" in the name, no division is performed: the result is a total.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        The total of 'balls_faced' across all players in the roster.
    """
    roster = str(player_list).split(':')
    return sum(
        get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')['balls_faced'].sum()
        for pid in roster
    )

# Function to get the total count of a given wicket kind for players in the roster of a team in last n games
def avg_wicket_kind_last_n_games(player_list, date, n, wicket_kind):
    """Count batting rows whose 'wicket kind' equals `wicket_kind` for a roster.

    Despite the "avg" in the name, no division is performed: the result is a count.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.
        wicket_kind: Value matched exactly against the 'wicket kind' column.

    Returns:
        Number of matching rows across all players in the roster.
    """
    matches_per_player = []
    for pid in str(player_list).split(':'):
        recent = get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        is_this_kind = recent['wicket kind'] == wicket_kind
        matches_per_player.append(is_this_kind.sum())
    return sum(matches_per_player)

# Function to get the standard deviation of runs scored by players in the roster of a team in last n games
def runs_consistency_last_n_games(player_list, date, n):
    """Standard deviation of 'runs' over the pooled last-n batting rows of a roster.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        np.std of the pooled 'runs' values (NumPy's default, i.e. the population
        standard deviation), or 0 when there are no rows at all.
    """
    pooled_runs = [
        value
        for pid in str(player_list).split(':')
        for value in get_last_n_games_stats(
            player_id=pid, date=date, n=n, bat_or_bowl='bat')['runs'].tolist()
    ]
    if not pooled_runs:
        return 0
    return np.std(pooled_runs)

# Function to get the standard deviation of strike rates of players in the roster of a team in last n games
def strike_rate_consistency_last_n_games(player_list, date, n):
    """Standard deviation of 'strike_rate' over the pooled last-n batting rows of a roster.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        np.std of the pooled 'strike_rate' values (NumPy's default, i.e. the
        population standard deviation), or 0 when there are no rows at all.
    """
    pooled_strike_rates = []
    for pid in str(player_list).split(':'):
        recent = get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        pooled_strike_rates += recent['strike_rate'].tolist()

    if pooled_strike_rates:
        return np.std(pooled_strike_rates)
    return 0

# Function to get the boundary percentage (percentage of runs from 4s and 6s) by players in the roster of a team in last n games
def boundary_percentage_last_n_games(player_list, date, n):
    """Share of a roster's recent runs that came from fours and sixes.

    The value is a plain ratio (boundary runs / total runs); it is not
    multiplied by 100.

    Args:
        player_list: Roster as a ':'-separated string of player ids.
        date: Cut-off forwarded to get_last_n_games_stats; only earlier rows count.
        n: Number of most recent rows taken per player.

    Returns:
        (4 * fours + 6 * sixes) / runs over the pooled rows, or 0 when the
        pooled 'runs' total is not greater than zero.
    """
    runs_total = 0
    boundary_runs = 0
    for pid in str(player_list).split(':'):
        recent = get_last_n_games_stats(player_id=pid, date=date, n=n, bat_or_bowl='bat')
        fours = recent['Fours'].sum()
        sixes = recent['Sixes'].sum()
        runs_total += recent['runs'].sum()
        boundary_runs += fours * 4 + sixes * 6

    if runs_total > 0:
        return boundary_runs / runs_total
    return 0

# List of wicket kinds to compute
wicket_kinds = ['caught', 'bowled', 'lbw', 'run out', 'stumped']


def _team1_vs_team2_ratio(feature_fn, n=15, **feature_kwargs):
    """Return (team1 value + 1) / (team2 value + 1) for every row of train_data.

    Args:
        feature_fn: Roster-level feature function; it is called once per row and
            per team as feature_fn(player_list=..., date=..., n=n, **feature_kwargs).
        n: Number of most recent games per player handed to feature_fn.
        **feature_kwargs: Extra keyword arguments forwarded to feature_fn unchanged.

    Returns:
        A Series aligned with train_data holding the team1/team2 ratio.
    """
    def _per_team(roster_col):
        # One progress bar per team, evaluated row by row.
        return train_data.progress_apply(
            lambda row: feature_fn(player_list=row[roster_col], date=row['match_dt'],
                                   n=n, **feature_kwargs),
            axis=1,
        )

    team1_values = _per_team('team1_roster_ids')
    team2_values = _per_team('team2_roster_ids')
    # Adding 1 to both sides avoids division by zero when team2's value is 0.
    return (team1_values + 1) / (team2_values + 1)


# (output column, roster-level feature function, extra keyword arguments), listed in
# the order in which the columns are appended to train_data.
ratio_features = [
    ('team_count_4runs_last15', no4sLastn, {}),
    ('team_count_6runs_last15', no6sLastn, {}),
    ('avg_stkrate', avg_strike_rate_last_n_games, {}),
    ('avg_runs', avg_runs_last_n_games, {}),
    ('avg_balls_faced', avg_balls_faced_last_n_games, {}),
]
# One ratio column per wicket kind
ratio_features += [
    (f'avg_{wicket_kind}', avg_wicket_kind_last_n_games, {'wicket_kind': wicket_kind})
    for wicket_kind in wicket_kinds
]
# Consistency and aggressiveness metrics
ratio_features += [
    ('runs_consistency', runs_consistency_last_n_games, {}),
    ('strike_rate_consistency', strike_rate_consistency_last_n_games, {}),
    ('boundary_percentage', boundary_percentage_last_n_games, {}),
]

# Compute features for train_data. Only the final ratio column is stored; the
# per-team values are kept as temporaries instead of being written and dropped.
for column_name, feature_fn, feature_kwargs in ratio_features:
    train_data[column_name] = _team1_vs_team2_ratio(feature_fn, **feature_kwargs)

# Save the updated train_data to a new CSV file
train_data.to_csv('/mnt/data/train_data_with_new_features.csv', index=False)

print("Feature computation complete and data saved to 'train_data_with_new_features.csv'")

