In [15]:
import pandas as pd
import numpy as np

# Load the datasets
match_df = pd.read_csv('./match_level_scorecard.csv')
bowler_df = pd.read_csv('./bowler_level_scorecard.csv')
batsman_df = pd.read_csv('./batsman_level_scorecard.csv')

# Convert match date to datetime for sorting
match_df['match_dt'] = pd.to_datetime(match_df['match_dt'])
batsman_df['match_dt'] = pd.to_datetime(batsman_df['match_dt'])

# Function to convert string roster IDs to a list of integers
def convert_roster_ids(roster_ids):
    return [int(float(x)) for x in roster_ids.split(':')]

# Function to calculate the number of 50s in the last 15 games for the players in a team roster
def count_50s_last_15_games(roster, batsman_df):
    roster_batsman_df = batsman_df[batsman_df['batsman_id'].isin(roster)]
    last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
    return (last_15_games['runs'] >= 50).sum()

# Calculate team_count_50runs_last15 for each match
def calculate_team_count_50runs_last15(row):
    team1_roster = convert_roster_ids(row['team1_roster_ids'])
    team2_roster = convert_roster_ids(row['team2_roster_ids'])
    
    team1_50s = count_50s_last_15_games(team1_roster, batsman_df)
    team2_50s = count_50s_last_15_games(team2_roster, batsman_df)
    
    if team2_50s == 0:
        return np.nan  # Use NaN to handle it later
    return team1_50s / team2_50s

match_df['team_count_50runs_last15'] = match_df.apply(calculate_team_count_50runs_last15, axis=1)

# Function to calculate win percentage in the last 5 games for a team
def win_percentage_last_5_games(team_id, match_df):
    team_matches = match_df[(match_df['team1_id'] == team_id) | (match_df['team2_id'] == team_id)].nlargest(5, 'match_dt')
    wins = team_matches[team_matches['winner_id'] == team_id].shape[0]
    return wins / 5.0

# Calculate team_winp_last5 for each match
def calculate_team_winp_last5(row):
    team1_id = row['team1_id']
    team2_id = row['team2_id']
    
    team1_winp = win_percentage_last_5_games(team1_id, match_df)
    team2_winp = win_percentage_last_5_games(team2_id, match_df)
    
    if team2_winp == 0:
        return np.nan  # Use NaN to handle it later
    return team1_winp / team2_winp

match_df['team_winp_last5'] = match_df.apply(calculate_team_winp_last5, axis=1)

# Calculate team1's average runs in last 15 games
def calculate_team1only_avg_runs_last15(team1_id, batsman_df):
    team1_batsman_df = batsman_df[batsman_df['batsman_id'].isin([team1_id])]
    last_15_games = team1_batsman_df.nlargest(15, 'match_dt')
    return last_15_games['runs'].mean()

match_df['team1only_avg_runs_last15'] = match_df['team1_id'].apply(lambda team1_id: calculate_team1only_avg_runs_last15(team1_id, batsman_df))

# Calculate team1's win percentage against team2 in last 15 games
def win_percentage_against_team(team1_id, team2_id, match_df, n=15):
    matchups = match_df[((match_df['team1_id'] == team1_id) & (match_df['team2_id'] == team2_id)) | ((match_df['team1_id'] == team2_id) & (match_df['team2_id'] == team1_id))].nlargest(n, 'match_dt')
    wins = matchups[matchups['winner_id'] == team1_id].shape[0]
    return wins / n

match_df['team1_winp_team2_last15'] = match_df.apply(lambda row: win_percentage_against_team(row['team1_id'], row['team2_id'], match_df), axis=1)

# Calculate ground average runs in last 15 games
def calculate_ground_avg_runs_last15(ground_id, match_df):
    ground_matches = match_df[match_df['ground_id'] == ground_id].nlargest(15, 'match_dt')
    return ground_matches[['inning1_runs', 'inning2_runs']].mean().mean()

match_df['ground_avg_runs_last15'] = match_df['ground_id'].apply(lambda ground_id: calculate_ground_avg_runs_last15(ground_id, match_df))

# Replace NaN values with a large finite number for model training
large_finite_value = 1e6
match_df.fillna(large_finite_value, inplace=True)

# Select the relevant columns
result_df = match_df[['match id', 'team1', 'team1_id', 'team1_roster_ids', 'team2', 'team2_id', 'team2_roster_ids', 'winner', 'winner_id', 'toss winner', 'toss decision', 'venue', 'city', 'match_dt', 'lighting', 'series_name', 'season', 'ground_id', 'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15', 'team1_winp_team2_last15', 'ground_avg_runs_last15']]

# Save the result to a new CSV file
result_df.to_csv('./AdditionalTrain.csv', index=False)


  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)
  last_15_games = roster_batsman_df.groupby('batsman_id').apply(lambda x: x.nlargest(15, 'match_dt')).reset_index(drop=True)


In [16]:
print(result_df.shape)

(1689, 23)


In [17]:
train_path = './train_data_with_samplefeatures.csv'
train_data = pd.read_csv(train_path)

print(train_data.shape)

(948, 23)


Merging with traindata(given)

In [18]:
merged_df = pd.concat([result_df, train_data]).reset_index(drop=True)

In [19]:
print(merged_df.shape)

(2637, 23)


In [20]:
merged_df.to_csv('./MergedTrainData.csv',index=False)