In [None]:
!pip install catboost

In [None]:
!pip install xgboost

In [None]:
!pip install lightgbm

In [34]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV

In [35]:
data = pd.read_csv('./csv/train_data_with_samplefeatures.csv')
data = data.fillna(0)
data['winner'] = data.apply(lambda row: 0 if row['winner'] == row['team1'] else 1, axis=1)

In [36]:
label_encoders = {}

categorical_features = ['team1', 'team2', 'toss winner', 'toss decision',
                        'venue', 'city', 'lighting', 'series_name', 'season']


for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
    
    
target_encoder = LabelEncoder()
data['winner'] = target_encoder.fit_transform(data['winner'])



In [37]:
data['match_dt'] = pd.to_datetime(data['match_dt'])

data['year'] = data['match_dt'].dt.year
data['month'] = data['match_dt'].dt.month
data['day'] = data['match_dt'].dt.day
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   match id                   948 non-null    int64         
 1   team1                      948 non-null    int32         
 2   team1_id                   948 non-null    int64         
 3   team1_roster_ids           948 non-null    object        
 4   team2                      948 non-null    int32         
 5   team2_id                   948 non-null    int64         
 6   team2_roster_ids           948 non-null    object        
 7   winner                     948 non-null    int64         
 8   winner_id                  948 non-null    int64         
 9   toss winner                948 non-null    int32         
 10  toss decision              948 non-null    int32         
 11  venue                      948 non-null    int32         
 12  city    

In [38]:
# add team column to batter_df

team_df = pd.read_csv('./csv/train_data_with_samplefeatures.csv')
batter_df = pd.read_csv('./csv/batsman_level_scorecard.csv')

player_team_mapping={}
for _, row in team_df.iterrows():
    match_id = row['match id']
    team_name = row['team1']
    roster = row['team1_roster_ids'].split(':')
    for player_id in roster:
        player_team_mapping[(match_id, float(player_id))] = team_name

for _, row in team_df.iterrows():
    match_id = row['match id']
    team_name = row['team2']
    roster = row['team2_roster_ids'].split(':')
    for player_id in roster:
        player_team_mapping[(match_id, float(player_id))] = team_name
        
batter_df['team'] = batter_df.apply(lambda x: player_team_mapping.get((x['match id'], x['batsman_id']), None), axis=1)
print(batter_df)

       match id batsman  batsman_id                           batsman_details  \
0       8638034   KD Ce   7907451.0  NZ:Right-hand bat:Right-arm medium-fast:   
1       8638034   TL St   4381761.0                   NZ:Right-hand bat:None:   
2       8638034   HR Cr   4949790.0     NZ:Right-hand bat:Right-arm offbreak:   
3       8638034   BR Hn   3834305.0  NZ:Right-hand bat:Right-arm medium-fast:   
4       8638034   SC Kn   3776849.0  NZ:Right-hand bat:Right-arm fast-medium:   
...         ...     ...         ...                                       ...   
24478   9433633   CP Wd   2173688.0  ENG:Right-hand bat:Left-arm medium-fast:   
24479   9433633   JJ Wy   5241564.0    ENG:Right-hand bat:Right-arm offbreak:   
24480   9433633    G Rn   4898074.0                   SA:Right-hand bat:None:   
24481   9433633   BJ Ws   8048074.0                  ENG:Right-hand bat:None:   
24482   9433633   BG Ch   6988596.0  ENG:Left-hand bat:Right-arm medium-fast:   

       is_batsman_captain  

In [39]:
# calculate team top 4 batsman average from the team they are playing

average_runs_by_batsman = batter_df.groupby(['team', 'batsman_id'])['runs'].mean().reset_index()
average_runs_dict = average_runs_by_batsman.set_index(["batsman_id", "team"])["runs"].to_dict()

def mean_top4_avg_scores(roster, team, avg_dict):
    player_ids = roster.split(":")
    top4_avg_scores = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_score = avg_dict[key]
            top4_avg_scores.append(avg_score)
    
    if top4_avg_scores:
        top4_avg_scores = sorted(top4_avg_scores, reverse=True)[:4]
        mean_top4_avg = sum(top4_avg_scores) / len(top4_avg_scores)
    else:
        mean_top4_avg = 0
    
    return mean_top4_avg
team_df["team1_top4_avg"] = team_df.apply(lambda row: mean_top4_avg_scores(row["team1_roster_ids"], row["team1"], average_runs_dict), axis=1)
team_df["team2_top4_avg"] = team_df.apply(lambda row: mean_top4_avg_scores(row["team2_roster_ids"], row["team2"], average_runs_dict), axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [40]:
# calculate no. of rel batsman(avg run from that team>20)

rel_bats=average_runs_by_batsman[average_runs_by_batsman['runs']>20]
rel_bats_dict=rel_bats.set_index(["batsman_id", "team"])["runs"].to_dict()

def mean_top4_count_rel(roster, team, avg_dict):
    player_ids = roster.split(":")
    top4_avg_scores = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_score = avg_dict[key]
            top4_avg_scores.append(avg_score)
    
    return len(top4_avg_scores)

team_df["team1_count_rel_bats"] = team_df.apply(lambda row: mean_top4_count_rel(row["team1_roster_ids"], row["team1"], rel_bats_dict), axis=1)
team_df["team2_count_rel_bats"] = team_df.apply(lambda row: mean_top4_count_rel(row["team2_roster_ids"], row["team2"], rel_bats_dict), axis=1)
team_df

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,team1_top4_avg,team2_top4_avg,team1_count_rel_bats,team2_count_rel_bats
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,7398,1.666667,0.672131,139.000000,100.00,157.178571,20.375000,17.625000,2,1
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,1406,1.285714,1.952381,156.000000,50.00,103.500000,21.375000,24.172890,3,3
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,251,0.857143,0.672131,173.266667,0.00,154.333333,26.270175,21.473611,4,3
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,14300,2.166667,1.975610,164.266667,50.00,144.250000,28.887683,20.958333,4,3
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,7118,0.818182,1.327869,164.666667,0.00,189.000000,24.121732,33.132305,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,5004,0.823529,1.000000,147.333333,66.67,166.400000,27.325700,28.108333,4,4
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,1042,1.571429,0.012346,167.400000,0.00,170.466667,26.071429,25.654764,4,4
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,1224,3.000000,1.000000,,0.00,,25.610119,24.993056,3,3
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,4661,0.789474,1.487805,182.800000,66.67,133.375000,33.748904,41.836806,4,5


In [41]:
# avg strike rate of top4 batsman by the no. of balls they faced

average_strike = batter_df.groupby(['team', 'batsman_id']).agg({
    'strike_rate': 'mean',
    'balls_faced': 'mean'
}).reset_index()
average_strike = average_strike[average_strike['balls_faced'] > 10]
average_strike_dict = average_strike.set_index(["batsman_id","team"])["strike_rate"].to_dict()
balls_faced_dict = average_strike.set_index(["batsman_id","team"])["balls_faced"].to_dict()

def mean_top4_avg_strikes(roster, team, avg_dict, balls_dict):
    player_ids = roster.split(":")
    valid_player_ids = [(float(player_id), team) for player_id in player_ids 
                        if (float(player_id), team) in avg_dict and (float(player_id), team) in balls_dict]
    player_stats = [(player_id, avg_dict[player_id], balls_dict[player_id]) for player_id in valid_player_ids]
    top4_players = sorted(player_stats, key=lambda x: x[2], reverse=True)[:4]
    
    if top4_players:
        top4_avg_strike_rate = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_strike_rate = 0
    return top4_avg_strike_rate


team_df["team1_top4_bats"] = team_df.apply(lambda row: mean_top4_avg_strikes(row["team1_roster_ids"], row["team1"], average_strike_dict, balls_faced_dict), axis=1)
team_df["team2_top4_bats"] = team_df.apply(lambda row: mean_top4_avg_strikes(row["team2_roster_ids"], row["team2"], average_strike_dict, balls_faced_dict), axis=1)
print(team_df) 

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [42]:
# avg top 4 runs from a team against a specific team


combined_df=pd.merge(team_df,batter_df,on='match id',how='inner')
combined_df['opponents']=combined_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)
grouped_df=combined_df.groupby(['batsman_id','team','opponents'])['runs'].mean().reset_index()
grouped_df.rename(columns={'runs':'avg_runs_ag'},inplace=True)

avg_ag_run_dict={}
for _,row in grouped_df.iterrows():
    key=(row['batsman_id'],row['team'],row['opponents'])
    avg_ag_run_dict[key]=row['avg_runs_ag']
print(avg_ag_run_dict)

def mean_top4_ag(roster, team, opponents, avg_dict):

    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, opponents) for player_id in player_ids
                        if (float(player_id), team, opponents) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag


team_df['team1_avg_top4_ag'] = team_df.apply(lambda row: mean_top4_ag(row["team1_roster_ids"], row["team1"], row["team2"], avg_ag_run_dict), axis=1)
team_df['team2_avg_top4_ag'] = team_df.apply(lambda row: mean_top4_ag(row["team2_roster_ids"], row["team2"], row["team1"], avg_ag_run_dict), axis=1)
print(team_df)
# team_df['team1_avg_top4_ag'].describe()

{(34061.0, 'Ne', 'Bm Bs'): 13.0, (34061.0, 'Ne', 'Dm'): 0.0, (34061.0, 'Ne', 'Le'): 12.5, (34061.0, 'Ne', 'Ye'): 56.0, (34061.0, 'Sy Ss', 'Ae Ss'): 12.666666666666666, (34061.0, 'Sy Ss', 'Be Ht'): 32.0, (34061.0, 'Sy Ss', 'Ht Hs'): 0.0, (34061.0, 'Sy Ss', 'Me Rs'): 14.5, (34061.0, 'Sy Ss', 'Me Ss'): 4.0, (34061.0, 'Sy Ss', 'Ph Ss'): 8.0, (34061.0, 'Sy Ss', 'Sy Tr'): 4.0, (37351.0, 'Aa', 'Ed'): 12.5, (37351.0, 'Aa', 'Ia'): 20.0, (37351.0, 'Aa', 'Pn'): 55.0, (37351.0, 'Aa', 'Si La'): 31.6, (37351.0, 'Aa', 'Wt Is'): 36.5, (37351.0, 'Ka Kt Rs', 'Di Cs'): 3.0, (37351.0, 'Ka Kt Rs', 'Lw Sr Gs'): 14.0, (37351.0, 'Ka Kt Rs', 'Rn Rs'): 31.0, (37351.0, 'Ka Kt Rs', 'Ss Hd'): 7.0, (37351.0, 'Me Rs', 'Ae Ss'): 63.0, (37351.0, 'Me Rs', 'Be Ht'): 13.25, (37351.0, 'Me Rs', 'Ht Hs'): 75.0, (37351.0, 'Me Rs', 'Me Ss'): 27.25, (37351.0, 'Me Rs', 'Sy Ss'): 24.0, (37351.0, 'Me Rs', 'Sy Tr'): 76.0, (41740.0, 'Sy Ss', 'Ae Ss'): 11.333333333333334, (41740.0, 'Sy Ss', 'Be Ht'): 28.0, (41740.0, 'Sy Ss', 'Ht Hs'

In [43]:
# avg top 4 runs from a team in a specific venue
combined_df=pd.merge(team_df,batter_df,on='match id',how='inner')
combined_df['opponents']=combined_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)
grouped_df=combined_df.groupby(['batsman_id','team','venue'])['runs'].mean().reset_index()
grouped_df.rename(columns={'runs':'avg_runs_ve'},inplace=True)

avg_ve_run_dict={}
for _,row in grouped_df.iterrows():
    key=(row['batsman_id'],row['team'],row['venue'])
    avg_ve_run_dict[key]=row['avg_runs_ve']

def mean_top4_ve(roster, team, venue, avg_dict):
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, venue) for player_id in player_ids
                        if (float(player_id), team, venue) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]
    top4_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:4]
    if top4_players:
        top4_avg_ve = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ve = 0
    
    return top4_avg_ve

team_df['team1_avg_top4_ve'] = team_df.apply(lambda row: mean_top4_ag(row["team1_roster_ids"], row["team1"], row["venue"], avg_ve_run_dict), axis=1)
team_df['team2_avg_top4_ve'] = team_df.apply(lambda row: mean_top4_ag(row["team2_roster_ids"], row["team2"], row["venue"], avg_ve_run_dict), axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [44]:
# avg top 4 runs from a team in a specific innings
combined_df=pd.merge(team_df,batter_df,on='match id',how='inner')
combined_df['opponents']=combined_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)


def determine_innings(row):
    if(row['team']==row['toss winner']):
        if row['toss decision']=='bat':
            return 1
        else:
            return 2
    else:
        if row['toss decision']=='bat':
            return 2
        else:
            return 1

def deter_innings(toss_winner,team,toss_decision):
    if(team==toss_winner):
        if toss_decision=='bat':
            return 1
        else:
            return 2
    else:
        if toss_decision=='bat':
            return 2
        else:
            return 1        
combined_df['innings']=combined_df.apply(determine_innings,axis=1)
grouped_df=combined_df.groupby(['batsman_id','team','innings'])['runs'].mean().reset_index()
grouped_df.rename(columns={'runs':'avg_runs_in'},inplace=True)
avg_in_run_dict={}


for _,row in grouped_df.iterrows():
    key=(row['batsman_id'],row['team'],row['innings'])
    avg_in_run_dict[key]=row['avg_runs_in']


def mean_top4_in(roster, team, toss_decision,toss_winner, avg_dict):
    innings=deter_innings(toss_winner,team,toss_decision)
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, innings) for player_id in player_ids
                        if (float(player_id), team, innings) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag


team_df['team1_avg_top4_in'] = team_df.apply(lambda row: mean_top4_in(row["team1_roster_ids"], row["team1"], row["toss decision"],row['toss winner'], avg_in_run_dict), axis=1)
team_df['team2_avg_top4_in'] = team_df.apply(lambda row: mean_top4_in(row["team2_roster_ids"], row["team2"], row["toss decision"],row['toss winner'], avg_in_run_dict), axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [45]:
# dictionary to avg runs in last 5 matches
batter_df['match_dt'] = pd.to_datetime(batter_df['match_dt'])

# Function to get batsman form based on last 5 matches relative to each match date
def get_batsman_form(batsman_id, df, threshold=25, matches=5):
    batsman_data = df[df['batsman_id'] == batsman_id].sort_values(by='match_dt', ascending=False)
    batsman_form = {}

    for _, row in batsman_data.iterrows():
        start_date = row['match_dt'] - timedelta(days=matches-1)  # Start date for the last 'matches' matches
        recent_matches = batsman_data[(batsman_data['match_dt'] >= start_date) & (batsman_data['match_dt'] <= row['match_dt'])]
        if not recent_matches.empty:
            avg_runs = recent_matches['runs'].mean()
    return avg_runs

# Create the dictionary
batsman_form_dict = {}
for batsman_id in batter_df['batsman_id'].unique():
    batsman_form_dict[batsman_id] = get_batsman_form(batsman_id, batter_df)

print(batsman_form_dict)



{7907451.0: 7.0, 4381761.0: 46.0, 4949790.0: 9.0, 3834305.0: 28.0, 3776849.0: 18.0, 6718340.0: 1.0, 31464.0: 8.0, 258649.0: 19.0, 2653993.0: 13.0, 6718326.0: 91.0, 6718382.0: 40.0, 37351.0: 18.0, 46794.0: 87.0, 1594319.0: 8.0, 7534687.0: 1.0, 7537067.0: 1.0, 5406540.0: 7.0, 2231928.0: 3.0, 181404.0: 33.0, 1506098.0: 48.0, 1749075.0: 45.0, 36665.0: 11.0, 2083409.0: 10.0, 7869987.0: 48.0, 7620283.0: 10.0, 3063696.0: 43.0, 34061.0: 6.0, 3200756.0: 0.0, 4756982.0: 24.0, 7455818.0: 0.0, 49496.0: 2.0, 2076192.0: 20.0, 4002340.0: 2.0, 7620269.0: 40.0, 2535420.0: 31.0, 4967738.0: 5.0, 3995991.0: 22.0, 2286437.0: 1.0, 87191.0: 19.0, 5786766.0: 17.0, 3114803.0: 20.0, 7543647.0: 3.0, 2319638.0: 49.0, 256080.0: 6.0, 2690498.0: 3.0, 6129276.0: 14.0, 2666705.0: 6.0, 6317142.0: 10.0, 3834375.0: 0.0, 2236086.0: 14.0, 7918280.0: 17.0, 3913447.0: 13.0, 4223883.0: 31.0, 2161599.0: 12.0, 5788418.0: 0.0, 319948.0: 26.0, 6249256.0: 8.0, 2340372.0: 3.0, 1655436.0: 75.0, 363047.0: 10.0, 2720759.0: 2.0, 233711

In [46]:
# team average by last 5 match

def avg_last5(roster,avg_dict):
    player_ids=roster.split(":")
    player_scores=[]
    for player_id in player_ids:
        key=(float(player_id))
        if key in avg_dict:
            score=avg_dict[key]
            player_scores.append(score)
#             print(player_scores)
    if player_scores:
        player_scores = sorted(player_scores, reverse=True)[:4]
        mean_top4_avg = sum(player_scores) / len(player_scores)
    else:
        mean_top4_avg = 0
    return mean_top4_avg
team_df['team1_last5_avg']=team_df.apply(lambda row: avg_last5(row['team1_roster_ids'],batsman_form_dict),axis=1)
team_df['team2_last5_avg']=team_df.apply(lambda row: avg_last5(row['team2_roster_ids'],batsman_form_dict),axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [47]:
# count of batsman by in form by last 5 match (avg_run>25)

def count_player_form(roster, avg_dict):
    player_ids = roster.split(":")
    count_ones=0

    for player_id in player_ids:
        key = (float(player_id))
        if key in avg_dict and avg_dict[key]>=25:
            count_ones+=1
    return count_ones

team_df['team1_count_bat_form']=team_df.apply(lambda row: count_player_form(row['team1_roster_ids'],batsman_form_dict),axis=1)
team_df['team2_count_bat_form']=team_df.apply(lambda row: count_player_form(row['team2_roster_ids'],batsman_form_dict),axis=1)
team_df

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team1_avg_top4_ag,team2_avg_top4_ag,team1_avg_top4_ve,team2_avg_top4_ve,team1_avg_top4_in,team2_avg_top4_in,team1_last5_avg,team2_last5_avg,team1_count_bat_form,team2_count_bat_form
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,...,15.750000,32.500000,13.416667,19.250000,15.541667,23.750000,40.50,10.00,2,1
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,...,26.975000,29.898810,27.000000,28.925000,20.714103,24.409091,27.75,39.25,3,4
2,9433269,We,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,Ne,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,We,10576,Ne,...,28.466667,24.375000,48.000000,26.438889,25.579861,24.715909,26.50,48.50,2,5
3,9587073,Ga An Ws,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,Bs Rs,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,Ga An Ws,36084,Ga An Ws,...,30.000000,34.666667,35.125000,33.000000,28.328571,21.487500,26.75,20.50,1,1
4,9516457,Pb Ks,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,Gt Ts,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,Gt Ts,48341,Gt Ts,...,26.250000,33.250000,26.125000,33.250000,27.700466,30.354167,20.75,25.00,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,Pb Ks,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,Ci Sr Ks,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,Pb Ks,30407,Ci Sr Ks,...,34.583333,28.791667,32.229167,28.729167,28.067133,23.738095,21.00,34.00,1,3
944,9433241,Mx,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,St,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,St,9701,St,...,24.250000,35.500000,24.250000,27.304167,27.700000,28.980769,38.00,37.50,2,3
945,9097227,Bd,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,Wn Pe,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,Wn Pe,23869,Bd,...,37.500000,35.875000,26.200000,24.791667,31.416667,30.645833,26.25,14.50,2,1
946,9516695,Rn Rs,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,Ss Hd,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,Ss Hd,36014,Rn Rs,...,43.125000,30.625000,30.916667,40.250000,36.312500,30.291209,33.00,31.75,3,3


In [48]:
# adding team section to bowler df

bowler_df = pd.read_csv('./csv/bowler_level_scorecard.csv')
bowler_df['team'] = bowler_df.apply(lambda x: player_team_mapping.get((x['match id'], x['bowler_id']), None), axis=1)
print(bowler_df)


       match id    bowler  bowler_id  \
0       8638034     BV Ss  6718396.0   
1       8638034     HK Bt  1585464.0   
2       8638034     JS Nm  2486896.0   
3       8638034  LV vn Bk  3083667.0   
4       8638034     PF Yd  4950294.0   
...         ...       ...        ...   
18534   9433633     BC Hl  1482249.0   
18535   9433633     CP Wd  2173688.0   
18536   9433633     JA Tr  8833075.0   
18537   9433633     LA Dn  1482998.0   
18538   9433633     NT Es  5788418.0   

                                   bowler_details  is_bowler_captain  \
0        NZ:Right-hand bat:Right-arm medium-fast:                0.0   
1         NZ:Left-hand bat:Right-arm medium-fast:                0.0   
2         NZ:Left-hand bat:Right-arm medium-fast:                0.0   
3       NED:Right-hand bat:Right-arm medium-fast:                0.0   
4              NZ:Right-hand bat:Legbreak googly:                0.0   
...                                           ...                ...   
18534        EN

In [49]:
# avg eco best and worst top 4
average_eco = bowler_df.groupby(['team', 'bowler_id'])['economy'].mean().reset_index()
average_eco_dict=average_eco.set_index(["bowler_id","team"])["economy"].to_dict()
def mean_eco_top(roster, team, avg_dict):
    player_ids = roster.split(":")
    top4_eco = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_eco = avg_dict.get(key, 0)
            top4_eco.append(avg_eco)

    top4_eco = sorted(top4_eco)[:4]# Get top 4 eco
    
    if len(top4_eco) == 0:
        return 0
    return sum(top4_eco) / len(top4_eco)
def mean_eco_bot(roster, team, avg_dict):
    player_ids = roster.split(":")
    top4_eco = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_eco = avg_dict.get(key, 0)
            top4_eco.append(avg_eco)

    top4_eco = sorted(top4_eco,reverse=True)[:4]# Get top 4 eco
    
    if len(top4_eco) == 0:
        return 0
    return sum(top4_eco) / len(top4_eco)
team_df["team1_top4_eco"] = team_df.apply(lambda row: mean_eco_top(row["team1_roster_ids"], row["team1"], average_eco_dict), axis=1)
team_df["team2_top4_eco"] = team_df.apply(lambda row: mean_eco_top(row["team2_roster_ids"], row["team2"], average_eco_dict), axis=1)
team_df["team1_bot4_eco"] = team_df.apply(lambda row: mean_eco_bot(row["team1_roster_ids"], row["team1"], average_eco_dict), axis=1)
team_df["team2_bot4_eco"] = team_df.apply(lambda row: mean_eco_bot(row["team2_roster_ids"], row["team2"], average_eco_dict), axis=1)
print(team_df) 

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [50]:
# dictionary of bowler by their last 5 match
bowler_df['match_dt'] = pd.to_datetime(bowler_df['match_dt'])

# Function to get bowler form based on last 5 matches relative to each match date
def get_bowler_form(bowler_id, df,matches=5):
    bowler_data = df[df['bowler_id'] == bowler_id].sort_values(by='match_dt', ascending=False)
    bowler_form = {}

    for _, row in bowler_data.iterrows():
        start_date = row['match_dt'] - timedelta(days=matches-1)  # Start date for the last 'matches' matches
        recent_matches = bowler_data[(bowler_data['match_dt'] >= start_date) & (bowler_data['match_dt'] <= row['match_dt'])]
#         print(recent_matches)
        if not recent_matches.empty:
            avg_eco = recent_matches['economy'].mean()
    return avg_eco

# Create the dictionary
bowler_form_dict = {}
for bowler_id in bowler_df['bowler_id'].unique():
    bowler_form_dict[bowler_id] = get_bowler_form(bowler_id, bowler_df)

print(bowler_form_dict)


{6718396.0: 10.5, 1585464.0: 8.5, 2486896.0: 5.75, 3083667.0: 7.75, 4950294.0: 5.25, 6718382.0: 8.0, 4950364.0: 13.5, 3834305.0: 7.0, 7543647.0: 7.25, 3566240.0: 10.33, 3776849.0: 5.25, 6718340.0: 15.0, 1612610.0: 8.0, 5509524.0: 12.0, 3876613.0: 10.0, 5788320.0: 7.5, 5497274.0: 10.25, 8193310.0: 8.5, 1594319.0: 11.33, 1905847.0: 7.0, 7620346.0: 16.5, 181404.0: 10.0, 8339701.0: 6.67, 7537067.0: 5.0, 3995991.0: 3.33, 4967738.0: 9.0, 2535420.0: 9.75, 5419546.0: 16.0, 3890984.0: 8.25, 6818776.0: 7.0, 7353828.0: 10.0, 7455818.0: 7.33, 4756982.0: 11.74, 3200756.0: 9.5, 34061.0: 11.67, 1506077.0: 6.5, 49496.0: 4.67, 2690498.0: 7.25, 3834375.0: 8.75, 4195827.0: 8.75, 2236086.0: 7.25, 2666705.0: 4.75, 7907458.0: 8.75, 3731307.0: 6.25, 2275195.0: 6.5, 6308098.0: 6.75, 7877232.0: 11.5, 2337117.0: 10.0, 4985546.0: 8.25, 2161599.0: 15.0, 309056.0: 6.0, 5788418.0: 2.75, 6732004.0: 9.0, 6722540.0: 9.5, 3125849.0: 5.5, 3834319.0: 9.0, 7871310.0: 10.0, 3913433.0: 8.0, 1495731.0: 15.0, 4136404.0: 9.0, 

In [51]:
# team avg eco best and worst absed on last 5 matches

def eco_last5(roster,avg_dict):
    player_ids=roster.split(":")
    player_eco=[]
    for player_id in player_ids:
        key=(float(player_id))
        if key in avg_dict:
            eco=avg_dict[key]
            player_eco.append(eco)
    if player_eco:
        player_eco = sorted(player_eco)[:4]
        mean_top4_avg = sum(player_eco) / len(player_eco)
    else:
        mean_top4_avg = 0
    return mean_top4_avg
def ec_last5(roster,avg_dict):
    player_ids=roster.split(":")
    player_eco=[]
    for player_id in player_ids:
        key=(float(player_id))
        if key in avg_dict:
            eco=avg_dict[key]
            player_eco.append(eco)
    if player_eco:
        player_eco = sorted(player_eco,reverse=True)[:4]
        mean_top4_avg = sum(player_eco) / len(player_eco)
    else:
        mean_top4_avg = 0
    return mean_top4_avg
team_df['team1_last5_eco_top']=team_df.apply(lambda row: eco_last5(row['team1_roster_ids'],bowler_form_dict),axis=1)
team_df['team2_last5_eco_top']=team_df.apply(lambda row: eco_last5(row['team2_roster_ids'],bowler_form_dict),axis=1)
team_df['team1_last5_eco_bot']=team_df.apply(lambda row: ec_last5(row['team1_roster_ids'],bowler_form_dict),axis=1)
team_df['team2_last5_eco_bot']=team_df.apply(lambda row: ec_last5(row['team2_roster_ids'],bowler_form_dict),axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [52]:
#  count of bowlers in form (avg eco<=7)
def count_player_form_bowl(roster, avg_dict):
    player_ids = roster.split(":")
    count_ones=0

    for player_id in player_ids:
        key = (float(player_id))
        if key in avg_dict and avg_dict[key]<=7:
            count_ones+=1
    return count_ones

team_df['team1_count_bowl_form']=team_df.apply(lambda row: count_player_form_bowl(row['team1_roster_ids'],bowler_form_dict),axis=1)
team_df['team2_count_bowl_form']=team_df.apply(lambda row: count_player_form_bowl(row['team2_roster_ids'],bowler_form_dict),axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [53]:
# count of rel bowlers avg eco<7.5
rel_bowl=average_eco[average_eco["economy"]<7.5]
rel_bowl_dic=rel_bowl.set_index(["bowler_id","team"])["economy"].to_dict()
def count_eco_top(roster, team, avg_dict):
    player_ids = roster.split(":")
    top4_eco = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_eco = avg_dict.get(key, 0)
            top4_eco.append(avg_eco)

    return len(top4_eco)
team_df["team1_count_rel_bowl"] = team_df.apply(lambda row: count_eco_top(row["team1_roster_ids"], row["team1"], rel_bowl_dic), axis=1)
team_df["team2_count_rel_bowl"] = team_df.apply(lambda row: count_eco_top(row["team2_roster_ids"], row["team2"], rel_bowl_dic), axis=1)
print(team_df)

     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [54]:
# avg eco of team against a particular team top4

comb_df=pd.merge(team_df,bowler_df,on='match id',how='inner')
comb_df['opponents']=comb_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)
group_df=comb_df.groupby(['bowler_id','team','opponents'])['economy'].mean().reset_index()
group_df.rename(columns={'economy':'avg_eco_ag'},inplace=True)

avg_ag_eco_dict={}
for _,row in group_df.iterrows():
    key=(row['bowler_id'],row['team'],row['opponents'])
    avg_ag_eco_dict[key]=row['avg_eco_ag']


def mean_top4_eco_ag(roster, team, opponents, avg_dict):

    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, opponents) for player_id in player_ids
                        if (float(player_id), team, opponents) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1])[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag
def mean_bot4_eco_ag(roster, team, opponents, avg_dict):

    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, opponents) for player_id in player_ids
                        if (float(player_id), team, opponents) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag


team_df['team1_eco_bot4_ag'] = team_df.apply(lambda row: mean_bot4_eco_ag(row["team1_roster_ids"], row["team1"], row["team2"], avg_ag_eco_dict), axis=1)
team_df['team2_eco_bot4_ag'] = team_df.apply(lambda row: mean_bot4_eco_ag(row["team2_roster_ids"], row["team2"], row["team1"], avg_ag_eco_dict), axis=1)
team_df['team1_eco_top4_ag'] = team_df.apply(lambda row: mean_top4_eco_ag(row["team1_roster_ids"], row["team1"], row["team2"], avg_ag_eco_dict), axis=1)
team_df['team2_eco_top4_ag'] = team_df.apply(lambda row: mean_top4_eco_ag(row["team2_roster_ids"], row["team2"], row["team1"], avg_ag_eco_dict), axis=1)
print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [55]:
# avg eco of team in a venue top4

comb_df=pd.merge(team_df,bowler_df,on='match id',how='inner')
comb_df['opponents']=comb_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)
group_df=comb_df.groupby(['bowler_id','team','venue'])['economy'].mean().reset_index()
group_df.rename(columns={'economy':'avg_eco_ve'},inplace=True)

avg_ve_eco_dict={}
for _,row in group_df.iterrows():
    key=(row['bowler_id'],row['team'],row['venue'])
    avg_ve_eco_dict[key]=row['avg_eco_ve']

def mean_top4_eco_ve(roster, team, venue, avg_dict):
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, venue) for player_id in player_ids
                        if (float(player_id), team, venue) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1])[:4]

    if top4_players:
        top4_avg_ve = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ve = 0
    
    return top4_avg_ve
def mean_bot4_eco_ve(roster, team, venue, avg_dict):

    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, venue) for player_id in player_ids
                        if (float(player_id), team, venue) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:4]

    if top4_players:
        top4_avg_ve = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ve = 0
    
    return top4_avg_ve


team_df['team1_eco_bot4_ve'] = team_df.apply(lambda row: mean_bot4_eco_ve(row["team1_roster_ids"], row["team1"], row["venue"], avg_ve_eco_dict), axis=1)
team_df['team2_eco_bot4_ve'] = team_df.apply(lambda row: mean_bot4_eco_ve(row["team2_roster_ids"], row["team2"], row["venue"], avg_ve_eco_dict), axis=1)
team_df['team1_eco_top4_ve'] = team_df.apply(lambda row: mean_top4_eco_ve(row["team1_roster_ids"], row["team1"], row["venue"], avg_ve_eco_dict), axis=1)
team_df['team2_eco_top4_ve'] = team_df.apply(lambda row: mean_top4_eco_ve(row["team2_roster_ids"], row["team2"], row["venue"], avg_ve_eco_dict), axis=1)
print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [56]:
# avg eco of team top 4 by innings
combined_df=pd.merge(team_df,bowler_df,on='match id',how='inner')
combined_df['opponents']=combined_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)


def determine_innings(row):
    if(row['team']==row['toss winner']):
        if row['toss decision']=='field':
            return 1
        else:
            return 2
    else:
        if row['toss decision']=='field':
            return 2
        else:
            return 1

def deter_innings(toss_winner,team,toss_decision):
    if(team==toss_winner):
        if toss_decision=='field':
            return 1
        else:
            return 2
    else:
        if toss_decision=='field':
            return 2
        else:
            return 1        
combined_df['innings']=combined_df.apply(determine_innings,axis=1)

grouped_df=combined_df.groupby(['bowler_id','team','innings'])['economy'].mean().reset_index()
grouped_df.rename(columns={'economy':'avg_eco_in'},inplace=True)

avg_in_eco_dict={}
for _,row in grouped_df.iterrows():
    key=(row['bowler_id'],row['team'],row['innings'])
    avg_in_eco_dict[key]=row['avg_eco_in']


def eco_top4_in(roster, team, toss_decision,toss_winner, avg_dict):
    innings=deter_innings(toss_winner,team,toss_decision)
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, innings) for player_id in player_ids
                        if (float(player_id), team, innings) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1])[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag

def eco_bot4_in(roster, team, toss_decision,toss_winner, avg_dict):
    innings=deter_innings(toss_winner,team,toss_decision)
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, innings) for player_id in player_ids
                        if (float(player_id), team, innings) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1],reverse=True)[:4]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag

team_df['team1_eco_top4_in'] = team_df.apply(lambda row: eco_top4_in(row["team1_roster_ids"], row["team1"], row["toss decision"],row['toss winner'], avg_in_eco_dict), axis=1)
team_df['team2_eco_top4_in'] = team_df.apply(lambda row: eco_top4_in(row["team2_roster_ids"], row["team2"], row["toss decision"],row['toss winner'], avg_in_eco_dict), axis=1)
team_df['team1_eco_bot4_in'] = team_df.apply(lambda row: eco_bot4_in(row["team1_roster_ids"], row["team1"], row["toss decision"],row['toss winner'], avg_in_eco_dict), axis=1)
team_df['team2_eco_bot4_in'] = team_df.apply(lambda row: eco_bot4_in(row["team2_roster_ids"], row["team2"], row["toss decision"],row['toss winner'], avg_in_eco_dict), axis=1)
print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [57]:
# avg wickets taken by top 2 bowlers
average_wck = bowler_df.groupby(['team', 'bowler_id'])['wicket_count'].mean().reset_index()
average_wck_dict=average_wck.set_index(["bowler_id","team"])["wicket_count"].to_dict()

def sum_top4_avg_wcks(roster, team, avg_dict):
    player_ids = roster.split(":")
    top_avg_wcks = []

    for player_id in player_ids:
        key = (float(player_id), team)
        if key in avg_dict:
            avg_wck = avg_dict[key]
            top_avg_wcks.append(avg_wck)

    top_avg_wcks = sorted(top_avg_wcks, reverse=True)[:2]
    
    if top_avg_wcks:
        return sum(top_avg_wcks) / len(top_avg_wcks)
    else:
        return 0
    
team_df["team1_top2_wck"] = team_df.apply(lambda row: sum_top4_avg_wcks(row["team1_roster_ids"], row["team1"], average_wck_dict), axis=1)
team_df["team2_top2_wck"] = team_df.apply(lambda row: sum_top4_avg_wcks(row["team2_roster_ids"], row["team2"], average_wck_dict), axis=1)
print(team_df)   


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [58]:
# avg wickets taken by top 2 bowler against a particular team
combined_df = pd.merge(team_df, bowler_df, on='match id', how='inner')

combined_df['opponents'] = combined_df.apply(lambda row: row['team2'] if row['team'] == row['team1'] else row['team1'], axis=1)


grouped_df = combined_df.groupby(['bowler_id', 'team', 'opponents'])['wicket_count'].mean().reset_index()
grouped_df.rename(columns={'wicket_count': 'avg_wck_ag'}, inplace=True)


avg_ag_wck_dict = {}
for _, row in grouped_df.iterrows():
    key = (row['bowler_id'], row['team'], row['opponents'])
    avg_ag_wck_dict[key] = row['avg_wck_ag']
    
def mean_top2_wck_ag(roster, team, opponents, avg_dict):
    player_ids = roster.split(":")
    valid_player_ids = [(float(player_id), team, opponents) for player_id in player_ids 
                        if (float(player_id), team, opponents) in avg_dict]
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]
    top2_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:2]
    if top2_players:
        top2_avg_ag = sum(player[1] for player in top2_players) / len(top2_players)
    else:
        top2_avg_ag = 0
    return top2_avg_ag


team_df['team1_wck_top2_ag'] = team_df.apply(lambda row: mean_top2_wck_ag(row["team1_roster_ids"], row["team1"], row["team2"], avg_ag_wck_dict), axis=1)
team_df['team2_wck_top2_ag'] = team_df.apply(lambda row: mean_top2_wck_ag(row["team2_roster_ids"], row["team2"], row["team1"], avg_ag_wck_dict), axis=1)

print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [59]:
# avg wcks by top 2 bowler by venue
combined_df = pd.merge(team_df, bowler_df, on='match id', how='inner')

combined_df['opponents'] = combined_df.apply(lambda row: row['team2'] if row['team'] == row['team1'] else row['team1'], axis=1)


grouped_df = combined_df.groupby(['bowler_id', 'team', 'venue'])['wicket_count'].mean().reset_index()
grouped_df.rename(columns={'wicket_count': 'avg_wck_ve'}, inplace=True)


avg_ve_wck_dict = {}
for _, row in grouped_df.iterrows():
    key = (row['bowler_id'], row['team'], row['venue'])
    avg_ve_wck_dict[key] = row['avg_wck_ve']
    
def mean_top2_wck_ve(roster, team, opponents, avg_dict):
    player_ids = roster.split(":")
    valid_player_ids = [(float(player_id), team, opponents) for player_id in player_ids 
                        if (float(player_id), team, opponents) in avg_dict]
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]
    top2_players = sorted(player_stats, key=lambda x: x[1], reverse=True)[:2]
    if top2_players:
        top2_avg_ag = sum(player[1] for player in top2_players) / len(top2_players)
    else:
        top2_avg_ag = 0
    return top2_avg_ag


team_df['team1_wck_top2_ve'] = team_df.apply(lambda row: mean_top2_wck_ve(row["team1_roster_ids"], row["team1"], row["venue"], avg_ve_wck_dict), axis=1)
team_df['team2_wck_top2_ve'] = team_df.apply(lambda row: mean_top2_wck_ve(row["team2_roster_ids"], row["team2"], row["venue"], avg_ve_wck_dict), axis=1)

print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [60]:
# avg wcks by top 2 bowler by innings
combined_df=pd.merge(team_df,bowler_df,on='match id',how='inner')
combined_df['opponents']=combined_df.apply(lambda row:row['team2'] if row['team']==row['team1'] else row['team1'],axis=1)


def determine_innings(row):
    if(row['team']==row['toss winner']):
        if row['toss decision']=='field':
            return 1
        else:
            return 2
    else:
        if row['toss decision']=='field':
            return 2
        else:
            return 1

def deter_innings(toss_winner,team,toss_decision):
    if(team==toss_winner):
        if toss_decision=='field':
            return 1
        else:
            return 2
    else:
        if toss_decision=='field':
            return 2
        else:
            return 1        
combined_df['innings']=combined_df.apply(determine_innings,axis=1)

grouped_df=combined_df.groupby(['bowler_id','team','innings'])['wicket_count'].mean().reset_index()
grouped_df.rename(columns={'wicket_count':'avg_wck_in'},inplace=True)

avg_in_wck_dict={}
for _,row in grouped_df.iterrows():
    key=(row['bowler_id'],row['team'],row['innings'])
    avg_in_wck_dict[key]=row['avg_wck_in']


def wck_top4_in(roster, team, toss_decision,toss_winner, avg_dict):
    innings=deter_innings(toss_winner,team,toss_decision)
    player_ids = roster.split(":")
    
    valid_player_ids = [(float(player_id), team, innings) for player_id in player_ids
                        if (float(player_id), team, innings) in avg_dict]
    
    player_stats = [(player_id, avg_dict[player_id]) for player_id in valid_player_ids]

    top4_players = sorted(player_stats, key=lambda x: x[1],reverse=True)[:2]

    if top4_players:
        top4_avg_ag = sum(player[1] for player in top4_players) / len(top4_players)
    else:
        top4_avg_ag = 0
    
    return top4_avg_ag


team_df['team1_wck_top2_in'] = team_df.apply(lambda row: wck_top4_in(row["team1_roster_ids"], row["team1"], row["toss decision"],row['toss winner'], avg_in_wck_dict), axis=1)
team_df['team2_wck_top2_in'] = team_df.apply(lambda row: wck_top4_in(row["team2_roster_ids"], row["team2"], row["toss decision"],row['toss winner'], avg_in_wck_dict), axis=1)
print(team_df)


     match id     team1  team1_id  \
0     9331181        Ba     11283   
1     8797060        Ed        20   
2     9433269        We     10576   
3     9587073  Ga An Ws     36084   
4     9516457     Pb Ks     30407   
..        ...       ...       ...   
943   9128601     Pb Ks     30407   
944   9433241        Mx      8700   
945   9097227        Bd     22497   
946   9516695     Rn Rs     30428   
947   9433633        He      7727   

                                      team1_roster_ids     team2  team2_id  \
0    9373356.0:7857520.0:4232164.0:4566540.0:329940...     Hl Ph     12634   
1    2089079.0:6139370.0:2076192.0:62432.0:2083409....     Wt Is        41   
2    3298427.0:2288789.0:7773338.0:3519011.0:368195...        Ne      8987   
3    8127230.0:4690328.0:4069666.0:7960847.0:469018...     Bs Rs     36070   
4    8127181.0:197658.0:4239038.0:2398346.0:5053082...     Gt Ts     48341   
..                                                 ...       ...       ...   
943  2789

In [61]:
team_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 71 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   match id                   948 non-null    int64  
 1   team1                      948 non-null    object 
 2   team1_id                   948 non-null    int64  
 3   team1_roster_ids           948 non-null    object 
 4   team2                      948 non-null    object 
 5   team2_id                   948 non-null    int64  
 6   team2_roster_ids           948 non-null    object 
 7   winner                     948 non-null    object 
 8   winner_id                  948 non-null    int64  
 9   toss winner                948 non-null    object 
 10  toss decision              948 non-null    object 
 11  venue                      948 non-null    object 
 12  city                       948 non-null    object 
 13  match_dt                   948 non-null    object 

In [62]:
columns_to_add=[]
data=pd.merge(data,team_df[['match id','team1_top4_avg','team2_top4_avg','team1_top4_bats','team2_top4_bats','team1_top4_eco',
                            'team2_top4_eco','team1_bot4_eco','team2_bot4_eco','team1_top2_wck','team2_top2_wck','team1_avg_top4_ag','team2_avg_top4_ag','team1_eco_bot4_ag','team2_eco_bot4_ag','team1_eco_top4_ag','team2_eco_top4_ag',
                           'team1_wck_top2_ag','team2_wck_top2_ag','team1_avg_top4_ve','team2_avg_top4_ve','team1_eco_top4_ve',
                           'team2_eco_top4_ve','team1_eco_bot4_ve','team2_eco_bot4_ve','team1_wck_top2_ve','team2_wck_top2_ve',
                           'team1_count_rel_bats','team2_count_rel_bats','team1_count_rel_bowl','team2_count_rel_bowl','team1_avg_top4_in',
                           'team2_avg_top4_in','team1_eco_top4_in','team2_eco_top4_in','team1_eco_bot4_in','team2_eco_bot4_in',
                           'team1_wck_top2_in','team2_wck_top2_in','team1_count_bat_form','team2_count_bat_form','team1_last5_avg','team2_last5_avg',
                           'team1_last5_eco_top','team2_last5_eco_top','team1_last5_eco_bot','team2_last5_eco_bot','team1_count_bowl_form',
                           'team2_count_bowl_form']],on='match id',how='left')
data


Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,...,team1_count_bat_form,team2_count_bat_form,team1_last5_avg,team2_last5_avg,team1_last5_eco_top,team2_last5_eco_top,team1_last5_eco_bot,team2_last5_eco_bot,team1_count_bowl_form,team2_count_bowl_form
0,9331181,6,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,43,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,1,12634,44,...,2,1,40.50,10.00,6.3550,6.6250,8.1675,6.6250,2,1
1,8797060,35,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,138,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,0,20,138,...,3,4,27.75,39.25,5.9800,4.8125,8.6875,9.3750,4,4
2,9433269,135,10576,3298427.0:2288789.0:7773338.0:3519011.0:368195...,81,8987,4003390.0:1749075.0:1626526.0:4172447.0:551672...,0,10576,81,...,2,5,26.50,48.50,7.9575,6.9600,11.1450,10.6450,1,2
3,9587073,39,36084,8127230.0:4690328.0:4069666.0:7960847.0:469018...,13,36070,3462080.0:2436405.0:1798705.0:7550857.0:574247...,0,36084,34,...,1,1,26.75,20.50,6.5000,6.2925,10.6475,10.6450,2,3
4,9516457,96,30407,8127181.0:197658.0:4239038.0:2398346.0:5053082...,38,48341,1958683.0:7491224.0:8059029.0:4377610.0:225245...,1,48341,40,...,1,2,20.75,25.00,6.1875,5.6875,7.5625,7.7500,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,9128601,96,30407,2789079.0:197658.0:2398346.0:2827327.0:2082044...,15,30414,7422673.0:249087.0:3519011.0:5001170.0:232000....,0,30407,16,...,1,3,21.00,34.00,6.1250,7.3125,8.8750,9.7500,3,2
944,9433241,81,8700,4421689.0:7752989.0:1941743.0:4489722.0:767287...,114,9701,6139370.0:7694581.0:3294444.0:3239102.0:632036...,1,9701,114,...,2,3,38.00,37.50,7.6250,8.9575,8.8750,10.8125,1,0
945,9097227,7,22497,4239773.0:1941743.0:3007969.0:4172972.0:155625...,136,23869,323049.0:4876122.0:4164978.0:1837205.0:3373138...,1,23869,6,...,2,1,26.25,14.50,5.5825,5.7700,6.9575,9.3125,4,2
946,9516695,105,30428,8058959.0:2162782.0:2981614.0:4690188.0:212569...,113,36014,5958840.0:7491294.0:3127354.0:3057312.0:420349...,1,36014,103,...,3,3,33.00,31.75,5.6250,4.4375,9.0625,9.9375,5,4


In [63]:
data['team_ratio_avg']=data['team1_top4_avg']-data['team2_top4_avg']
data['team_ratio_bats']=data['team1_top4_bats']-data['team2_top4_bats']
data['team_ratio_eco_top']=data['team1_top4_eco']-data['team2_top4_eco']
data['team_ratio_eco_bot']=data['team1_bot4_eco']-data['team2_bot4_eco']
data['team_ratio_wck']=data['team1_top2_wck']-data['team2_top2_wck']
data['team_ratio_avg_ag']=data['team1_avg_top4_ag']-data['team1_avg_top4_ag']
data['team_ratio_avg_ve']=data['team1_avg_top4_ve']-data['team1_avg_top4_ve']
data['team_ratio_avg_in']=data['team1_avg_top4_in']-data['team1_avg_top4_in']
data['team_ratio_eco_top_ag']=data['team1_eco_top4_ag']-data['team1_eco_top4_ag']
data['team_ratio_eco_top_ve']=data['team1_eco_top4_ve']-data['team1_eco_top4_ve']
data['team_ratio_eco_top_in']=data['team1_eco_top4_in']-data['team1_eco_top4_in']
data['team_ratio_eco_bot_ag']=data['team1_eco_bot4_ag']-data['team1_eco_bot4_ag']
data['team_ratio_eco_bot_ve']=data['team1_eco_bot4_ve']-data['team1_eco_bot4_ve']
data['team_ratio_eco_bot_in']=data['team1_eco_bot4_in']-data['team1_eco_bot4_in']
data['team_ratio_wck_top_ag']=data['team1_wck_top2_ag']-data['team1_wck_top2_ag']
data['team_ratio_wck_top_ve']=data['team1_wck_top2_ve']-data['team1_wck_top2_ve']
data['team_ratio_wck_top_in']=data['team1_wck_top2_in']-data['team1_wck_top2_in']
data['team_ratio_rel_bats']=data['team1_count_rel_bats']-data['team2_count_rel_bats']
data['team_ratio_rel_bowl']=data['team1_count_rel_bowl']-data['team2_count_rel_bowl']
data['team_ratio_bat_form']=data['team1_count_bat_form']-data['team2_count_bat_form']
data['team_ratio_bowl_form']=data['team1_count_bowl_form']-data['team2_count_bowl_form']
data['team_ratio_last5_avg']=data['team1_last5_avg']-data['team2_last5_avg']
data['team_ratio_last5_eco_top']=data['team1_last5_eco_top']-data['team2_last5_eco_top']
data['team_ratio_last5_eco_bot']=data['team1_last5_eco_bot']-data['team2_last5_eco_bot']
data.to_csv('./csv/data.csv',index=False)

In [64]:
features = ['team1', 'team2', 'toss winner', 'toss decision', 'venue', 'city', 'lighting', 'series_name',
            'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15',
            'team1_winp_team2_last15', 'ground_avg_runs_last15', 'year', 'month', 'day','team1_top4_avg','team2_top4_avg',
           'team1_top4_eco','team2_top4_eco','team1_top4_bats','team2_top4_bats','team1_top2_wck','team2_top2_wck','team1_avg_top4_ag','team2_avg_top4_ag',
           'team1_eco_bot4_ag','team2_eco_bot4_ag','team1_eco_top4_ag','team2_eco_top4_ag','team1_bot4_eco','team2_bot4_eco',
           'team1_avg_top4_ve','team2_avg_top4_ve','team1_eco_top4_ve','team2_eco_top4_ve','team1_eco_bot4_ve','team2_eco_bot4_ve',
           'team1_avg_top4_in','team2_avg_top4_in','team1_eco_top4_in','team2_eco_top4_in','team1_eco_bot4_in','team2_eco_bot4_in']


X = data[features]
y = data['winner']
# used features at random that increases the accuracy
# addinf 'team1_wck_top2_ag' and for team 2 reduces accuracy same goes with venue and innings
# adding rel bats reduced xgb rest same
# rel bowl decreases in all
# count form reduces all
# team_last5_avg reduces all

In [65]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_train)

109    1
77     0
549    0
798    0
82     1
      ..
106    1
270    0
860    1
435    0
102    1
Name: winner, Length: 758, dtype: int64


In [66]:
# GBM

gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)

y_pred_gbm = gbm.predict(X_val)

print('GBM accuracy: ', accuracy_score(y_val, y_pred_gbm))
# GBM accuracy:  0.8052631578947368

GBM accuracy:  0.8052631578947368


In [67]:
# XGB
# unique_classes = np.unique(y_train)
# print(unique_classes)

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_encoded)

y_pred_xgb = xgb_model.predict(X_val)

print('XGBoost Accuracy: ', accuracy_score(y_val, y_pred_xgb))
# 0.7631578947368421

XGBoost Accuracy:  0.7631578947368421


In [68]:
# CATB

catboost_model = CatBoostClassifier(silent=True)
catboost_model.fit(X_train, y_train)

y_pred_catboost = catboost_model.predict(X_val)


print("CatBoost Accuracy: ", accuracy_score(y_val, y_pred_catboost))
# 0.7736842105263158

CatBoost Accuracy:  0.7736842105263158
