In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('data/aggregated_df.csv')
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_SR', 'batter_score', 'dismissal_kind', 'date', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

## Add number of innings, avg and S/R up until that match, number of 30+ scores, number of 50+ scores, consistency, and form

In [2]:
df.drop(columns = ['date', 'batter_SR', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced', 'no_of_right_arm_offspinners_faced', 
                        'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced', 'no_of_left_arm_offspinners_faced', 
                        'score_last_5', 'balls_last_5', 'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2', 'score_last_3', 
                        'balls_last_3', 'score_last_4', 'balls_last_4'], inplace = True)
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'dismissal_kind', 'match_type', 'venue'],
      dtype='object')

In [3]:
# pd.isna(df['dismissal_kind'][len(df)-1])

In [4]:
df['dismissed'] = df['dismissal_kind'].notna().astype(int)

In [5]:
df['dismissed']

0        0
1        1
2        0
3        1
4        1
        ..
15461    1
15462    1
15463    0
15464    1
15465    0
Name: dismissed, Length: 15466, dtype: int64

In [6]:
df.drop(columns = 'dismissal_kind', inplace = True)

In [7]:
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'match_type', 'venue', 'dismissed'],
      dtype='object')

In [8]:
df['cumulative_runs'] = df.groupby('batter')['batter_score'].cumsum()
df['cumulative_dismissals'] = df.groupby('batter')['dismissed'].cumsum()
df['cumulative_balls_faced'] = df.groupby('batter')['batter_total_balls'].cumsum()

# Average up until the previous match
df['average'] = df.groupby('batter')['cumulative_runs'].shift(1) / df.groupby('batter')['cumulative_dismissals'].shift(1)

# Strike rate up until the previous match
df['strike_rate'] = (df.groupby('batter')['cumulative_runs'].shift(1) / df.groupby('batter')['cumulative_balls_faced'].shift(1)) * 100

# Handling initial matches
# df['average'].fillna(0, inplace = True)
df.fillna({"average": 0, "strike_rate": 0}, inplace = True)
# df['strike_rate'].fillna(0, inplace = True) 

In [9]:
# Cumulative 30+ scores up until the previous match
df['cumulative_30_plus_inclusive'] = df.groupby('batter')['batter_score'].apply(lambda x: (x.apply(lambda score: 1 if 30 <= score < 50 else 0)).cumsum()).reset_index(level=0, drop=True)
df['cumulative_30_plus'] = df.groupby('batter')['cumulative_30_plus_inclusive'].shift(1).fillna(0).astype(int)

# Cumulative 50+ scores up until the previous match
df['cumulative_50_plus_inclusive'] = df.groupby('batter')['batter_score'].apply(lambda x: (x.apply(lambda score: 1 if score >= 50 else 0)).cumsum()).reset_index(level=0, drop=True)
df['cumulative_50_plus'] = df.groupby('batter')['cumulative_50_plus_inclusive'].shift(1).fillna(0).astype(int)

df['innings_played'] = df.groupby('batter')['inning'].cumcount() + 1

In [10]:
df = df.drop(columns = ["cumulative_runs", "cumulative_dismissals", "cumulative_balls_faced", "cumulative_30_plus_inclusive", "cumulative_50_plus_inclusive"])
df.tail()

Unnamed: 0,match_id,inning,bowling_team,batter,batter_total_balls,batter_score,match_type,venue,dismissed,average,strike_rate,cumulative_30_plus,cumulative_50_plus,innings_played
15461,1370353,2,Gujarat Titans,DP Conway,25.0,47,Final,"Narendra Modi Stadium, Ahmedabad",1,46.157895,139.427663,4,9,22
15462,1370353,2,Gujarat Titans,MS Dhoni,1.0,0,Final,"Narendra Modi Stadium, Ahmedabad",1,37.094891,135.955056,41,24,217
15463,1370353,2,Gujarat Titans,RA Jadeja,6.0,15,Final,"Narendra Modi Stadium, Ahmedabad",0,25.254717,128.270244,19,2,169
15464,1370353,2,Gujarat Titans,RD Gaikwad,16.0,26,Final,"Narendra Modi Stadium, Ahmedabad",1,41.186047,135.19084,13,15,51
15465,1370353,2,Gujarat Titans,S Dube,21.0,32,Final,"Narendra Modi Stadium, Ahmedabad",0,28.263158,141.501976,5,6,47


In [11]:
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'match_type', 'venue', 'dismissed', 'average',
       'strike_rate', 'cumulative_30_plus', 'cumulative_50_plus',
       'innings_played'],
      dtype='object')

- No. of matches
- Avg
- S/R
- no of 30
- no of 50
######
- 1 is equal importance
- 3 is moderate of one over another
- 5 is essential
- 7 is very strong
- 9 is extreme importance
######
0    1    2     3     4     5

1    1    1/4   1/6   1/3   1/4

2    4    1     1/3   3     2

3    6    3     1     6     5

4    3    1/3   1/6   1     1/2

5    4    1/2   1/5   2     1



In [12]:
# Main comparison matrix (transposed)
matrix = np.array([
    [1,    1/4,  1/6,  1/3,  1/4],
    [4,    1,    1/3,  3,    2  ],
    [6,    3,    1,    6,    5  ],
    [3,    1/3,  1/6,  1,    1/2],
    [4,    1/2,  1/5,  2,    1  ]
])

In [13]:
A1 = np.ones((5,1))

for i in range(5):
    for j in range(5):
        A1[i][0] *= matrix[i][j]
    A1[i][0] = A1[i][0]**(1/5)

# A1 = np.prod(matrix, axis=1) ** (1/5)
# A1 = A1.reshape(-1, 1)

np.set_printoptions(suppress=True)
A1

array([[0.32219701],
       [1.51571657],
       [3.51948203],
       [0.60836434],
       [0.9563525 ]])

In [14]:
summation = A1.sum()
summation

6.922112444617791

In [15]:
A2 = A1/summation
A2

array([[0.04654605],
       [0.21896734],
       [0.50844046],
       [0.08788709],
       [0.13815905]])

In [16]:
#ahp = np.ones((5,1))
AHP = np.dot(matrix,A2)
AHP

array([[0.24986343],
       [1.11461109],
       [2.66273662],
       [0.45433397],
       [0.71128921]])

In [17]:
A4 = AHP/A2
A4

array([[5.3680906 ],
       [5.09030743],
       [5.23706675],
       [5.16951864],
       [5.14833589]])

In [18]:
mean = A4.mean()
mean
CI = (mean - 5) / 4
CI
CR = CI/1.11
CR
# CR SHOULD BE LESS THAN 0.1 

0.045645014310930376

In [19]:
df['avg_ahp'] = df['average'].apply(lambda x: 1 if x < 20 else (2 if 20 <= x < 25 else (3 if 25 <= x < 30 else (4 if 30 <= x < 40 else (5 if x >= 40 else None)))))

# bins = [-float('inf'), 10, 20, 30, 40, float('inf')]
# labels = [1, 2, 3, 4, 5]

# df['avg_ahp'] = pd.cut(df['average'], bins=bins, labels=labels, right=False)

df['sr_ahp'] = df['strike_rate'].apply(lambda x: 1 if x < 100 else (2 if 100 <= x < 120 else (3 if 120 <= x < 140 else (4 if 140 <= x < 160 else (5 if x >= 160 else None)))))

df['innings_ahp'] = df['innings_played'].apply(lambda x: 1 if x < 20 else (2 if 20 <= x < 40 else (3 if 40 <= x < 60 else (4 if 60 <= x < 80 else (5 if x >= 80 else None)))))

df['30_ahp'] = df['cumulative_30_plus'].apply(lambda x: 1 if x < 5 else (2 if 5 <= x < 15 else (3 if 15 <= x < 25 else (4 if 25 <= x < 35 else (5 if x >= 35 else None)))))

df['50_ahp'] = df['cumulative_50_plus'].apply(lambda x: 1 if x < 5 else (2 if 5 <= x < 10 else (3 if 10 <= x < 20 else (4 if 20 <= x < 25 else (5 if x >= 25 else None)))))

df['consistency'] = df['innings_ahp']*AHP[0][0] + df['avg_ahp']*AHP[1][0] + df['sr_ahp']*AHP[2][0] + df['30_ahp']*AHP[3][0] + df['50_ahp']*AHP[4][0]

df.tail()

Unnamed: 0,match_id,inning,bowling_team,batter,batter_total_balls,batter_score,match_type,venue,dismissed,average,strike_rate,cumulative_30_plus,cumulative_50_plus,innings_played,avg_ahp,sr_ahp,innings_ahp,30_ahp,50_ahp,consistency
15461,1370353,2,Gujarat Titans,DP Conway,25.0,47,Final,"Narendra Modi Stadium, Ahmedabad",1,46.157895,139.427663,4,9,22,5,3,2,1,2,15.937905
15462,1370353,2,Gujarat Titans,MS Dhoni,1.0,0,Final,"Narendra Modi Stadium, Ahmedabad",1,37.094891,135.955056,41,24,217,4,3,5,5,4,18.812798
15463,1370353,2,Gujarat Titans,RA Jadeja,6.0,15,Final,"Narendra Modi Stadium, Ahmedabad",0,25.254717,128.270244,19,2,169,3,3,5,3,1,14.655651
15464,1370353,2,Gujarat Titans,RD Gaikwad,16.0,26,Final,"Narendra Modi Stadium, Ahmedabad",1,41.186047,135.19084,13,15,51,5,3,3,2,3,17.353391
15465,1370353,2,Gujarat Titans,S Dube,21.0,32,Final,"Narendra Modi Stadium, Ahmedabad",0,28.263158,141.501976,5,6,47,3,4,3,2,2,17.075616


In [20]:
last_5_dismissed = []

In [21]:
# Step 1: Sort the DataFrame by batter and match_id
df_sorted = df.sort_values(by=['batter', 'match_id'])

# Step 2: Group by batter
grouped = df_sorted.groupby('batter')

# Step 3: Calculate rolling statistics
rolling_stats = grouped.apply(
    lambda x: x.assign(
        total_runs_5_inc=x['batter_score'].rolling(5, min_periods=1).sum(),
        balls_faced_5_inc=x['batter_total_balls'].rolling(5, min_periods=1).sum(),
        dismissals_5_inc=x['dismissed'].rolling(5, min_periods=1).sum(),
        no_of_30_plus_5_inc=x['batter_score'].rolling(5, min_periods=1).apply(lambda y: ((y >= 30) & (y < 50)).sum()),
        no_of_50_plus_5_inc=x['batter_score'].rolling(5, min_periods=1).apply(lambda y: (y >= 50).sum())
    )
)

rolling_stats = rolling_stats.reset_index(drop=True)

rolling_stats['total_runs_5'] = rolling_stats.groupby('batter')['total_runs_5_inc'].shift(1).fillna(0).astype(int)
rolling_stats['balls_faced_5'] = rolling_stats.groupby('batter')['balls_faced_5_inc'].shift(1).fillna(0).astype(int)
rolling_stats['dismissals_5'] = rolling_stats.groupby('batter')['dismissals_5_inc'].shift(1).fillna(0).astype(int)
rolling_stats['no_of_30_plus_5'] = rolling_stats.groupby('batter')['no_of_30_plus_5_inc'].shift(1).fillna(0).astype(int)
rolling_stats['no_of_50_plus_5'] = rolling_stats.groupby('batter')['no_of_50_plus_5_inc'].shift(1).fillna(0).astype(int)

# Reset index to match the original DataFrame
rolling_stats = rolling_stats.reset_index(drop=True)

# Step 4: Compute required metrics
rolling_stats['average_5'] = rolling_stats['total_runs_5'] / rolling_stats['dismissals_5']
rolling_stats['strike_rate_5'] = rolling_stats['total_runs_5'] / rolling_stats['balls_faced_5'] * 100

# Drop intermediate columns
# rolling_stats = rolling_stats.drop(columns=['balls_faced_5', 'dismissals_5'])

# Step 5: Merge the rolling statistics back to the original DataFrame based on the original index
# df_final = pd.concat([df, rolling_stats[['dismissals_5','total_runs_5', 'no_of_30_plus_5', 'no_of_50_plus_5', 'average_5', 'strike_rate_5']]], axis=1)


  rolling_stats = grouped.apply(


In [22]:
df = pd.merge(df, rolling_stats[['total_runs_5', 'balls_faced_5', 'dismissals_5', 'no_of_30_plus_5', 'no_of_50_plus_5', 'average_5', 'strike_rate_5', 'batter', 'match_id']], on = ['batter', 'match_id'], how = 'left')
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'match_type', 'venue', 'dismissed', 'average',
       'strike_rate', 'cumulative_30_plus', 'cumulative_50_plus',
       'innings_played', 'avg_ahp', 'sr_ahp', 'innings_ahp', '30_ahp',
       '50_ahp', 'consistency', 'total_runs_5', 'balls_faced_5',
       'dismissals_5', 'no_of_30_plus_5', 'no_of_50_plus_5', 'average_5',
       'strike_rate_5'],
      dtype='object')

- No. of matches
- Avg
- S/R
- no of 30
- no of 50
######
- 1 is equal importance
- 3 is moderate of one over another
- 5 is essential
- 7 is very strong
- 9 is extreme importance
######
0    1    2     3     4     5

1    1    1/4   1/6   1/4   1/7

2    4    1     1/3   2     1/3

3    6    3     1     4     1/2

4    4    1/2   1/4   1     1/3

5    7    3     2     3     1 




In [23]:
matrix = np.array([
    [1,    1/3,  2,    1/3],
    [3,    1,    4,    1/2],
    [1/2,  1/4,  1,    1/3],
    [3,    2,    3,    1]
])



In [24]:
A1 = np.ones((4,1))

for i in range(4):
    for j in range(4):
        A1[i][0] *= matrix[i][j]
    A1[i][0] = A1[i][0]**(1/4)
# A1 = np.prod(matrix, axis=1) ** (1/5)
# A1 = A1.reshape(-1, 1)
np.set_printoptions(suppress=True)
A1

array([[0.68658905],
       [1.56508458],
       [0.451801  ],
       [2.05976714]])

In [25]:
summation = A1.sum()
summation
A2 = A1/summation
A2

array([[0.14414323],
       [0.32857551],
       [0.09485158],
       [0.43242969]])

In [26]:
#AHP = np.ones((5,1))
AHP = np.dot(matrix,A2)
AHP

array([[0.58751478],
       [1.35662635],
       [0.3932103 ],
       [1.80656512]])

In [27]:
A4 = AHP/A2
A4

array([[4.0759097 ],
       [4.12881156],
       [4.14553246],
       [4.17770838]])

In [28]:
mean = A4.mean()
mean
CI = (mean - 4) / 3
CI

0.04399684124422679

In [29]:
CR = CI/0.89
CR
# CR SHOULD BE LESS THAN 0.1 

0.04943465308340089

In [30]:
df['avg_fahp'] = df['average_5'].apply(lambda x: 1 if x < 20 else (2 if 20 <= x < 25 else (3 if 25 <= x < 30 else (4 if 30 <= x < 40 else (5 if x >= 40 else None)))))

# bins = [-float('inf'), 10, 20, 30, 40, float('inf')]
# labels = [1, 2, 3, 4, 5]

# df['avg_fahp'] = pd.cut(df['average'], bins=bins, labels=labels, right=False)

df['sr_fahp'] = df['strike_rate_5'].apply(lambda x: 1 if x < 100 else (2 if 100 <= x < 120 else (3 if 120 <= x < 140 else (4 if 140 <= x < 160 else (5 if x >= 160 else None)))))

# df['innings_fahp'] = df['innings_played'].apply(lambda x: 1 if x < 20 else (2 if 20 <= x < 40 else (3 if 40 <= x < 60 else (4 if 60 <= x < 80 else (5 if x >= 80 else None)))))

df['30_fahp'] = df['no_of_30_plus_5'].apply(lambda x: 1 if x < 5 else (2 if 5 <= x < 15 else (3 if 15 <= x < 25 else (4 if 25 <= x < 35 else (5 if x >= 35 else None)))))

df['50_fahp'] = df['no_of_50_plus_5'].apply(lambda x: 1 if x < 5 else (2 if 5 <= x < 10 else (3 if 10 <= x < 20 else (4 if 20 <= x < 25 else (5 if x >= 25 else None)))))

df['form'] = df['avg_fahp']*AHP[0][0] + df['sr_fahp']*AHP[1][0] + df['30_fahp']*AHP[2][0] + df['50_fahp']*AHP[3][0]

df.tail()

Unnamed: 0,match_id,inning,bowling_team,batter,batter_total_balls,batter_score,match_type,venue,dismissed,average,...,dismissals_5,no_of_30_plus_5,no_of_50_plus_5,average_5,strike_rate_5,avg_fahp,sr_fahp,30_fahp,50_fahp,form
15593,1370353,2,Gujarat Titans,DP Conway,25.0,47,Final,"Narendra Modi Stadium, Ahmedabad",1,46.157895,...,5,3,1,42.2,124.852071,5.0,3.0,1,1,9.207228
15594,1370353,2,Gujarat Titans,MS Dhoni,1.0,0,Final,"Narendra Modi Stadium, Ahmedabad",1,37.094891,...,2,0,0,15.0,142.857143,1.0,4.0,1,1,8.213796
15595,1370353,2,Gujarat Titans,RA Jadeja,6.0,15,Final,"Narendra Modi Stadium, Ahmedabad",0,25.254717,...,4,0,0,23.75,130.136986,2.0,3.0,1,1,7.444684
15596,1370353,2,Gujarat Titans,RD Gaikwad,16.0,26,Final,"Narendra Modi Stadium, Ahmedabad",1,41.186047,...,5,1,2,42.0,148.93617,5.0,4.0,1,1,10.563855
15597,1370353,2,Gujarat Titans,S Dube,21.0,32,Final,"Narendra Modi Stadium, Ahmedabad",0,28.263158,...,3,1,0,40.666667,160.526316,5.0,5.0,1,1,11.920481
