In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Read the aggregated CSV into the working DataFrame, then list its columns
df = pd.read_csv(filepath_or_buffer='data/aggregated_df.csv')
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_SR', 'batter_score', 'dismissal_kind', 'date', 'match_type',
       'venue', 'no_of_right_arm_pacers_faced', 'no_of_left_arm_pacers_faced',
       'no_of_right_arm_offspinners_faced',
       'no_of_right_arm_legspinners_faced', 'no_of_left_arm_legspinners_faced',
       'no_of_left_arm_offspinners_faced', 'score_last_5', 'balls_last_5',
       'score_last_1', 'balls_last_1', 'score_last_2', 'balls_last_2',
       'score_last_3', 'balls_last_3', 'score_last_4', 'balls_last_4'],
      dtype='object')

## Add per-batter career features: number of innings, average and strike rate (S/R) up to and including that match, number of 30–49 scores, number of 50+ scores (consistency and form are not implemented yet)

In [2]:
# Columns to discard before feature engineering, grouped by what they hold.
bowler_type_columns = [
    'no_of_right_arm_pacers_faced',
    'no_of_left_arm_pacers_faced',
    'no_of_right_arm_offspinners_faced',
    'no_of_right_arm_legspinners_faced',
    'no_of_left_arm_legspinners_faced',
    'no_of_left_arm_offspinners_faced',
]
recent_innings_columns = [
    'score_last_5', 'balls_last_5',
    'score_last_1', 'balls_last_1',
    'score_last_2', 'balls_last_2',
    'score_last_3', 'balls_last_3',
    'score_last_4', 'balls_last_4',
]
columns_to_drop = ['date', 'batter_SR'] + bowler_type_columns + recent_innings_columns

# Mutates df in place; re-running this cell raises KeyError because the columns are already gone.
df.drop(columns=columns_to_drop, inplace=True)
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'dismissal_kind', 'match_type', 'venue'],
      dtype='object')

In [3]:
# Disabled scratch check: is 'dismissal_kind' missing (NaN) at index label len(df)-1?
# pd.isna(df['dismissal_kind'][len(df)-1])

In [4]:
# Binary dismissal flag: 1 where a dismissal kind is recorded, 0 where it is missing.
# NOTE(review): any non-null 'dismissal_kind' counts as out — confirm that is
# intended for every value the column can take.
df['dismissed'] = (~df['dismissal_kind'].isna()).astype(int)

In [5]:
# Preview the 0/1 'dismissed' flag (pandas truncates the display of a long Series)
df['dismissed']

0        0
1        1
2        0
3        1
4        1
        ..
15461    1
15462    1
15463    0
15464    1
15465    0
Name: dismissed, Length: 15466, dtype: int64

In [6]:
# The raw dismissal text is no longer needed once the 0/1 'dismissed' flag exists;
# remove it from df in place.
df.drop(labels='dismissal_kind', axis='columns', inplace=True)

In [7]:
# Confirm 'dismissal_kind' has been removed and 'dismissed' is present
df.columns

Index(['match_id', 'inning', 'bowling_team', 'batter', 'batter_total_balls',
       'batter_score', 'match_type', 'venue', 'dismissed'],
      dtype='object')

In [8]:
# Career-to-date batting features, one value per row. Every cumulative figure
# below includes the current innings.
# NOTE(review): the running totals follow row order within each batter, and the
# 'date' column is dropped earlier in this notebook — confirm the CSV rows are
# already in chronological order.
by_batter = df.groupby('batter')
career_runs = by_batter['batter_score'].cumsum()

# A zero denominator (no dismissal yet / no ball faced yet) would give inf, or
# NaN for 0/0. Mask zeros so an undefined ratio is always NaN rather than inf.
career_dismissals = by_batter['dismissed'].cumsum().replace(0, np.nan)
career_balls = by_batter['batter_total_balls'].cumsum().replace(0, np.nan)

# Number of innings
df['no_of_matches'] = by_batter.cumcount() + 1

# Average (runs per dismissal; NaN until the batter's first dismissal)
df['average'] = career_runs / career_dismissals

# Strike Rate (S/R) (runs per 100 balls; NaN until the first ball faced)
df['strike_rate'] = (career_runs / career_balls) * 100

# # Consistency
# df['consistency'] = df.groupby('batter')['batter_score'].expanding().mean()

# # Form
# df['form'] = df.groupby('batter')['batter_score'].rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)


In [9]:
# Running per-batter milestone tallies, inclusive of the current innings.
# NOTE: despite its name, 'cumulative_30_plus' counts only scores in [30, 50);
# scores of 50 or more are tallied separately in 'cumulative_50_plus'.
# The masks are built on the whole column and cumulated per batter, so each
# result keeps df's own index and aligns on assignment without any index reset.
scored_30_to_49 = (df['batter_score'] >= 30) & (df['batter_score'] < 50)
scored_50_or_more = df['batter_score'] >= 50

df['cumulative_30_plus'] = scored_30_to_49.astype(int).groupby(df['batter']).cumsum()
df['cumulative_50_plus'] = scored_50_or_more.astype(int).groupby(df['batter']).cumsum()

df.tail()

Unnamed: 0,match_id,inning,bowling_team,batter,batter_total_balls,batter_score,match_type,venue,dismissed,no_of_matches,average,strike_rate,cumulative_30_plus,cumulative_50_plus
15461,1370353,2,Gujarat Titans,DP Conway,25.0,47,Final,"Narendra Modi Stadium, Ahmedabad",1,22,46.2,141.284404,5,9
15462,1370353,2,Gujarat Titans,MS Dhoni,1.0,0,Final,"Narendra Modi Stadium, Ahmedabad",1,217,36.826087,135.918695,41,24
15463,1370353,2,Gujarat Titans,RA Jadeja,6.0,15,Final,"Narendra Modi Stadium, Ahmedabad",0,169,25.396226,128.619207,19,2
15464,1370353,2,Gujarat Titans,RD Gaikwad,16.0,26,Final,"Narendra Modi Stadium, Ahmedabad",1,51,40.840909,135.520362,13,15
15465,1370353,2,Gujarat Titans,S Dube,21.0,32,Final,"Narendra Modi Stadium, Ahmedabad",0,47,29.105263,141.794872,6,6


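Criteria, in the order used for the rows and columns of the comparison matrix:

1. No. of matches
2. Average
3. Strike rate (S/R)
4. No. of 30s (scores of 30–49)
5. No. of 50s (scores of 50 or more)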
######
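Scale of relative importance:

- 1 — equal importance
- 3 — moderate importance of one over another
- 5 — essential importance
- 7 — very strong importance
- 9 — extreme importance
- 2, 4, 6, 8 — intermediate values between the adjacent judgements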
######
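|                 | 1 (matches) | 2 (avg) | 3 (S/R) | 4 (30s) | 5 (50s) |
|-----------------|-------------|---------|---------|---------|---------|
| **1 (matches)** | 1           | 1/4     | 1/6     | 1/3     | 1/4     |
| **2 (avg)**     | 4           | 1       | 1/3     | 3       | 2       |
| **3 (S/R)**     | 6           | 3       | 1       | 6       | 5       |
| **4 (30s)**     | 3           | 1/3     | 1/6     | 1       | 1/2     |
| **5 (50s)**     | 4           | 1/2     | 1/5     | 2       | 1       |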



In [10]:
# Pairwise comparison matrix for the five criteria, in the order listed in the
# notes above: matches, average, strike rate, 30s, 50s. Entry [i][j] and entry
# [j][i] are reciprocals, and the diagonal is 1.
# NOTE(review): this was originally labelled "(transposed)", but the values match
# the table in the notes row for row — confirm which orientation later code expects.
pairwise_rows = [
    [1, 1 / 4, 1 / 6, 1 / 3, 1 / 4],
    [4, 1,     1 / 3, 3,     2],
    [6, 3,     1,     6,     5],
    [3, 1 / 3, 1 / 6, 1,     1 / 2],
    [4, 1 / 2, 1 / 5, 2,     1],
]
matrix = np.array(pairwise_rows)

# Scale of relative importance used for the judgements (kept for reference only;
# uncomment to get a `scale` lookup such as scale[1], scale[3], ...):
# scale = {
#     1: "equal importance",
#     3: "moderate of one over another",
#     5: "essential",
#     7: "very strong",
#     9: "extreme importance"
# }