## Cricket Dataset:

- Applying the KNN to decide the batter's position.

- Dataset source: [kaggle 🔗](https://www.kaggle.com/datasets/notkrishna/cricket-statistics-for-all-formats)
---

Imports & Data Load:

In [47]:
import os
from typing import Literal
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [48]:
# data load:

# data_files = ['odb', 'tb', 'twb']

odi = pd.read_csv(os.path.join('dataset','Batting', 'ODI data.csv'))
test = pd.read_csv(os.path.join('dataset','Batting', 'test.csv'))
t20 = pd.read_csv(os.path.join('dataset','Batting', 't20.csv'))

In [49]:
print(f"ODI Batting Data Sample:")
odi.head(5)

ODI Batting Data Sample:


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,
1,1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,
3,3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,
4,4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,


In [50]:
print(f"Test Format Batting Data Sample:")
test.head(5)

Test Format Batting Data Sample:


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,100,50,0,Unnamed: 11
0,0,SR Tendulkar (INDIA),1989-2013,200,329,33,15921,248*,53.78,51,68,14,
1,1,RT Ponting (AUS),1995-2012,168,287,29,13378,257,51.85,41,62,17,
2,2,JH Kallis (ICC/SA),1995-2013,166,280,40,13289,224,55.37,45,58,16,
3,3,R Dravid (ICC/INDIA),1996-2012,164,286,32,13288,270,52.31,36,63,8,
4,4,AN Cook (ENG),2006-2018,161,291,16,12472,294,45.35,33,57,9,


In [51]:
print(f"T-20 Batting Data Sample.")
t20.head(5)

T-20 Batting Data Sample.


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
0,0,V Kohli (INDIA),2010-2019,75,70,20,2633,94*,52.66,1907,138.07,0,24,2,247,71,
1,1,RG Sharma (INDIA),2007-2019,104,96,14,2633,118,32.1,1905,138.21,4,19,6,234,120,
2,2,MJ Guptill (NZ),2009-2019,83,80,7,2436,105,33.36,1810,134.58,2,15,2,215,113,
3,3,Shoaib Malik (ICC/PAK),2006-2019,111,104,30,2263,75,30.58,1824,124.06,0,7,1,186,61,
4,4,BB McCullum (NZ),2005-2015,71,70,10,2140,123,35.66,1571,136.21,2,13,3,199,91,


---
Data Cleaning pipeline & Format Col. addition:


In [52]:
def clean_data(df: pd.DataFrame, format_name: Literal['ODI', 'TEST', 'T20']):
    df = df.rename(columns= lambda x: x.strip())
    df['Format'] = format_name
    df.drop(columns=["Unnamed: 0"], errors='ignore', inplace=True)
    return df

In [53]:
odi = clean_data(odi, 'ODI')
test = clean_data(test, 'TEST')
t20 = clean_data(t20, 'T20')

In [54]:
print(f"ODI Data Shape: {odi.shape}")
print(f"TEST Data Shape: {test.shape}")
print(f"T20 Data Shape: {t20.shape}")

ODI Data Shape: (2500, 15)
TEST Data Shape: (3001, 13)
T20 Data Shape: (2006, 17)


---
Merge the data:

In [55]:
# All Formats Merged Data:
df_batting = pd.concat([odi, t20, test], ignore_index=True)

# Limited Overs:
ltd_overs_batting_df = pd.concat([odi, t20], ignore_index=True)

# Unlimited Overs:
ultd_batting_df = test
print(f"Merged Limited Overs Batting data shape: {ltd_overs_batting_df.shape}")

Merged Limited Overs Batting data shape: (4506, 18)


---
Preprocessing: Meaningful data to numeric conversion:

In [56]:
ltd_overs_batting_df.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
       '100', '50', '0', 'Unnamed: 13', 'Format', '4s', '6s', 'Unnamed: 15'],
      dtype='object')

In [57]:
ultd_batting_df.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', '100', '50',
       '0', 'Unnamed: 11', 'Format'],
      dtype='object')

In [58]:
test.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', '100', '50',
       '0', 'Unnamed: 11', 'Format'],
      dtype='object')

In [59]:

cols = ['Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR', '100', '50', '0']
for col in cols:
    t20[col] = pd.to_numeric(t20[col].astype(str).str.replace('*',''), errors='coerce')
    # test[col]= pd.to_numeric(test[col].astype(str).str.replace('*',''), errors='coerce')
    odi[col] = pd.to_numeric(odi[col].astype(str).str.replace('*',''), errors='coerce')
    ltd_overs_batting_df[col] = pd.to_numeric(ltd_overs_batting_df[col].astype(str).str.replace("*", ''), errors='coerce')
    
for col in ['Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', '100', '50','0']:
    test[col] = pd.to_numeric(test[col].astype(str).str.replace('*',''), errors='coerce')

In [60]:
ltd_overs_batting_df.isnull().sum()

Player            0
Span              0
Mat               0
Inns            162
NO              162
Runs            162
HS              162
Ave             458
BF              162
SR              194
100             162
50              162
0               162
Unnamed: 13    4506
Format            0
4s             2500
6s             2500
Unnamed: 15    4506
dtype: int64

---
Feature Engineering [White Ball Matches Stats]: 
- Creating new features based on averages, strike rates, 4s, 6s.

- Planning to create features like consistency, boundary runs & percentage, aggresion index, etc...

In [61]:

# 1. Boundary Runs:
# ltd_overs_batting_df['Boundary Runs'] = ltd_overs_batting_df['4s']*4 + ltd_overs_batting_df['6s']*6

# 2. Boundary Precentage:
# ltd_overs_batting_df['Boundary Rate'] = ltd_overs_batting_df['Boundary Runs'] / ltd_overs_batting_df['Runs']

# 3. Aggression Index:
ltd_overs_batting_df['Frequency 50 100'] = (ltd_overs_batting_df['50'] + ltd_overs_batting_df['100']) / ltd_overs_batting_df['Inns']   # big innings
ltd_overs_batting_df['Aggression Index'] = ltd_overs_batting_df['SR']*ltd_overs_batting_df['Frequency 50 100'] 

# 4. Consistency:
ltd_overs_batting_df['Consistency'] = ltd_overs_batting_df["Ave"] * (ltd_overs_batting_df["Inns"] - ltd_overs_batting_df['0']) / ltd_overs_batting_df["Inns"]

# 5. Experience:
# ltd_overs_batting_df['Experience Years'] = ltd_overs_batting_df["Span"].apply(lambda x: abs(int(x.split('-')[1]) - int(x.split('-')[0])))

# 6. Not Out Ratio:
ltd_overs_batting_df['NO Ratio'] = ltd_overs_batting_df["NO"] / ltd_overs_batting_df["Inns"]

# Null Values drop:

ltd_overs_batting_df = ltd_overs_batting_df.dropna(subset=['Runs', 'Ave', 'SR'])
ltd_overs_batting_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4041 entries, 0 to 4352
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Player            4041 non-null   object 
 1   Span              4041 non-null   object 
 2   Mat               4041 non-null   int64  
 3   Inns              4041 non-null   float64
 4   NO                4041 non-null   float64
 5   Runs              4041 non-null   float64
 6   HS                4041 non-null   float64
 7   Ave               4041 non-null   float64
 8   BF                4041 non-null   float64
 9   SR                4041 non-null   float64
 10  100               4041 non-null   float64
 11  50                4041 non-null   float64
 12  0                 4041 non-null   float64
 13  Unnamed: 13       0 non-null      float64
 14  Format            4041 non-null   object 
 15  4s                1672 non-null   object 
 16  6s                1672 non-null   object 
 17  

### Basic Analysis:


Top Scorers:

In [62]:
top_t20 = t20.sort_values('Runs', ascending=False).head(5)
top_test = test.sort_values('Runs', ascending=False).head(5)
top_odi = odi.sort_values('Runs', ascending=False).head(5)
fig = make_subplots(rows=3, cols=1, row_heights=[1,1,1], subplot_titles=(
    "T-20",
    "ODI",
    "Test",
))
fig.update_layout(title='Top 5 batters (By runs)',
                  autosize=False,
                  width=750,height=650)
fig.add_bar(x = top_t20['Runs'], y = top_t20['Player'], orientation='h', row=1, col=1, name='T20')
fig.add_bar(x = top_odi['Runs'], y = top_odi['Player'], orientation='h', row=2, col=1, name='ODI')
fig.add_bar(x = top_test['Runs'], y = top_test['Player'], orientation='h', row=3, col=1, name='TEST')
fig.show()

Aggression v/s Consistency [In white Ball Cricket]:

In [63]:
ltd_overs_batting_df.columns

Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
       '100', '50', '0', 'Unnamed: 13', 'Format', '4s', '6s', 'Unnamed: 15',
       'Frequency 50 100', 'Aggression Index', 'Consistency', 'NO Ratio'],
      dtype='object')

In [64]:
fig = px.scatter(
    ltd_overs_batting_df,
    x = 'Consistency', 
    y = 'Aggression Index',
    color = 'Format',
    hover_data=['Player', 'Runs','Aggression Index', 'Consistency' ,'Ave', '100', '50'])
fig.show()

In [38]:
fig = px.scatter(
    ltd_overs_batting_df,
    x = 'Runs', 
    y = 'Inns',
    color = 'Format',
    hover_data=['Player', 'Runs' ,'Ave', '100', '50'])
fig.show()

In [65]:
# Outliers check
cols = (
    'Inns',
    'Mat',
    'Runs', 
    "Ave", 
    "SR", 
    "100", 
    "50", 
    "0" , 
)

fig = make_subplots(rows=4, cols=2, subplot_titles=cols)
fig.update_layout(title='Outliers:')
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[0]]), row=1, col=1)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[1]]), row=1, col=2)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[2]]), row=2, col=1)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[3]]), row=2, col=2)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[4]]), row=3, col=1)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[5]]), row=3, col=2)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[6]]), row=4, col=1)
fig.add_trace(go.Box(y = ltd_overs_batting_df[cols[7]]), row=4, col=2)

---
Feature Selection:

In [22]:
ltd_overs_batting_df.head(3).T

Unnamed: 0,0,1,2
Player,SR Tendulkar (INDIA),KC Sangakkara (Asia/ICC/SL),RT Ponting (AUS/ICC)
Span,1989-2012,2000-2015,1995-2012
Mat,463,404,375
Inns,452.0,380.0,365.0
NO,41.0,41.0,39.0
Runs,18426.0,14234.0,13704.0
HS,200.0,169.0,164.0
Ave,44.83,41.98,42.03
BF,21367.0,18048.0,17046.0
SR,86.23,78.86,80.39


In [23]:
features = [
    'Inns',
    'Mat',
    'Runs', 
    "Ave", 
    "SR", 
    "100", 
    "50", 
    "0" , 
]


ltd_overs_df_model = ltd_overs_batting_df[features].dropna()
ltd_overs_df_model.shape

(4041, 8)

In [24]:
ltd_overs_df_model.isnull().sum()

Inns    0
Mat     0
Runs    0
Ave     0
SR      0
100     0
50      0
0       0
dtype: int64

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# X = scaler.fit_transform(ltd_overs_df_model)

In [120]:
# Assiging thr Roles [Target Positions]:
def assign_position(r) -> Literal['Opener', 'Middle', 'Finisher']:
    if r['SR'] >= 135 :
    # if r['SR'] >= 135 and r['Ave'] < 30:
        return 'Finisher'
    elif r['Ave'] > 35 and r['SR'] > 90:
        return 'Opener'
    else:
        return 'Middle'

# def assign_position(row):
#     if row["SR"] >= 110 and row["Aggression Index"] >= 55:
#         return "Finisher"
#     elif row["Consistency"] > 35 and row["SR"] > 85:
#         return "Opener"
#     else:
#         return "Middle"
    
ltd_overs_batting_df["Role"] = ltd_overs_batting_df.apply(assign_position, axis=1)
ltd_overs_df_model["Role"] = ltd_overs_batting_df["Role"]

In [121]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

ltd_overs_df_model['Encoded Role'] = encoder.fit_transform(ltd_overs_df_model['Role'])
# ltd_overs_df_model
# Finisher: 0
# Middle: 1
# Opener: 2

In [122]:
X_df = ltd_overs_df_model.drop(columns=['Role', 'Encoded Role'])

In [123]:
X_df

# Handling Skewness:
cols = [
    'Inns',
    'Mat',
    'Runs', 
    # "Ave", 
    # "SR", 
    # "100", 
    # "50", 
    # "0" , 
]

for col in cols:
    X_df[col] = np.log1p(X_df[col])     

In [124]:
X = scaler.fit_transform(X_df)
# X = ltd_overs_df_model.drop(columns=['Role', 'Encoded Role'])
y = ltd_overs_df_model['Encoded Role']

In [125]:
ltd_overs_df_model['Encoded Role'].value_counts()

Encoded Role
1    3714
0     229
2      98
Name: count, dtype: int64

In [119]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

--- 
Data is imbalanced so I will first undersample and later over sample the data to avoid major fake values [dummy model thing].

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score

# smote = SMOTE(random_state=0)
# X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)


under = RandomUnderSampler(sampling_strategy={1:1000})
X_under, y_under = under.fit_resample(X_train, y_train)
print(f"UnderSampled Shape: {X_under.shape} {y_under.value_counts()}")
over = SMOTE(sampling_strategy={0: 1000, 2: 1000})
X_balanced, y_balanced = over.fit_resample(X_under, y_under)

pd.DataFrame(y_balanced).value_counts()

UnderSampled Shape: (1200, 8) Encoded Role
1    1000
0     126
2      74
Name: count, dtype: int64


Encoded Role
0               1000
1               1000
2               1000
Name: count, dtype: int64

In [85]:
# Oversampled Data:
y_balanced.value_counts()

Encoded Role
0    1000
1    1000
2    1000
Name: count, dtype: int64

In [90]:
accuracies = []
k_values = range(4,12)

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_balanced, y_balanced)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

In [None]:
fig = go.Figure()
fig.update_layout(title = "Accuracy v/s k_values:")
fig.add_trace(
    go.Scatter(
        x = list(k_values),
        y = accuracies
    )
)

In [None]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_balanced, y_balanced)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"Accuracy Scoere")

              precision    recall  f1-score   support

           0       0.14      0.66      0.23        29
           1       0.98      0.80      0.88       764
           2       0.26      0.69      0.37        16

    accuracy                           0.80       809
   macro avg       0.46      0.72      0.49       809
weighted avg       0.93      0.80      0.85       809



In [102]:
X_test.shape

(809, 8)

In [107]:
test_df = pd.DataFrame([
    {
        'Inns': 4,
        'Mat' : 3,             
        'Runs' : 127,           
        'Ave' : 63.5,              
        'SR' : 89.43,              
        '100' : 0,              
        '50' : 2,               
        '0' : 0,  
    }]
)

In [108]:
sai_sudarshan_transformed_df = scaler.transform(test_df)


In [109]:
knn.predict(sai_sudarshan_transformed_df)

array([2])

In [110]:
encoder.inverse_transform(knn.predict(sai_sudarshan_transformed_df))

array(['Opener'], dtype=object)