## Cricket Data [KNN]:

- Working with generated Dataset: data generation script inside `datasets` directory.


Dataset Details:

Target Class:
Player Name (id) - Mapping:
- 1 : "Virat Kohli"
- 2 : "Rohit Sharma"
- 3 : "Shubman Gill"
- 4 : "Yashasvi Jaiswal"
- 5 : "KL Rahul"

Library imports and dataset load.

In [173]:
import numpy as np, pandas as pd
import seaborn as sns, matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
# 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [174]:
data = pd.read_csv('datasets/cricket_knn_indian_players_balanced.csv')
data.head()

Unnamed: 0,temperature,wind_speed,opposition_strength,best_batter,pitch_dry,pitch_dusty,pitch_green,pitch_hard
0,17.773839,26.25072,2,1,True,False,False,False
1,29.046128,6.664756,4,1,True,False,False,False
2,33.747134,18.069309,5,1,True,False,False,False
3,19.46508,28.431556,2,1,True,False,False,False
4,25.59446,29.967055,7,1,True,False,False,False


In [175]:
batters = [
    {'id': 1, 'name': 'Virat Kohli'},
    {'id': 2, 'name': 'Rohit Sharma'},
    {'id': 3, 'name': 'Shubman Gill'},
    {'id': 4, 'name': 'Yashasvi Jaiswal'},
    {'id': 5, 'name': 'KL Rahul'},
]

### Basic Visualization:

Correlation:

In [176]:
px.imshow(data.corr(), text_auto=True)

mask = np.triu(np.ones_like(data.corr(), dtype=bool))
rLT = data.corr().mask(mask)

heat = go.Heatmap(
    z = rLT,
    # text = rLT.apply(lambda x: round(x, 2) if str(x).lower() != str(np.nan)),
    x = rLT.columns.values,
    y = rLT.columns.values,
    zmin= -0.25, zmax=1,
    xgap=1, ygap=1,
    colorscale='RdBu',
    texttemplate='%{text}'
)
title = "Cricket Data Correlation:"
layout = go.Layout(
    title_text = title,
    title_x = 0.5,
    width=600, 
    height=600,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    yaxis_autorange='reversed'
)

fig=go.Figure(data=[heat], layout=layout)
fig.show()

In [177]:
px.histogram(data, x='best_batter', color='best_batter')

In [178]:
data['best_batter'].value_counts()

best_batter
1    1000
2    1000
3    1000
4    1000
5    1000
Name: count, dtype: int64

In [179]:
# data = data.drop('opposition_strength', inplace=True)

In [180]:
px.scatter(data, x='wind_speed', y='temperature', hover_data=['best_batter', 'temperature', 'wind_speed'], color='best_batter')

In [181]:
data.groupby('opposition_strength').count()['best_batter']

opposition_strength
1     393
2     390
3     393
4     395
5     385
6     403
7     414
8     774
9     728
10    725
Name: best_batter, dtype: int64

In [182]:
px.density_heatmap(data, x='opposition_strength', y = 'best_batter', title='Batter Performance Based on Opposition Strength')

### Data Preparation:

In [219]:
X = data[['temperature', 'wind_speed', 'pitch_green', 'pitch_dry', 'pitch_hard', 'pitch_dusty']]
y = data['best_batter']

Data Splitting:

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

print(f"""
Train Data Shape: {X_train.shape}
Test Data Shape: {X_test.shape}      
""")


Train Data Shape: (4000, 6)
Test Data Shape: (1000, 6)      



Standardizing the data:

In [231]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# X_train_scaled = X_train
# X_test_scaled = X_test

KNN Model Load and Fit the data....

In [232]:
from sklearn.neighbors import KNeighborsClassifier

In [244]:
accuracies = []
k_indices = range(1,201)

for k in k_indices:
    knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski')
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    accuracies.append(accuracy_score(y_test, y_pred))

In [245]:
fig = go.Figure()
fig.update_layout(title = "Accuracy v/s k_values:")
fig.add_trace(
    go.Scatter(
        x = list(k_indices),
        y = accuracies
    )
)

In [246]:
best_k_value = list(k_indices)[accuracies.index(max(accuracies))]
print(f"Best K: {best_k_value}")

knn = KNeighborsClassifier(n_neighbors=best_k_value)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


Best K: 42


In [247]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.59      1.00      0.75       208
           2       0.56      0.98      0.72       190
           3       0.47      0.14      0.22       193
           4       0.50      0.46      0.48       202
           5       0.51      0.18      0.27       207

    accuracy                           0.55      1000
   macro avg       0.53      0.55      0.49      1000
weighted avg       0.53      0.55      0.49      1000



---