In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [5]:
# Read the raw inputs: per-innings batting rows and the ODI bowling table
# (the bowling file is decoded as latin1).
batting_df = pd.read_csv("public/players.csv")
bowling_df = pd.read_csv("public/odi_bowling.csv", encoding='latin1')

# Canonicalise the join key in both frames (trim whitespace, lowercase) so the
# merge below is not defeated by formatting differences in player names.
for _frame, _name_col in ((batting_df, "Player_Name"), (bowling_df, "Player Name")):
    _frame[_name_col] = _frame[_name_col].str.strip().str.lower()

# Collapse the bowling table to one row per player: mean overs / wickets /
# economy and the summed match count. The grouping key is renamed so it lines
# up with the batting frame's "Player_Name" column.
bowling_agg = (
    bowling_df
    .groupby("Player Name")
    .agg(
        Avg_Overs=("Overs", "mean"),
        Avg_Wkts=("Wkts", "mean"),
        Avg_Econ=("Econ", "mean"),
        Total_Matches=("matches", "sum"),
    )
    .reset_index()
    .rename(columns={"Player Name": "Player_Name"})
)

# Left join keeps every batting row; players absent from the bowling table get
# NaN bowling stats, which are then replaced by 0.
# NOTE(review): fillna(0) is applied to the whole frame, so a NaN in any
# batting column is also turned into 0 - confirm that is intended.
# NOTE(review): every previewed row shows 0.0 for all bowling stats - verify
# that player names actually match between the two files.
merged_df = pd.merge(
    batting_df, bowling_agg, on="Player_Name", how="left"
).fillna(0)

# Show the first rows of the merged frame
merged_df.head()


Unnamed: 0,Player_Name,Player_Type,Role,Runs,Balls_Faced,Strike_Rate,Opponent_Team,Opponent_Bowler,Bowler_Type,Bowler_Variation,...,Ground_Type,Ground_Dimensions,Pitch_Type,Weather,Shot_Strength,Weakness,Avg_Overs,Avg_Wkts,Avg_Econ,Total_Matches
0,dananjaya de silva,All-rounder,6,24,38,63.15,India,Siraj,Right Arm Fast,Bouncer,...,Balanced,74m x 78m,Balanced,Humid,On-side,Right Arm Mystery,0.0,0.0,0.0,0.0
1,pathum nissanka,Batsman,3,6,17,35.29,Bangladesh,Mehidy,Right Arm Off Spin,Off Break,...,Bowling Friendly,70m x 72m,Bowling Friendly,Dry,Off-side,Left Arm Chinaman,0.0,0.0,0.0,0.0
2,dananjaya de silva,All-rounder,6,46,59,77.96,England,Topley,Left Arm Medium,Swing,...,Batting Friendly,75m x 78m,Batting Friendly,Cloudy,Short Ball,Left Arm Chinaman,0.0,0.0,0.0,0.0
3,pathum nissanka,Batsman,3,24,25,96.0,South Africa,Fortuin,Left Arm Orthodox,Arm Ball,...,Balanced,72m x 76m,Balanced,Hot,Short Ball,Right Arm Fast,0.0,0.0,0.0,0.0
4,kusal mendis,Batsman,4,23,51,45.09,India,Bumrah,Right Arm Fast,Yorker,...,Batting Friendly,75m x 78m,Batting Friendly,Cloudy,Off-side,Left Arm Chinaman,0.0,0.0,0.0,0.0


In [6]:
# Fit one LabelEncoder per categorical column: "Role" (the prediction target)
# and "Player_Type" (used as an input feature).
role_encoder = LabelEncoder().fit(merged_df["Role"])
type_encoder = LabelEncoder().fit(merged_df["Player_Type"])

# Attach the integer codes as new columns on the merged frame
merged_df["Role_encoded"] = role_encoder.transform(merged_df["Role"])
merged_df["Player_Type_encoded"] = type_encoder.transform(merged_df["Player_Type"])

# Persist both fitted encoders so the same mapping can be reused at inference
# time. The "ml/" directory must already exist.
joblib.dump(role_encoder, "ml/role_encoder.pkl")
joblib.dump(type_encoder, "ml/type_encoder.pkl")

['ml/type_encoder.pkl']

In [7]:
# Columns fed to the classifier: three batting measures, the encoded player
# type, and the four per-player bowling aggregates.
features = [
    "Runs",
    "Balls_Faced",
    "Strike_Rate",
    "Player_Type_encoded",
    "Avg_Overs",
    "Avg_Wkts",
    "Avg_Econ",
    "Total_Matches",
]

# Feature matrix and target vector (target = label-encoded role)
X = merged_df.loc[:, features]
y = merged_df.loc[:, "Role_encoded"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is neither stratified nor grouped by player. The
# preview shows the same player on several rows and some roles are rare, so
# consider stratify=y and/or a group-aware split - confirm before changing,
# since it will alter the reported metrics.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit returns the estimator itself, so the calls can be chained).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Persist the fitted model. The "ml/" directory must already exist.
joblib.dump(model, "ml/model.pkl")

['ml/model.pkl']

In [9]:
# Predict the encoded role for every held-out row
y_pred = model.predict(X_test)

# LabelEncoder maps its classes to the codes 0..n_classes-1, in the order of
# `classes_`. Passing those codes explicitly as `labels` keeps the report
# aligned with `target_names` even when a rare role is missing from both
# y_test and y_pred; without it, the length mismatch makes
# classification_report raise.
role_labels = list(range(len(role_encoder.classes_)))

# Per-class precision / recall / F1, labelled with the original role values
print(classification_report(
    y_test, y_pred,
    labels=role_labels,
    target_names=[str(cls) for cls in role_encoder.classes_]
))

              precision    recall  f1-score   support

           1       0.93      0.90      0.91       283
           2       0.66      0.82      0.73       142
           3       0.92      0.88      0.90       591
           4       0.92      0.89      0.91       322
           5       0.93      0.95      0.94       360
           6       0.97      0.89      0.93       196
           7       0.89      1.00      0.94        93
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         6

    accuracy                           0.90      2000
   macro avg       0.91      0.93      0.92      2000
weighted avg       0.91      0.90      0.90      2000

