# Project 1: Player Performance (T20 Cricket) – Regression
Goal: Predict **runs_scored** using recent form & context features.

### Tasks
1. Load data and explore distributions.
2. Encode categoricals; train/test split.
3. Baseline **LinearRegression**; improve with **RandomForestRegressor**.
4. Evaluate with **MAE, RMSE, R²**; plot feature importances.
5. Stretch: cross-validation, SHAP (optional), per-player error analysis.

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Load the T20 player-performance dataset; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer='../datasets/cricket_t20_player_performance.csv')
# Preview the first five rows (head()'s default) as the cell output.
df.head(n=5)

In [None]:
# Train-test split and baseline linear-regression pipeline
# Features are every column except the target and the match identifier.
X = df.drop(columns=['runs_scored', 'match_id'])
y = df['runs_scored']
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# One-hot encode the non-numeric columns; remainder='passthrough' forwards the
# numeric columns unchanged. handle_unknown='ignore' encodes categories that
# were not seen during fit as all-zero rows instead of raising at predict time.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

lin = Pipeline([('pre', pre), ('model', LinearRegression())])
lin.fit(X_tr, y_tr)
pred_lin = lin.predict(X_te)

mae = mean_absolute_error(y_te, pred_lin)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new versions.
rmse = float(np.sqrt(mean_squared_error(y_te, pred_lin)))
r2 = r2_score(y_te, pred_lin)
print({'MAE': mae, 'RMSE': rmse, 'R2': r2})

In [None]:
# Random Forest Regressor
# NOTE: `pre` is the same transformer object used by the linear pipeline;
# fitting here refits it on the same training rows.
rf = Pipeline([('pre', pre), ('model', RandomForestRegressor(n_estimators=300, random_state=42))])
rf.fit(X_tr, y_tr)
pred_rf = rf.predict(X_te)

mae = mean_absolute_error(y_te, pred_rf)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = float(np.sqrt(mean_squared_error(y_te, pred_rf)))
r2 = r2_score(y_te, pred_rf)
print({'MAE': mae, 'RMSE': rmse, 'R2': r2})

# Feature importances: the forest's built-in (impurity-based) importances,
# labelled with the names of the transformed columns (one-hot columns plus
# passed-through numeric columns) so each bar maps back to an input feature.
feat_names = rf.named_steps['pre'].get_feature_names_out()
importances = pd.Series(rf.named_steps['model'].feature_importances_, index=feat_names)
top_importances = importances.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(8, 5))
# Reverse so the most important feature is drawn at the top of the chart.
top_importances[::-1].plot.barh(ax=ax)
ax.set_xlabel('Importance')
ax.set_title('Random Forest: top feature importances')
fig.tight_layout()
plt.show()

print('Trained RF model. You can use permutation_importance for insights.')

In [None]:
# Per-player error analysis
# Build the frame positionally: y_te keeps the shuffled original row labels
# from the train/test split while pred_rf is a plain array in the same row
# order. Concatenating a reset-index player column with a frame that still
# carries y_te's labels would align on index and pair players with the wrong
# rows (and introduce NaN rows), so every column is converted to an array and
# the test-row labels are attached once.
res = pd.DataFrame(
    {
        'player': df.loc[y_te.index, 'player'].to_numpy(),
        'y_true': y_te.to_numpy(),
        'y_pred': pred_rf,
    },
    index=y_te.index,
)
res['abs_err'] = (res['y_true'] - res['y_pred']).abs()

# Ten players with the largest mean absolute error on the held-out rows.
print(res.groupby('player')['abs_err'].mean().sort_values(ascending=False).head(10))