In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Input locations. These are machine-specific absolute paths; adjust them if
# the data is stored elsewhere.
FEATURES_PATH = r"C:\Users\Chiranjeev Singh\Desktop\New folder (2)\data\player_features.csv"
MATCHES_PATH = r"C:\Users\Chiranjeev Singh\Desktop\New folder (2)\data\raw\matches.csv"

# Columns kept out of the feature matrix: the raw target ('fantasy_points',
# which the label below is derived from) plus identifier/context columns.
drop_cols = [
    'fantasy_points',
    'player',
    'batting_team',
    'bowling_team',
    'team1',
    'team2',
    'toss_winner',
    'toss_decision',
    'venue',
    'opponent',
    'innings_type',
]

# Read the per-player feature table and keep only rows that have both a
# score and a match to rank within.
df = pd.read_csv(FEATURES_PATH).dropna(subset=['fantasy_points', 'match_id'])

# Label: 1 for the 11 highest fantasy scorers of each match, 0 otherwise.
# method='first' breaks ties by row order, so at most 11 rows per match get 1.
points_rank = df.groupby('match_id')['fantasy_points'].rank(method='first', ascending=False)
df['in_dream11'] = (points_rank <= 11).astype(int)

# Load match metadata and reduce it to one (match_id, year) row per match.
matches = pd.read_csv(MATCHES_PATH)

# Prefer 'season' as the year; fall back to the calendar year of 'date'.
if 'season' in matches.columns:
    season_year = pd.to_numeric(matches['season'], errors='coerce')
    if 'date' in matches.columns:
        # A season label that is not a plain number (for example a
        # split-season string) becomes NaN above, which would silently remove
        # the match from both the train and the test split. Recover the year
        # from the match date where one is available.
        date_year = pd.to_datetime(matches['date'], errors='coerce').dt.year
        season_year = season_year.fillna(date_year)
    matches = matches[['id']].assign(year=season_year).rename(columns={'id': 'match_id'})
elif 'date' in matches.columns:
    matches['year'] = pd.to_datetime(matches['date']).dt.year
    matches = matches[['id', 'year']].rename(columns={'id': 'match_id'})
else:
    raise KeyError("Neither 'season' nor 'date' column found in matches.csv")

# Merge the year into the player features.
# - Dropping any existing 'year' column first keeps this cell re-runnable: a
#   second merge would otherwise create 'year_x'/'year_y' and break df['year'].
# - validate='many_to_one' raises if matches.csv repeats a match id, instead
#   of silently duplicating player rows.
df = df.drop(columns=['year'], errors='ignore').merge(
    matches, on='match_id', how='left', validate='many_to_one'
)
df['year'] = pd.to_numeric(df['year'], errors='coerce')  # ensure numeric

# Rows without a usable year match neither split condition downstream, so
# report them rather than losing them silently.
rows_without_year = int(df['year'].isna().sum())
if rows_without_year:
    print(f"Warning: {rows_without_year} player rows have no usable year and will be excluded from train and test.")


# Chronological hold-out: rows with year <= 2023 are used for fitting and
# rows with year == 2024 for evaluation.
train_mask = df['year'] <= 2023
test_mask = df['year'] == 2024
train_df = df[train_mask]
test_df = df[test_mask]

# Columns that never reach the model: the configured drop list (only those
# actually present in df) plus the label and the split/join keys.
excluded_cols = [name for name in drop_cols if name in df.columns]
excluded_cols.extend(['in_dream11', 'match_id', 'year'])


def _numeric_feature_matrix(frame):
    """Drop the excluded columns, keep numeric columns only and fill NaN with 0."""
    features = frame.drop(columns=excluded_cols)
    features = features.select_dtypes(include=[np.number])
    return features.fillna(0)


# Feature matrices and targets for both splits.
X_train = _numeric_feature_matrix(train_df)
y_train = train_df['in_dream11']

X_test = _numeric_feature_matrix(test_df)
y_test = test_df['in_dream11']

# Candidate classifiers, compared by the F1 score of the positive class
# (in_dream11 == 1) on the 2024 hold-out.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=150, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=150, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=5000)
}

# Fail fast with a clear message instead of erroring inside scikit-learn
# after the first (slow) fit.
if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError(
        f"Empty split: {len(X_train)} training rows, {len(X_test)} test rows. "
        "Check the 'year' column produced by the matches merge."
    )

print("\n🧪 Model Evaluation (2024 test):\n")
best_model = None
best_model_name = None
# Start below any attainable F1 so the first model is always recorded, even
# when every model scores 0. Otherwise best_model would stay None and
# best_model_name would be undefined at the final print.
best_f1 = float('-inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    report = classification_report(y_test, preds, output_dict=True, zero_division=0)
    # The report only has a '1' entry when class 1 occurs in y_test or in the
    # predictions; treat its absence as an F1 of 0 instead of a KeyError.
    f1 = report.get('1', {}).get('f1-score', 0.0)
    print(f"📊 {name} - F1 Score: {f1:.4f}")

    # Strict '>' keeps the earliest model on ties.
    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = name

print(f"\n✅ Best Model: {best_model_name} (F1 = {best_f1:.4f})")



🧪 Model Evaluation (2024 test):

📊 RandomForest - F1 Score: 0.9056
📊 GradientBoosting - F1 Score: 0.9080
📊 LogisticRegression - F1 Score: 0.9100

✅ Best Model: LogisticRegression (F1 = 0.9100)


In [59]:
from pathlib import Path

import joblib

# Destination of the serialized model (machine-specific absolute path).
MODEL_PATH = r'C:\Users\Chiranjeev Singh\Desktop\New folder (2)\models\model_dream11.pkl'

# `best_model` is set by the model-comparison cell; refuse to write a pickle
# containing None if that cell did not select a model.
if best_model is None:
    raise ValueError("best_model is None - run the model comparison cell before saving.")

# joblib.dump does not create missing directories.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Last expression: the list of written file names is shown as the cell output.
joblib.dump(best_model, MODEL_PATH)


['C:\\Users\\Chiranjeev Singh\\Desktop\\New folder (2)\\models\\model_dream11.pkl']