In [5]:
# ===================== 1. Install & Import Libraries =====================
# Uncomment if needed:
# !pip install xgboost

import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ===================== 2. Load CSV Files =====================
match = pd.read_csv('matches.csv')
delivery = pd.read_csv('deliveries.csv')

# ===================== 3. Replace old team names =====================
team_map = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Gujarat Lions': 'Gujarat Titans',
    'Kings XI Punjab': 'Punjab Kings'
}
match.replace({'team1': team_map, 'team2': team_map, 'winner': team_map}, inplace=True)

# ===================== 4. Filter consistent teams =====================
teams = [
    "Chennai Super Kings", "Mumbai Indians", "Royal Challengers Bangalore",
    "Kolkata Knight Riders", "Sunrisers Hyderabad", "Delhi Capitals",
    "Rajasthan Royals", "Punjab Kings", "Lucknow Super Giants", "Gujarat Titans"
]
match = match[match['team1'].isin(teams) & match['team2'].isin(teams)]

# ===================== 5. First Innings Total =====================
first_innings_score = delivery.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
first_innings_score = first_innings_score[first_innings_score['inning'] == 1]
match = match.merge(first_innings_score[['match_id', 'total_runs']], left_on='id', right_on='match_id', how='left')

# ===================== 6. Filter Second Innings =====================
delivery = delivery[delivery['inning'] == 2]

# Merge with match details
match_data = match[['id', 'city', 'winner', 'total_runs']]
delivery = delivery.merge(match_data, left_on='match_id', right_on='id')

# ===================== 7. Compute Features =====================
# Current score as cumulative total_runs_x for second innings deliveries
delivery['current_score'] = delivery.groupby('match_id')['total_runs_x'].cumsum()

# Run left and balls left
delivery['run_left'] = delivery['total_runs_y'] - delivery['current_score']
delivery['ball_number'] = (delivery['over'] - 1) * 6 + delivery['ball']
delivery['balls_left'] = 120 - delivery['ball_number']

# Wickets left
delivery['wicket'] = delivery['dismissal_kind'].notnull().astype(int)
delivery['wickets'] = 10 - delivery.groupby('match_id')['wicket'].cumsum()

# CRR and RRR with protection against division by zero
delivery['crr'] = delivery['current_score'] * 6 / (delivery['ball_number'])
delivery['rrr'] = delivery['run_left'] * 6 / (delivery['balls_left'])

# Replace infinite and NaN values resulting from division by zero or invalid data
delivery.replace([np.inf, -np.inf], np.nan, inplace=True)

# Result (1 if batting team wins, 0 otherwise)
delivery['result'] = (delivery['batting_team'] == delivery['winner']).astype(int)

# ===================== 8. Select Final Data =====================
final_data = delivery[['batting_team', 'bowling_team', 'city',
                       'run_left', 'balls_left', 'wickets',
                       'crr', 'rrr', 'result']]

# Drop NaNs safely by re-assigning to avoid SettingWithCopyWarning
final_data = final_data.dropna().reset_index(drop=True)

# Shuffle dataset
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# ===================== 9. Train/Test Split =====================
X = final_data.drop('result', axis=1)
y = final_data['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===================== 10. Create Pipeline =====================
ct = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['batting_team', 'bowling_team', 'city']),
    ('scale', StandardScaler(), ['run_left', 'balls_left', 'wickets', 'crr', 'rrr'])
])

pipeline = Pipeline([
    ('preprocessing', ct),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, solver='liblinear'))
])

# ===================== 11. Train Model =====================
pipeline.fit(X_train, y_train)

# ===================== 12. Evaluate Model =====================
y_pred = pipeline.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ===================== 13. Make Single Prediction =====================
print("🔮 Prediction Probability (sample):", pipeline.predict_proba(X_test.iloc[[0]]))

# ===================== 14. Save Model =====================
with open('ipl_win_predictor_v2.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("💾 Model saved as 'ipl_win_predictor_v2.pkl'")


✅ Accuracy: 0.8564356435643564

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87     12186
           1       0.82      0.85      0.84      9226

    accuracy                           0.86     21412
   macro avg       0.85      0.86      0.85     21412
weighted avg       0.86      0.86      0.86     21412

✅ Confusion Matrix:
 [[10528  1658]
 [ 1416  7810]]
🔮 Prediction Probability (sample): [[0.97402916 0.02597084]]
💾 Model saved as 'ipl_win_predictor_v2.pkl'
