In [13]:
#  Step 1: Import Libraries
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [14]:
# Location of the ball-by-ball CSV. Set the IPL_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset the original
# author-local path is used, so existing behaviour is unchanged.
DATA_PATH = Path(
    os.environ.get(
        "IPL_DATA_PATH",
        r"C:\Users\admin\Desktop\ipl_winning_prediction\ipl_colab.csv",
    )
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,mid,date,venue,batting_team,bowling_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [15]:
# Build a per-innings training table from the ball-by-ball frame `df`, fit a
# win/loss classifier on it, and persist the model together with the label
# encoders that are needed to encode inputs at prediction time.

FEATURE_COLUMNS = ["batting_team", "bowling_team", "venue", "total"]

# Feature subset kept under the name `data`; nothing below in this cell reads it.
data = df[FEATURE_COLUMNS].copy()

# --- Target ---------------------------------------------------------------
# Heuristic label: within a match, the batting side with the highest innings
# total is marked as the winner (won = 1); every other side gets won = 0.
# A match is identified here by its (venue, date) pair.
# NOTE(review): the CSV also has a `mid` column that looks like a match id and
# would keep two same-day fixtures at one venue apart — confirm and switch.
# NOTE(review): the heuristic is only meaningful when both innings of a match
# are present in `df` — confirm against the data set.
df["match_id"] = df.groupby(["venue", "date"]).ngroup()

# In the rows shown by df.head(), `total` carries the same value on every
# delivery of the innings. Summing it would therefore scale the score by the
# number of rows in the innings, so the innings score is taken with max().
match_totals = (
    df.groupby(["match_id", "batting_team"])["total"].max().reset_index()
)

# idxmax picks the first row with the highest total per match (ties -> first).
# assign() returns a new frame instead of writing into a slice of match_totals.
winners = match_totals.loc[
    match_totals.groupby("match_id")["total"].idxmax()
].assign(won=1)

# Attach the label to every delivery row; sides that did not win get NaN here.
final = df.merge(
    winners[["match_id", "batting_team", "won"]],
    on=["match_id", "batting_team"],
    how="left",
)
# Default only the label column to 0; missing values in other columns are left
# untouched rather than being silently rewritten to 0.
final["won"] = final["won"].fillna(0)

# Collapse the per-delivery rows: one row per distinct
# (batting_team, bowling_team, venue, total, won) combination.
final_data = final[FEATURE_COLUMNS + ["won"]].drop_duplicates().copy()

# --- Encoding -------------------------------------------------------------
# One encoder per categorical column so each can be saved and reused on its own.
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()

final_data["batting_team"] = le1.fit_transform(final_data["batting_team"])
final_data["bowling_team"] = le2.fit_transform(final_data["bowling_team"])
final_data["venue"] = le3.fit_transform(final_data["venue"])

# --- Train / evaluate -----------------------------------------------------
X = final_data[FEATURE_COLUMNS]
y = final_data["won"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seeded so that re-running the notebook reproduces the same saved model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report accuracy on the held-out 20% so the split is actually used.
print(f"Hold-out accuracy: {model.score(X_test, y_test):.3f}")

# --- Persist --------------------------------------------------------------
# Create the 'models' folder if it is not already present.
os.makedirs("models", exist_ok=True)

# Save model & encoders
joblib.dump(model, "models/ipl_model.pkl")
joblib.dump(le1, "models/le_batting.pkl")
joblib.dump(le2, "models/le_bowling.pkl")
joblib.dump(le3, "models/le_venue.pkl")








['models/le_venue.pkl']