In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.cluster import KMeans
import datetime
from scipy import stats
import re





In [2]:

# Suppress warnings
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('fivethirtyeight')
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['axes.grid'] = True


In [3]:

# Part 1: Data Loading and Initial Exploration
print("Part 1: Data Loading and Initial Exploration")
print("="*80)

# Load all datasets
matches_df = pd.read_csv('matches.csv')
deliveries_df = pd.read_csv('deliveries.csv')
fixtures_df = pd.read_csv('fixtures_cleaned.csv')
players_df = pd.read_csv('playersinteams.csv')
alldata_df = pd.read_csv('alldata.csv')

# Display dataset shapes
print(f"Matches Dataset: {matches_df.shape}")
print(f"Deliveries Dataset: {deliveries_df.shape}")
print(f"Fixtures Dataset: {fixtures_df.shape}")
print(f"Players Dataset: {players_df.shape}")
print(f"All Player Data: {alldata_df.shape}")

# Initial exploration of matches data
print("\nMatches Dataset Preview:")
print(matches_df.head())
print("\nMatches Dataset Info:")
print(matches_df.info())
print("\nMatches Dataset - Missing Values:")
print(matches_df.isnull().sum())

# Initial exploration of deliveries data
print("\nDeliveries Dataset Preview:")
print(deliveries_df.head())
print("\nDeliveries Dataset - Missing Values:")
print(deliveries_df.isnull().sum())

# Initial exploration of fixtures data
print("\nFixtures Dataset Preview:")
print(fixtures_df.head())
print("\nFixtures Dataset - Missing Values:")
print(fixtures_df.isnull().sum())

# Initial exploration of players data
print("\nPlayers Dataset Preview:")
print(players_df.head())
print("\nUnique Teams in 2025:")
print(players_df['Team'].unique())

# Initial exploration of player statistics
print("\nPlayer Statistics Preview:")
print(alldata_df.head())
print("\nPlayer Statistics - Missing Values:")
print(alldata_df.isnull().sum())


Part 1: Data Loading and Initial Exploration
Matches Dataset: (1095, 20)
Deliveries Dataset: (260920, 17)
Fixtures Dataset: (74, 8)
Players Dataset: (228, 3)
All Player Data: (1008, 25)

Matches Dataset Preview:
       id   season        city        date match_type player_of_match  \
0  335982  2007/08   Bangalore  2008-04-18     League     BB McCullum   
1  335983  2007/08  Chandigarh  2008-04-19     League      MEK Hussey   
2  335984  2007/08       Delhi  2008-04-19     League     MF Maharoof   
3  335985  2007/08      Mumbai  2008-04-20     League      MV Boucher   
4  335986  2007/08     Kolkata  2008-04-20     League       DJ Hussey   

                                        venue                        team1  \
0                       M Chinnaswamy Stadium  Royal Challengers Bangalore   
1  Punjab Cricket Association Stadium, Mohali              Kings XI Punjab   
2                            Feroz Shah Kotla             Delhi Daredevils   
3                            Wankhede

In [4]:

# Part 2: Data Cleaning and Preprocessing
print("\nPart 2: Data Cleaning and Preprocessing")
print("="*80)

# Clean matches dataframe
def clean_matches_df(df):
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # Convert date to datetime
    df_clean['date'] = pd.to_datetime(df_clean['date'])
    
    # Extract year from date
    df_clean['season'] = df_clean['date'].dt.year
    
    # Handle team name changes
    team_name_mapping = {
        'Delhi Daredevils': 'Delhi Capitals',
        'Deccan Chargers': 'Sunrisers Hyderabad',
        'Rising Pune Supergiants': 'Rising Pune Supergiant',
        'Kings XI Punjab': 'Punjab Kings'
    }
    
    team_columns = ['team1', 'team2', 'winner', 'toss_winner']
    for col in team_columns:
        df_clean[col] = df_clean[col].replace(team_name_mapping)
    
    # Fill missing values for winner with 'No Result'
    df_clean['winner'].fillna('No Result', inplace=True)
    
    # Fill missing values
    df_clean['city'].fillna('Unknown', inplace=True)
    df_clean['player_of_match'].fillna('None', inplace=True)
    
    return df_clean

# Clean deliveries dataframe
def clean_deliveries_df(df):
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # Handle team name changes
    team_name_mapping = {
        'Delhi Daredevils': 'Delhi Capitals',
        'Deccan Chargers': 'Sunrisers Hyderabad',
        'Rising Pune Supergiants': 'Rising Pune Supergiant',
        'Kings XI Punjab': 'Punjab Kings'
    }
    
    team_columns = ['batting_team', 'bowling_team']
    for col in team_columns:
        df_clean[col] = df_clean[col].replace(team_name_mapping)
    
    # Fill missing values
    df_clean['extras_type'].fillna('None', inplace=True)
    df_clean['player_dismissed'].fillna('None', inplace=True)
    df_clean['dismissal_kind'].fillna('None', inplace=True)
    df_clean['fielder'].fillna('None', inplace=True)
    
    return df_clean

# Clean fixtures dataframe
def clean_fixtures_df(df):
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # Convert date to datetime
    df_clean['Date'] = pd.to_datetime(df_clean['Date'])
    
    # Format the Start time
    df_clean['Start'] = pd.to_datetime(df_clean['Start'], format='%I:%M %p').dt.time
    
    return df_clean

# Clean player statistics dataframe
def clean_alldata_df(df):
    # Create a copy to avoid modifying original
    df_clean = df.copy()
    
    # Convert numeric columns from string to float
    numeric_columns = [
        'Year', 'Matches_Batted', 'Not_Outs', 'Runs_Scored', 'Batting_Average',
        'Balls_Faced', 'Batting_Strike_Rate', 'Centuries', 'Half_Centuries',
        'Fours', 'Sixes', 'Catches_Taken', 'Stumpings', 'Matches_Bowled',
        'Balls_Bowled', 'Runs_Conceded', 'Wickets_Taken', 'Bowling_Average',
        'Economy_Rate', 'Bowling_Strike_Rate', 'Four_Wicket_Hauls', 'Five_Wicket_Hauls'
    ]
    
    for col in numeric_columns:
        try:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        except:
            print(f"Error converting {col} to numeric")
    
    # Fill missing values with appropriate defaults
    df_clean[numeric_columns] = df_clean[numeric_columns].fillna(0)
    df_clean['Player_Name'].fillna('Unknown', inplace=True)
    df_clean['Highest_Score'].fillna('0', inplace=True)
    df_clean['Best_Bowling_Match'].fillna('0/0', inplace=True)
    
    return df_clean

# Execute cleaning functions
matches_df_clean = clean_matches_df(matches_df)
deliveries_df_clean = clean_deliveries_df(deliveries_df)
fixtures_df_clean = clean_fixtures_df(fixtures_df)
alldata_df_clean = clean_alldata_df(alldata_df)

# Check for remaining missing values
print("\nRemaining Missing Values After Cleaning:")
print(f"Matches Dataset: {matches_df_clean.isnull().sum().sum()}")
print(f"Deliveries Dataset: {deliveries_df_clean.isnull().sum().sum()}")
print(f"Fixtures Dataset: {fixtures_df_clean.isnull().sum().sum()}")
print(f"Player Statistics: {alldata_df_clean.isnull().sum().sum()}")

# Get unique teams
teams = set(list(matches_df_clean['team1'].unique()) + 
           list(matches_df_clean['team2'].unique()))
teams = [team for team in teams if pd.notnull(team) and team != 'No Result']
print(f"\nTotal unique teams: {len(teams)}")
print(teams)

# Identify IPL 2025 teams
ipl_2025_teams = players_df['Team'].unique()
print(f"\nIPL 2025 Teams: {len(ipl_2025_teams)}")
print(ipl_2025_teams)



Part 2: Data Cleaning and Preprocessing

Remaining Missing Values After Cleaning:
Matches Dataset: 1099
Deliveries Dataset: 0
Fixtures Dataset: 0
Player Statistics: 0

Total unique teams: 15
['Pune Warriors', 'Royal Challengers Bengaluru', 'Punjab Kings', 'Rajasthan Royals', 'Lucknow Super Giants', 'Kolkata Knight Riders', 'Chennai Super Kings', 'Gujarat Titans', 'Royal Challengers Bangalore', 'Kochi Tuskers Kerala', 'Sunrisers Hyderabad', 'Delhi Capitals', 'Mumbai Indians', 'Rising Pune Supergiant', 'Gujarat Lions']

IPL 2025 Teams: 10
['CSK' 'DC' 'GT' 'KKR' 'LSG' 'MI' 'PBKS' 'RCB' 'RR' 'SRH']


In [5]:

# Part 3: Exploratory Data Analysis (EDA)
print("\nPart 3: Exploratory Data Analysis (EDA)")
print("="*80)



Part 3: Exploratory Data Analysis (EDA)


In [6]:

# 3.1 Team Performance Analysis
print("\n3.1 Team Performance Analysis")
print("-"*50)



3.1 Team Performance Analysis
--------------------------------------------------


In [7]:

def calculate_team_stats(matches_df, deliveries_df):
    """Calculate comprehensive team statistics."""
    # Get unique teams
    teams = set(list(matches_df['team1'].unique()) + 
               list(matches_df['team2'].unique()))
    teams = [team for team in teams if pd.notnull(team) and team != 'No Result']
    
    # Initialize dataframe to store team statistics
    team_stats = pd.DataFrame(index=teams)
    
    # Calculate matches played by each team
    team_stats['Matches_Played'] = 0
    for team in teams:
        team_stats.loc[team, 'Matches_Played'] = len(matches_df[(matches_df['team1'] == team) | 
                                                               (matches_df['team2'] == team)])
    
    # Calculate matches won by each team
    team_stats['Matches_Won'] = 0
    for team in teams:
        team_stats.loc[team, 'Matches_Won'] = len(matches_df[matches_df['winner'] == team])
    
    # Calculate winning percentage
    team_stats['Winning_Percentage'] = (team_stats['Matches_Won'] / team_stats['Matches_Played'] * 100).round(2)
    
    # Calculate total runs scored by each team
    team_stats['Total_Runs_Scored'] = 0
    for team in teams:
        team_stats.loc[team, 'Total_Runs_Scored'] = deliveries_df[deliveries_df['batting_team'] == team]['total_runs'].sum()
    
    # Calculate total runs conceded by each team
    team_stats['Total_Runs_Conceded'] = 0
    for team in teams:
        team_stats.loc[team, 'Total_Runs_Conceded'] = deliveries_df[deliveries_df['bowling_team'] == team]['total_runs'].sum()
    
    # Calculate total balls faced and bowled
    team_stats['Total_Balls_Faced'] = 0
    team_stats['Total_Balls_Bowled'] = 0
    for team in teams:
        team_stats.loc[team, 'Total_Balls_Faced'] = len(deliveries_df[deliveries_df['batting_team'] == team])
        team_stats.loc[team, 'Total_Balls_Bowled'] = len(deliveries_df[deliveries_df['bowling_team'] == team])
    
    # Calculate run rate and economy rate
    team_stats['Run_Rate'] = (team_stats['Total_Runs_Scored'] / (team_stats['Total_Balls_Faced'] / 6)).round(2)
    team_stats['Economy_Rate'] = (team_stats['Total_Runs_Conceded'] / (team_stats['Total_Balls_Bowled'] / 6)).round(2)
    
    # Calculate highest and lowest scores
    team_stats['Highest_Score'] = 0
    team_stats['Lowest_Score'] = float('inf')
    
    for team in teams:
        team_innings = deliveries_df[deliveries_df['batting_team'] == team].groupby(['match_id', 'inning'])['total_runs'].sum()
        if not team_innings.empty:
            team_stats.loc[team, 'Highest_Score'] = team_innings.max()
            team_stats.loc[team, 'Lowest_Score'] = team_innings.min()
    
    # Replace inf with 0
    team_stats['Lowest_Score'] = team_stats['Lowest_Score'].replace(float('inf'), 0)
    
    # Calculate total 4s and 6s
    team_stats['Total_4s'] = 0
    team_stats['Total_6s'] = 0
    for team in teams:
        team_stats.loc[team, 'Total_4s'] = len(deliveries_df[(deliveries_df['batting_team'] == team) & 
                                                            (deliveries_df['batsman_runs'] == 4)])
        team_stats.loc[team, 'Total_6s'] = len(deliveries_df[(deliveries_df['batting_team'] == team) & 
                                                           (deliveries_df['batsman_runs'] == 6)])
    
    # Calculate powerplay (overs 1-6) and death overs (overs 16-20) stats
    team_stats['Avg_Powerplay_Score'] = 0
    team_stats['Avg_Death_Overs_Score'] = 0
    
    for team in teams:
        # Powerplay analysis (overs 1-6)
        powerplay_df = deliveries_df[(deliveries_df['batting_team'] == team) & (deliveries_df['over'] < 6)]
        if not powerplay_df.empty:
            powerplay_scores = powerplay_df.groupby(['match_id', 'inning'])['total_runs'].sum()
            team_stats.loc[team, 'Avg_Powerplay_Score'] = round(powerplay_scores.mean(), 2)
        
        # Death overs analysis (overs 16-20)
        death_overs_df = deliveries_df[(deliveries_df['batting_team'] == team) & (deliveries_df['over'] >= 16)]
        if not death_overs_df.empty:
            death_overs_scores = death_overs_df.groupby(['match_id', 'inning'])['total_runs'].sum()
            team_stats.loc[team, 'Avg_Death_Overs_Score'] = round(death_overs_scores.mean(), 2)
    
    # Calculate winning percentage in last 3 seasons
    recent_seasons = sorted(matches_df['season'].unique())[-3:]
    team_stats['Recent_Win_Rate'] = 0
    
    for team in teams:
        recent_matches = matches_df[(matches_df['season'].isin(recent_seasons)) & 
                                   ((matches_df['team1'] == team) | (matches_df['team2'] == team))]
        recent_wins = len(recent_matches[recent_matches['winner'] == team])
        recent_total = len(recent_matches)
        if recent_total > 0:
            team_stats.loc[team, 'Recent_Win_Rate'] = round(recent_wins / recent_total * 100, 2)
    
    return team_stats

# Calculate team statistics
team_stats = calculate_team_stats(matches_df_clean, deliveries_df_clean)

# Display team statistics for active teams in IPL 2025
active_team_stats = team_stats.loc[team_stats.index.isin(ipl_2025_teams)].copy()
active_team_stats = active_team_stats.sort_values(by='Winning_Percentage', ascending=False)
print("\nTeam Performance Statistics (IPL 2025 Teams):")
print(active_team_stats[['Matches_Played', 'Matches_Won', 'Winning_Percentage', 
                        'Run_Rate', 'Economy_Rate', 'Highest_Score', 'Lowest_Score']])


KeyboardInterrupt: 

In [None]:

# Plot matches played and winning percentages
plt.figure(figsize=(14, 8))
bar_width = 0.4
x = np.arange(len(active_team_stats.index))

plt.bar(x - bar_width/2, active_team_stats['Matches_Played'], width=bar_width, 
        color='skyblue', label='Matches Played')
plt.bar(x + bar_width/2, active_team_stats['Winning_Percentage'], width=bar_width, 
        color='orange', label='Winning Percentage')

plt.xlabel('Teams')
plt.ylabel('Count/Percentage')
plt.title('Matches Played vs Winning Percentage by Team')
plt.xticks(x, active_team_stats.index, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('team_matches_winrate.png')
plt.show()


In [None]:

# Plot run rate and economy rate
plt.figure(figsize=(14, 8))
bar_width = 0.4
x = np.arange(len(active_team_stats.index))

plt.bar(x - bar_width/2, active_team_stats['Run_Rate'], width=bar_width, 
        color='green', label='Run Rate')
plt.bar(x + bar_width/2, active_team_stats['Economy_Rate'], width=bar_width, 
        color='red', label='Economy Rate')

plt.xlabel('Teams')
plt.ylabel('Rate')
plt.title('Run Rate vs Economy Rate by Team')
plt.xticks(x, active_team_stats.index, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('team_runrate_economy.png')
plt.show()


In [None]:

# Plot highest and lowest scores
plt.figure(figsize=(14, 8))
bar_width = 0.4
x = np.arange(len(active_team_stats.index))

plt.bar(x - bar_width/2, active_team_stats['Highest_Score'], width=bar_width, 
        color='green', label='Highest Score')
plt.bar(x + bar_width/2, active_team_stats['Lowest_Score'], width=bar_width, 
        color='red', label='Lowest Score')

plt.xlabel('Teams')
plt.ylabel('Score')
plt.title('Highest and Lowest Scores by Team')
plt.xticks(x, active_team_stats.index, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('team_highest_lowest_scores.png')
plt.show()


In [None]:

# Plot total 4s and 6s
plt.figure(figsize=(14, 8))
bar_width = 0.4
x = np.arange(len(active_team_stats.index))

plt.bar(x - bar_width/2, active_team_stats['Total_4s'], width=bar_width, 
        color='blue', label='Total 4s')
plt.bar(x + bar_width/2, active_team_stats['Total_6s'], width=bar_width, 
        color='red', label='Total 6s')

plt.xlabel('Teams')
plt.ylabel('Count')
plt.title('Total 4s and 6s by Team')
plt.xticks(x, active_team_stats.index, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('team_fours_sixes.png')
plt.show()


In [None]:

# Plot average powerplay and death overs score
plt.figure(figsize=(14, 8))
bar_width = 0.4
x = np.arange(len(active_team_stats.index))

plt.bar(x - bar_width/2, active_team_stats['Avg_Powerplay_Score'], width=bar_width, 
        color='green', label='Avg Powerplay Score')
plt.bar(x + bar_width/2, active_team_stats['Avg_Death_Overs_Score'], width=bar_width, 
        color='purple', label='Avg Death Overs Score')

plt.xlabel('Teams')
plt.ylabel('Score')
plt.title('Average Powerplay and Death Overs Score by Team')
plt.xticks(x, active_team_stats.index, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('team_powerplay_death_overs.png')
plt.show()


In [None]:

# Deep analysis of powerplay performance
def analyze_powerplay(deliveries_df):
    """Analyze team performance during powerplay (first 6 overs)."""
    teams = deliveries_df['batting_team'].unique()
    powerplay_stats = pd.DataFrame(index=teams)
    
    for team in teams:
        # Batting stats during powerplay
        team_powerplay = deliveries_df[(deliveries_df['batting_team'] == team) & 
                                      (deliveries_df['over'] < 6)]
        
        # Calculate average runs, wickets, and run rate in powerplay
        if not team_powerplay.empty:
            powerplay_runs = team_powerplay.groupby(['match_id', 'inning'])['total_runs'].sum()
            powerplay_wickets = team_powerplay[team_powerplay['is_wicket'] == 1].groupby(
                ['match_id', 'inning']).size()
            
            powerplay_stats.loc[team, 'Avg_Runs'] = round(powerplay_runs.mean(), 2)
            powerplay_stats.loc[team, 'Avg_Wickets_Lost'] = round(powerplay_wickets.mean(), 2) if not powerplay_wickets.empty else 0
            powerplay_stats.loc[team, 'Run_Rate'] = round(powerplay_stats.loc[team, 'Avg_Runs'] / 6, 2)
            
            # Calculate boundary percentage
            total_balls = len(team_powerplay)
            boundaries = len(team_powerplay[(team_powerplay['batsman_runs'] == 4) | 
                                           (team_powerplay['batsman_runs'] == 6)])
            powerplay_stats.loc[team, 'Boundary_Percentage'] = round(boundaries / total_balls * 100, 2)
    
    return powerplay_stats

# Calculate powerplay statistics
powerplay_stats = analyze_powerplay(deliveries_df_clean)
active_powerplay_stats = powerplay_stats.loc[powerplay_stats.index.isin(ipl_2025_teams)].sort_values(by='Run_Rate', ascending=False)

print("\nTeam Powerplay Performance (IPL 2025 Teams):")
print(active_powerplay_stats)

# Plot powerplay performance
plt.figure(figsize=(14, 8))
plt.bar(active_powerplay_stats.index, active_powerplay_stats['Run_Rate'], color='green')
plt.xlabel('Teams')
plt.ylabel('Run Rate')
plt.title('Team Run Rate During Powerplay (First 6 Overs)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('team_powerplay_runrate.png')
plt.show()


In [None]:

# 3.2 Player Performance Analysis
print("\n3.2 Player Performance Analysis")
print("-"*50)

# Get the top 20 run-scorers
top_20_run_scorers = alldata_df_clean.nlargest(20, 'Runs_Scored')
print("\nTop 20 Run Scorers in IPL History:")
print(top_20_run_scorers[['Player_Name', 'Runs_Scored', 'Batting_Average', 'Batting_Strike_Rate', 'Centuries', 'Half_Centuries', 'Sixes']])

# Plot batting average vs batting strike rate for top 20 run-scorers
plt.figure(figsize=(14, 10))
plt.scatter(top_20_run_scorers['Batting_Average'], top_20_run_scorers['Batting_Strike_Rate'], 
            s=top_20_run_scorers['Runs_Scored']/50, alpha=0.7)

for i, player in enumerate(top_20_run_scorers['Player_Name']):
    plt.annotate(player, 
                 (top_20_run_scorers['Batting_Average'].iloc[i], 
                  top_20_run_scorers['Batting_Strike_Rate'].iloc[i]),
                 fontsize=8)

plt.xlabel('Batting Average')
plt.ylabel('Batting Strike Rate')
plt.title('Batting Average vs Strike Rate for Top 20 Run Scorers')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('top_batsmen_avg_sr.png')
plt.show()


In [None]:

# Find players with highest average and strike rate (with minimum matches threshold)
min_matches = 50  # Start with 50 matches threshold
players_min_matches = alldata_df_clean[alldata_df_clean['Matches_Batted'] >= min_matches].copy()

# Check if the filtered DataFrame is empty
if players_min_matches.empty:
    print(f"No players found with at least {min_matches} matches.")
    # Reduce the threshold if needed
    min_matches = 30
    print(f"Using {min_matches} matches as the threshold instead.")
    players_min_matches = alldata_df_clean[alldata_df_clean['Matches_Batted'] >= min_matches].copy()
    
    # If still empty, use all players
    if players_min_matches.empty:
        print("Using all players regardless of matches played.")
        players_min_matches = alldata_df_clean[alldata_df_clean['Matches_Batted'] > 0].copy()

# Now find the highest average and strike rate
highest_avg_player = players_min_matches.loc[players_min_matches['Batting_Average'].idxmax()]
highest_sr_player = players_min_matches.loc[players_min_matches['Batting_Strike_Rate'].idxmax()]

print(f"\nPlayer with Highest Batting Average (>={min_matches} matches):")
print(f"{highest_avg_player['Player_Name']} - {highest_avg_player['Batting_Average']} avg in {highest_avg_player['Matches_Batted']} matches")

print(f"\nPlayer with Highest Strike Rate (>={min_matches} matches):")
print(f"{highest_sr_player['Player_Name']} - {highest_sr_player['Batting_Strike_Rate']} SR in {highest_sr_player['Matches_Batted']} matches")

# If still empty, use all players
if players_min_matches.empty:
    print("Using all players regardless of matches played.")
    players_min_matches = alldata_df_clean[alldata_df_clean['Matches_Batted'] > 0].copy()

# Now find the highest average and strike rate
highest_avg_player = players_min_matches.loc[players_min_matches['Batting_Average'].idxmax()]
highest_sr_player = players_min_matches.loc[players_min_matches['Batting_Strike_Rate'].idxmax()]

print("\nPlayer with Highest Batting Average (>50 matches):")
print(f"{highest_avg_player['Player_Name']} - {highest_avg_player['Batting_Average']} avg in {highest_avg_player['Matches_Batted']} matches")

print("\nPlayer with Highest Strike Rate (>50 matches):")
print(f"{highest_sr_player['Player_Name']} - {highest_sr_player['Batting_Strike_Rate']} SR in {highest_sr_player['Matches_Batted']} matches")

# Get top wicket-takers
top_20_wicket_takers = alldata_df_clean.nlargest(20, 'Wickets_Taken')
print("\nTop 20 Wicket Takers in IPL History:")
print(top_20_wicket_takers[['Player_Name', 'Wickets_Taken', 'Bowling_Average', 'Economy_Rate', 'Bowling_Strike_Rate']])

# Plot top wicket-takers
plt.figure(figsize=(14, 8))
plt.bar(top_20_wicket_takers['Player_Name'], top_20_wicket_takers['Wickets_Taken'], color='purple')
plt.xlabel('Players')
plt.ylabel('Wickets Taken')
plt.title('Top 20 Wicket Takers in IPL History')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_wicket_takers.png')
plt.show()


In [None]:

# Get highest individual scores
# Function to extract numeric value from highest score (removing not out indicator *)
def extract_score(score_str):
    if pd.isna(score_str):
        return 0
    return int(score_str.replace('*', ''))

alldata_df_clean['Highest_Score_Numeric'] = alldata_df_clean['Highest_Score']

# Fix the extract_score function for highest score
def extract_score(score_str):
    if pd.isna(score_str):
        return 0
    try:
        return int(str(score_str).replace('*', ''))
    except:
        return 0

alldata_df_clean['Highest_Score_Numeric'] = alldata_df_clean['Highest_Score'].apply(extract_score)
top_20_highest_scores = alldata_df_clean.nlargest(20, 'Highest_Score_Numeric')

print("\nPlayers with Highest Individual Scores in IPL:")
print(top_20_highest_scores[['Player_Name', 'Highest_Score', 'Batting_Average', 'Batting_Strike_Rate']])

# Plot top individual highest scores
plt.figure(figsize=(14, 8))
plt.bar(top_20_highest_scores['Player_Name'], top_20_highest_scores['Highest_Score_Numeric'], color='orange')
plt.xlabel('Players')
plt.ylabel('Highest Score')
plt.title('Top 20 Highest Individual Scores in IPL')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_individual_scores.png')
plt.show()

In [None]:

# Man of the Match Count Analysis
mom_counts = matches_df_clean['player_of_match'].value_counts().reset_index()
mom_counts.columns = ['Player', 'MoM_Count']
top_20_mom = mom_counts.head(20)

print("\nPlayers with Most Man of the Match Awards:")
print(top_20_mom)

plt.figure(figsize=(14, 8))
plt.bar(top_20_mom['Player'], top_20_mom['MoM_Count'], color='green')
plt.xlabel('Players')
plt.ylabel('Number of MoM Awards')
plt.title('Top 20 Players with Most Man of the Match Awards')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_mom_awards.png')
plt.show()


In [None]:

# K-Means Clustering for player role identification
def cluster_players_by_role(df):
    """Use K-Means clustering to identify player roles."""
    # Select players with both batting and bowling stats
    player_roles = df[(df['Matches_Batted'] > 10) & (df['Matches_Bowled'] > 10)].copy()
    
    # Select features for clustering
    features = ['Batting_Average', 'Economy_Rate']
    X = player_roles[features].copy()
    
    # Handle NaN values
    X = X.fillna(0)
    
    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    player_roles['Cluster'] = kmeans.fit_predict(X_scaled)
    
    # Map cluster to role based on centroid values
    centroids = kmeans.cluster_centers_
    
    # Sort centroids by batting average (higher = more batting oriented)
    centroid_roles = {
        np.argmax(centroids[:, 0]): 'Batsman',
        np.argmin(centroids[:, 0]): 'Bowler'
    }
    
    # Remaining cluster is all-rounder
    for i in range(3):
        if i not in centroid_roles:
            centroid_roles[i] = 'All-Rounder'
    
    # Map cluster to role
    player_roles['Role'] = player_roles['Cluster'].map(centroid_roles)
    
    return player_roles, centroids, centroid_roles

# Apply clustering to identify player roles
player_roles, centroids, centroid_roles = cluster_players_by_role(alldata_df_clean)

print("\nPlayer Role Identification using K-Means Clustering:")
print(f"Cluster Centroids: {centroids}")
print(f"Cluster to Role Mapping: {centroid_roles}")
print("\nSample Player Role Distribution:")
print(player_roles['Role'].value_counts())


In [None]:

# Plot the clusters
plt.figure(figsize=(12, 10))
colors = ['red', 'green', 'blue']
roles = ['Batsman', 'Bowler', 'All-Rounder']

for i, role in enumerate(roles):
    cluster = player_roles[player_roles['Role'] == role]
    plt.scatter(cluster['Batting_Average'], cluster['Economy_Rate'],
                c=colors[i], label=role, alpha=0.6)

# Plot centroids
for i, role in enumerate(centroid_roles.values()):
    plt.scatter(centroids[i, 0], centroids[i, 1], c='black', marker='X', s=100,
                label=f"{role} Centroid")

plt.xlabel('Batting Average')
plt.ylabel('Economy Rate')
plt.title('K-Means Clustering of Players by Role')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('player_role_clustering.png')
plt.show()


In [None]:

# Identify Top Batsmen by Run Category
def identify_top_batsmen_by_category(deliveries_df, category, n=10):
    """Identify top batsmen in different run categories."""
    if category == 'sixes':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 6]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Sixes"
        col_name = "Sixes"
    elif category == 'fours':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 4]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Fours"
        col_name = "Fours"
    elif category == 'twos':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 2]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Twos"
        col_name = "Twos"
    elif category == 'singles':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 1]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Singles"
        col_name = "Singles"
    else:
        return None, ""
    
    batsmen_runs.columns = ['Batsman', col_name]
    return batsmen_runs.head(n), title, col_name

# Get top batsmen in each run category
top_sixes, sixes_title, sixes_col = identify_top_batsmen_by_category(deliveries_df_clean, 'sixes')
top_fours, fours_title, fours_col = identify_top_batsmen_by_category(deliveries_df_clean, 'fours')
top_twos, twos_title, twos_col = identify_top_batsmen_by_category(deliveries_df_clean, 'twos')
top_singles, singles_title, singles_col = identify_top_batsmen_by_category(deliveries_df_clean, 'singles')

print("\nTop 10 Batsmen with Most Sixes:")
print(top_sixes)

print("\nTop 10 Batsmen with Most Fours:")
print(top_fours)

print("\nTop 10 Batsmen with Most Twos:")
print(top_twos)

print("\nTop 10 Batsmen with Most Singles:")
print(top_singles)


In [None]:

# Plot top batsmen by run category
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot top sixes hitters
axes[0, 0].bar(top_sixes['Batsman'], top_sixes[sixes_col], color='red')
axes[0, 0].set_title(sixes_title)
axes[0, 0].set_xlabel('Batsmen')
axes[0, 0].set_ylabel('Number of Sixes')
axes[0, 0].tick_params(axis='x', rotation=45)

# Plot top fours hitters
axes[0, 1].bar(top_fours['Batsman'], top_fours[fours_col], color='blue')
axes[0, 1].set_title(fours_title)
axes[0, 1].set_xlabel('Batsmen')
axes[0, 1].set_ylabel('Number of Fours')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot top twos hitters
axes[1, 0].bar(top_twos['Batsman'], top_twos[twos_col], color='green')
axes[1, 0].set_title(twos_title)
axes[1, 0].set_xlabel('Batsmen')
axes[1, 0].set_ylabel('Number of Twos')
axes[1, 0].tick_params(axis='x', rotation=45)

# Plot top singles hitters
axes[1, 1].bar(top_singles['Batsman'], top_singles[singles_col], color='purple')
axes[1, 1].set_title(singles_title)
axes[1, 1].set_xlabel('Batsmen')
axes[1, 1].set_ylabel('Number of Singles')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('top_batsmen_by_run_category.png')
plt.show()


In [None]:

# 3.3 Seasonal Analysis
print("\n3.3 Seasonal Analysis")
print("-"*50)

# Calculate average runs per match per season
def calculate_avg_runs_per_season(matches_df, deliveries_df):
    """Calculate average runs per match per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store season statistics
    season_stats = pd.DataFrame(index=seasons)
    
    # Calculate average runs per match per season
    season_stats['Total_Matches'] = 0
    season_stats['Total_Runs'] = 0
    season_stats['Avg_Runs_Per_Match'] = 0
    season_stats['Targets_200_Plus'] = 0
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        season_stats.loc[season, 'Total_Matches'] = len(season_matches)
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        season_stats.loc[season, 'Total_Runs'] = season_deliveries['total_runs'].sum()
        
        # Calculate average runs per match
        if len(season_matches) > 0:
            season_stats.loc[season, 'Avg_Runs_Per_Match'] = (season_stats.loc[season, 'Total_Runs'] / 
                                                             (season_stats.loc[season, 'Total_Matches'] * 2)).round(2)
        
        # Calculate targets of 200+ runs
        innings_totals = season_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
        season_stats.loc[season, 'Targets_200_Plus'] = len(innings_totals[innings_totals >= 200])
    
    return season_stats

# Calculate seasonal statistics
season_stats = calculate_avg_runs_per_season(matches_df_clean, deliveries_df_clean)
print("\nSeasonal Statistics:")
print(season_stats)


In [None]:

# Plot average runs per match per season
plt.figure(figsize=(14, 8))
plt.plot(season_stats.index, season_stats['Avg_Runs_Per_Match'], marker='o', linestyle='-', color='blue')
plt.xlabel('Season')
plt.ylabel('Average Runs Per Match')
plt.title('Average Runs Per Match by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('avg_runs_per_season.png')
plt.show()


In [None]:

# Plot targets of 200+ runs per season
plt.figure(figsize=(14, 8))
plt.bar(season_stats.index, season_stats['Targets_200_Plus'], color='red')
plt.xlabel('Season')
plt.ylabel('Number of 200+ Totals')
plt.title('Number of 200+ Run Totals by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('200plus_totals_per_season.png')
plt.show()


In [None]:

# Calculate average score of each team per season
def calculate_team_score_per_season(matches_df, deliveries_df):
    """Calculate average score of each team per season."""
    # Get unique seasons and teams
    seasons = sorted(matches_df['season'].unique())
    teams = set(matches_df['team1'].unique()) | set(matches_df['team2'].unique())
    teams = [team for team in teams if pd.notnull(team)]
    
    # Initialize dataframe to store team scores per season
    team_season_scores = {}
    
    for season in seasons:
        team_season_scores[season] = {}
        
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get teams active in this season
        active_teams = set(season_matches['team1'].unique()) | set(season_matches['team2'].unique())
        
        for team in active_teams:
            if pd.isna(team):
                continue
                
            # Get deliveries for this team and season
            team_deliveries = deliveries_df[(deliveries_df['match_id'].isin(match_ids)) & 
                                           (deliveries_df['batting_team'] == team)]
            
            # Calculate team's innings totals
            innings_totals = team_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
            
            # Calculate average score
            if not innings_totals.empty:
                team_season_scores[season][team] = innings_totals.mean().round(2)
            else:
                team_season_scores[season][team] = 0
    
    # Convert to dataframe
    team_season_df = pd.DataFrame(team_season_scores)
    
    return team_season_df

# Calculate team scores per season
team_season_scores = calculate_team_score_per_season(matches_df_clean, deliveries_df_clean)
print("\nAverage Team Scores Per Season:")
print(team_season_scores)


In [None]:

# Find top performers per season (Orange Cap and Purple Cap)
def find_top_performers_per_season(matches_df, deliveries_df):
    """Find the Orange Cap (most runs) and Purple Cap (most wickets) holders per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store top performers
    top_performers = pd.DataFrame(index=seasons)
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find top run-scorer (Orange Cap)
        batsman_runs = season_deliveries.groupby('batter')['batsman_runs'].sum().reset_index()
        if not batsman_runs.empty:
            orange_cap = batsman_runs.sort_values('batsman_runs', ascending=False).iloc[0]
            top_performers.loc[season, 'Orange_Cap_Holder'] = orange_cap['batter']
            top_performers.loc[season, 'Orange_Cap_Runs'] = orange_cap['batsman_runs']
        
        # Find top wicket-taker (Purple Cap)
        bowler_wickets = season_deliveries[season_deliveries['is_wicket'] == 1].groupby('bowler').size().reset_index()
        if not bowler_wickets.empty:
            bowler_wickets.columns = ['bowler', 'wickets']
            purple_cap = bowler_wickets.sort_values('wickets', ascending=False).iloc[0]
            top_performers.loc[season, 'Purple_Cap_Holder'] = purple_cap['bowler']
            top_performers.loc[season, 'Purple_Cap_Wickets'] = purple_cap['wickets']
    
    return top_performers

# Find top performers per season
top_performers = find_top_performers_per_season(matches_df_clean, deliveries_df_clean)
print("\nOrange Cap (Top Run-scorer) and Purple Cap (Top Wicket-taker) Per Season:")
print(top_performers)


In [None]:

# Plot Orange Cap runs per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Orange_Cap_Runs'], color='orange')
plt.xlabel('Season')
plt.ylabel('Runs')
plt.title('Orange Cap Holder Runs Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Orange_Cap_Runs']):
    plt.text(i, v + 10, top_performers['Orange_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('orange_cap_runs.png')
plt.show()


In [None]:

# Plot Purple Cap wickets per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Purple_Cap_Wickets'], color='purple')
plt.xlabel('Season')
plt.ylabel('Wickets')
plt.title('Purple Cap Holder Wickets Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Purple_Cap_Wickets']):
    plt.text(i, v + 0.5, top_performers['Purple_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('purple_cap_wickets.png')
plt.show()


In [None]:

# Find top 10 bowlers per season
def find_top_bowlers_per_season(matches_df, deliveries_df, last_n_seasons=3):
    """Find the top 10 bowlers per season based on wickets taken."""
    # Get the last n seasons
    seasons = sorted(matches_df['season'].unique())[-last_n_seasons:]
    
    # Store data for all seasons
    all_season_bowlers = {}
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find wicket-taking bowlers
        wicket_deliveries = season_deliveries[season_deliveries['is_wicket'] == 1]
        bowler_wickets = wicket_deliveries.groupby('bowler').size().reset_index()
        bowler_wickets.columns = ['Bowler', 'Wickets']
        
        # Calculate economy rate
        bowler_runs = season_deliveries.groupby('bowler')['total_runs'].sum().reset_index()
        bowler_runs.columns = ['Bowler', 'Runs']
        
        bowler_balls = season_deliveries.groupby('bowler').size().reset_index()
        bowler_balls.columns = ['Bowler', 'Balls']
        
        # Merge wickets and economy data
        bowler_stats = pd.merge(bowler_wickets, bowler_runs, on='Bowler')
        bowler_stats = pd.merge(bowler_stats, bowler_balls, on='Bowler')
        
        # Calculate economy rate
        bowler_stats['Economy'] = (bowler_stats['Runs'] / (bowler_stats['Balls'] / 6)).round(2)
        
        # Calculate bowling average
        bowler_stats['Average'] = (bowler_stats['Runs'] / bowler_stats['Wickets']).round(2)
        
        # Calculate strike rate
        bowler_stats['Strike_Rate'] = (bowler_stats['Balls'] / bowler_stats['Wickets']).round(2)
        
        # Get top 10 bowlers
        top_10_bowlers = bowler_stats.sort_values('Wickets', ascending=False).head(10)
        
        # Store for this season
        all_season_bowlers[season] = top_10_bowlers
    
    return all_season_bowlers

# Find top bowlers for recent seasons
top_bowlers_per_season = find_top_bowlers_per_season(matches_df_clean, deliveries_df_clean)

# Print top bowlers for the most recent season
latest_season = max(top_bowlers_per_season.keys())
print(f"\nTop 10 Bowlers in {latest_season} Season:")
print(top_bowlers_per_season[latest_season][['Bowler', 'Wickets', 'Economy', 'Average', 'Strike_Rate']])


In [None]:

# Part 4: Feature Extraction
print("\nPart 4: Feature Extraction")
print("="*80)

def extract_features(matches_df, deliveries_df):
    """Extract features for machine learning models."""
    features_df = matches_df.copy()
    
    # Add team strength features
    # Calculate win percentage for each team
    teams = set(list(matches_df['team1'].unique()) + 
               list(matches_df['team2'].unique()))
    teams = [team for team in teams if pd.notnull(team)]
    
    team_win_pct = {}
    team_toss_win_pct = {}
    
    for team in teams:
        # Calculate overall win percentage
        team_matches = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_wins = matches_df[matches_df['winner'] == team]
        
        if len(team_matches) > 0:
            team_win_pct[team] = len(team_wins) / len(team_matches)
        else:
            team_win_pct[team] = 0
        
        # Calculate toss win percentage
        team_tosses = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_toss_wins = matches_df[matches_df['toss_winner'] == team]
        
        if len(team_tosses) > 0:
            team_toss_win_pct[team] = len(team_toss_wins) / len(team_tosses)
        else:
            team_toss_win_pct[team] = 0
    
    # Add team win percentage as feature
    features_df['team1_win_pct'] = features_df['team1'].map(team_win_pct)
    features_df['team2_win_pct'] = features_df['team2'].map(team_win_pct)
    
    # Add toss win percentage as feature
    features_df['team1_toss_win_pct'] = features_df['team1'].map(team_toss_win_pct)
    features_df['team2_toss_win_pct'] = features_df['team2'].map(team_toss_win_pct)
    
    # Add venue advantage feature
    venue_team_wins = {}
    
    for venue in features_df['venue'].unique():
        venue_matches = features_df[features_df['venue'] == venue]
        
        for team in teams:
            team_venue_matches = venue_matches[(venue_matches['team1'] == team) | 
                                              (venue_matches['team2'] == team)]
            team_venue_wins = venue_matches[venue_matches['winner'] == team]
            
            if len(team_venue_matches) > 0:
                venue_team_wins[(venue, team)] = len(team_venue_wins) / len(team_venue_matches)
            else:
                venue_team_wins[(venue, team)] = 0
    
    # Add venue advantage as feature
    features_df['team1_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team1']), 0), axis=1)
    features_df['team2_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team2']), 0), axis=1)
    
    # Add head-to-head advantage
    head_to_head_wins = {}
    
    for team1 in teams:
        for team2 in teams:
            if team1 != team2:
                h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                                        ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
                team1_wins = h2h_matches[h2h_matches['winner'] == team1]
                
                if len(h2h_matches) > 0:
                    head_to_head_wins[(team1, team2)] = len(team1_wins) / len(h2h_matches)
                else:
                    head_to_head_wins[(team1, team2)] = 0.5  # Neutral if no matches
    
    # Add head-to-head advantage as feature
    features_df['team1_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team1'], row['team2']), 0.5), axis=1)
    features_df['team2_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team2'], row['team1']), 0.5), axis=1)
    
    # Add recent form feature (last 5 matches)
    recent_form = {}
    
    for team in teams:
        team_matches = features_df[(features_df['team1'] == team) | 
                                   (features_df['team2'] == team)].sort_values('date')
        team_results = []
        
        for _, match in team_matches.iterrows():
            if match['winner'] == team:
                team_results.append(1)  # Win
            elif match['winner'] == 'No Result':
                team_results.append(0.5)  # Tie/No Result
            else:
                team_results.append(0)  # Loss
        
        # Calculate rolling average of last n matches
        n_matches = 5
        for i in range(len(team_results)):
            start_idx = max(0, i - n_matches + 1)
            recent_form[(team, i)] = sum(team_results[start_idx:i+1]) / min(n_matches, i+1)
    
    # Add team recent form as feature
    features_df['team1_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team1'], 
                                     len(features_df[(features_df['team1'] == row['team1']) | 
                                                    (features_df['team2'] == row['team1']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    features_df['team2_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team2'], 
                                     len(features_df[(features_df['team1'] == row['team2']) | 
                                                    (features_df['team2'] == row['team2']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    
    # Add toss decision feature
    features_df['toss_decision_bat'] = features_df['toss_decision'].apply(lambda x: 1 if x == 'bat' else 0)
    
    # Add binary target variable (1 if team1 wins, 0 if team2 wins)
    features_df['team1_win'] = features_df.apply(
        lambda row: 1 if row['winner'] == row['team1'] else 0 if row['winner'] == row['team2'] else None, axis=1)
    
    # Drop rows with no clear winner
    features_df = features_df.dropna(subset=['team1_win'])
    
    # Select relevant features for modeling
    selected_features = ['team1_win_pct', 'team2_win_pct', 
                        'team1_toss_win_pct', 'team2_toss_win_pct',
                        'team1_venue_advantage', 'team2_venue_advantage',
                        'team1_h2h_advantage', 'team2_h2h_advantage',
                        'team1_recent_form', 'team2_recent_form',
                        'toss_decision_bat', 'team1_win']
    
    model_data = features_df[selected_features].copy()
    
    return model_data, features_df

# Extract features for modeling
model_data, features_df = extract_features(matches_df_clean, deliveries_df_clean)
print("\nExtracted Features for Modeling:")
print(model_data.head())
print("\nFeature Summary Statistics:")
print(model_data.describe())


In [None]:

# Check for missing values
print("\nMissing Values in Model Data:")
print(model_data.isnull().sum())


In [None]:

# Handle any remaining missing values
model_data = model_data.fillna(0)


In [None]:

# Part 5: Model Building and Evaluation
print("\nPart 5: Model Building and Evaluation")
print("="*80)

# Split data into features and target
X = model_data.drop('team1_win', axis=1)
y = model_data['team1_win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\nRandom Forest Model:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, rf_pred))


In [None]:

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print("\nGradient Boosting Model:")
print(f"Accuracy: {gb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, gb_pred))


In [None]:

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print("\nXGBoost Model:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, xgb_pred))


In [None]:

# Create Ensemble Model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'
)

ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print("\nEnsemble Model:")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, ensemble_pred))


In [None]:

# Build Neural Network Model
def build_nn_model(input_dim):
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_dim=input_dim),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train Neural Network Model
nn_model = build_nn_model(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
nn_pred_prob = nn_model.predict(X_test)
nn_pred = (nn_pred_prob > 0.5).astype(int)
nn_accuracy = accuracy_score(y_test, nn_pred)

print("\nNeural Network Model:")
print(f"Accuracy: {nn_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, nn_pred))


In [None]:

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'RF_Importance': rf_model.feature_importances_,
    'GB_Importance': gb_model.feature_importances_,
    'XGB_Importance': xgb_model.feature_importances_
})

feature_importance['Average_Importance'] = feature_importance[['RF_Importance', 'GB_Importance', 'XGB_Importance']].mean(axis=1)
feature_importance = feature_importance.sort_values('Average_Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


In [None]:

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Average_Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Average across models)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()


In [None]:

# Part 6: 2025 IPL Prediction
print("\nPart 6: 2025 IPL Prediction")
print("="*80)

# Get unique teams in IPL 2025
ipl_2025_teams = players_df['Team'].unique()
print(f"IPL 2025 Teams: {len(ipl_2025_teams)}")
print(ipl_2025_teams)

# Function to prepare prediction data for a match
def prepare_match_prediction_data(team1, team2, venue, features_df, team_stats):
    """Prepare prediction data for a match."""
    # Create prediction data
    pred_data = {}
    
    # Add team strength features
    pred_data['team1_win_pct'] = team_stats.loc[team1, 'Winning_Percentage'] / 100 if team1 in team_stats.index else 0.5
    pred_data['team2_win_pct'] = team_stats.loc[team2, 'Winning_Percentage'] / 100 if team2 in team_stats.index else 0.5
    
    # Add toss win percentage (use average if not available)
    team1_toss_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)]
    team1_toss_wins = features_df[features_df['toss_winner'] == team1]
    
    team2_toss_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)]
    team2_toss_wins = features_df[features_df['toss_winner'] == team2]
    
    pred_data['team1_toss_win_pct'] = len(team1_toss_wins) / len(team1_toss_matches) if len(team1_toss_matches) > 0 else 0.5
    pred_data['team2_toss_win_pct'] = len(team2_toss_wins) / len(team2_toss_matches) if len(team2_toss_matches) > 0 else 0.5
    
    # Add venue advantage (use average if not available)
    team1_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team1) | (features_df['team2'] == team1))]
    team1_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team1)]
    
    team2_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team2) | (features_df['team2'] == team2))]
    team2_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team2)]
    
    pred_data['team1_venue_advantage'] = len(team1_venue_wins) / len(team1_venue_matches) if len(team1_venue_matches) > 0 else 0.5
    pred_data['team2_venue_advantage'] = len(team2_venue_wins) / len(team2_venue_matches) if len(team2_venue_matches) > 0 else 0.5
    
    # Add head-to-head advantage (use 0.5 if not available)
    h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                             ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
    team1_h2h_wins = h2h_matches[h2h_matches['winner'] == team1]
    
    pred_data['team1_h2h_advantage'] = len(team1_h2h_wins) / len(h2h_matches) if len(h2h_matches) > 0 else 0.5
    pred_data['team2_h2h_advantage'] = 1 - pred_data['team1_h2h_advantage']
    
    # Add recent form (use 0.5 if not available)
    team1_recent_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)].tail(5)
    team1_recent_wins = sum(1 for _, match in team1_recent_matches.iterrows() if match['winner'] == team1)
    
    team2_recent_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)].tail(5)
    team2_recent_wins = sum(1 for _, match in team2_recent_matches.iterrows() if match['winner'] == team2)
    
    pred_data['team1_recent_form'] = team1_recent_wins / len(team1_recent_matches) if len(team1_recent_matches) > 0 else 0.5
    pred_data['team2_recent_form'] = team2_recent_wins / len(team2_recent_matches) if len(team2_recent_matches) > 0 else 0.5
    
    # Add toss decision (assume batting first)
    pred_data['toss_decision_bat'] = 1
    
    return pd.DataFrame([pred_data])

# Simulate IPL 2025 tournament
def simulate_ipl_2025(fixtures_df, team_stats, features_df, ensemble_model, nn_model):
    """Simulate IPL 2025 tournament and predict the winner."""
    # Create copy of fixtures
    fixtures = fixtures_df.copy()
    
    # Initialize points table
    points_table = pd.DataFrame(index=ipl_2025_teams)
    points_table['Matches'] = 0
    points_table['Wins'] = 0
    points_table['Losses'] = 0
    points_table['Draws'] = 0
    points_table['Points'] = 0
    points_table['NRR'] = 0  # Net Run Rate (simplified)
    
    # Simulate league phase matches
    match_results = []
    
    for _, match in fixtures.iterrows():
        home_team = match['Home']
        away_team = match['Away']
        venue = match['Venue']
        
        # Prepare prediction data
        pred_data = prepare_match_prediction_data(home_team, away_team, venue, features_df, team_stats)
        
        # Make predictions
        ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
        nn_prob = nn_model.predict(pred_data)[0][0]
        
        # Combine predictions (ensemble gets more weight)
        combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
        
        # Determine winner
        if combined_prob > 0.5:
            winner = home_team
            loser = away_team
            win_prob = combined_prob
        else:
            winner = away_team
            loser = home_team
            win_prob = 1 - combined_prob
        
        # Update points table
        points_table.loc[winner, 'Matches'] += 1
        points_table.loc[winner, 'Wins'] += 1
        points_table.loc[winner, 'Points'] += 2
        
        points_table.loc[loser, 'Matches'] += 1
        points_table.loc[loser, 'Losses'] += 1
        
        # Simulate NRR (simplified)
        points_table.loc[winner, 'NRR'] += np.random.uniform(0.05, 0.2)
        points_table.loc[loser, 'NRR'] -= np.random.uniform(0.05, 0.2)
        
        # Record match result
        match_results.append({
            'Match_No': match['Match No'],
            'Home': home_team,
            'Away': away_team,
            'Winner': winner,
            'Win_Probability': win_prob
        })
    
    # Sort points table
    points_table = points_table.sort_values(['Points', 'NRR'], ascending=False)
    
    # Get top 4 teams for playoffs
    playoff_teams = points_table.head(4).index.tolist()
    
    # Simulate Qualifier 1 (1st vs 2nd)
    team1 = playoff_teams[0]
    team2 = playoff_teams[1]
    pred_data = prepare_match_prediction_data(team1, team2, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q1_winner = team1
        q1_loser = team2
    else:
        q1_winner = team2
        q1_loser = team1
    
    # Simulate Eliminator (3rd vs 4th)
    team3 = playoff_teams[2]
    team4 = playoff_teams[3]
    pred_data = prepare_match_prediction_data(team3, team4, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        eliminator_winner = team3
        eliminator_loser = team4
    else:
        eliminator_winner = team4
        eliminator_loser = team3
    
    # Simulate Qualifier 2 (Q1 loser vs Eliminator winner)
    pred_data = prepare_match_prediction_data(q1_loser, eliminator_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q2_winner = q1_loser
    else:
        q2_winner = eliminator_winner
    
    # Simulate Final (Q1 winner vs Q2 winner)
    pred_data = prepare_match_prediction_data(q1_winner, q2_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        champion = q1_winner
        runner_up = q2_winner
        final_win_prob = combined_prob
    else:
        champion = q2_winner
        runner_up = q1_winner
        final_win_prob = 1 - combined_prob
    
    return points_table, playoff_teams, champion, runner_up, final_win_prob, match_results

# Simulate IPL 2025 tournament multiple times
num_simulations = 100
simulation_results = []

for sim in range(num_simulations):
    points_table, playoff_teams, champion, runner_up, final_win_prob, match_results = simulate_ipl_2025(
        fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)
    
    simulation_results.append({
        'Champion': champion,
        'Runner_Up': runner_up,
        'Final_Win_Prob': final_win_prob,
        'Top_4': playoff_teams
    })

# Count championship wins for each team
champion_counts = {}
for result in simulation_results:
    champion = result['Champion']
    champion_counts[champion] = champion_counts.get(champion, 0) + 1

# Count top 4 appearances for each team
top4_counts = {}
for result in simulation_results:
    for team in result['Top_4']:
        top4_counts[team] = top4_counts.get(team, 0) + 1

# Calculate championship probability
champion_prob = {team: count / num_simulations for team, count in champion_counts.items()}
champion_prob = dict(sorted(champion_prob.items(), key=lambda x: x[1], reverse=True))

# Calculate playoff probability
playoff_prob = {team: count / num_simulations for team, count in top4_counts.items()}
playoff_prob = dict(sorted(playoff_prob.items(), key=lambda x: x[1], reverse=True))


In [None]:

print("\nChampionship Probability:")
for team, prob in champion_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

print("\nPlayoff Probability:")
for team, prob in playoff_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

# Plot championship probability
plt.figure(figsize=(14, 8))
plt.bar(champion_prob.keys(), [prob * 100 for prob in champion_prob.values()], color='gold')
plt.xlabel('Teams')
plt.ylabel('Championship Probability (%)')
plt.title('IPL 2025 Championship Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('championship_probability.png')
plt.close()


In [None]:

# Plot playoff probability
plt.figure(figsize=(14, 8))
plt.bar(playoff_prob.keys(), [prob * 100 for prob in playoff_prob.values()], color='blue')
plt.xlabel('Teams')
plt.ylabel('Playoff Probability (%)')
plt.title('IPL 2025 Playoff Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('playoff_probability.png')
plt.close()


In [None]:

# Predict the IPL 2025 winner
predicted_champion = max(champion_prob.items(), key=lambda x: x[1])[0]
championship_probability = champion_prob[predicted_champion]

print(f"\n🏆 Predicted IPL 2025 Champion: {predicted_champion} with {championship_probability:.2%} probability")

# Output final points table from a single simulation
final_points_table, _, _, _, _, _ = simulate_ipl_2025(
    fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)

print("\nPredicted Final Points Table:")
print(final_points_table[['Matches', 'Wins', 'Losses', 'Points', 'NRR']].sort_values(['Points', 'NRR'], ascending=False))


In [None]:

# Part 7: Player Impact Analysis for 2025
print("\nPart 7: Player Impact Analysis for 2025")
print("="*80)

# Analyze 2025 player impact
def analyze_player_impact(players_df, alldata_df):
    """Analyze player impact for 2025 season."""
    # Merge player data with team
    player_impact = pd.merge(players_df, alldata_df, left_on='Players', right_on='Player_Name', how='left')
    
    # Fill missing values
    numeric_cols = ['Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Bowling_Strike_Rate', 'Bowling_Average']
    player_impact[numeric_cols] = player_impact[numeric_cols].fillna(0)
    
    # Create impact score for batsmen
    player_impact['Batting_Impact'] = (player_impact['Batting_Average'] * 0.4 + 
                                      player_impact['Batting_Strike_Rate'] * 0.6) / 100
    
    # Create impact score for bowlers (lower is better for economy and average)
    player_impact['Bowling_Impact'] = 0
    bowler_mask = player_impact['Bowling_Average'] > 0
    
    if bowler_mask.any():
        max_econ = player_impact.loc[bowler_mask, 'Economy_Rate'].max()
        max_avg = player_impact.loc[bowler_mask, 'Bowling_Average'].max()
        
        player_impact.loc[bowler_mask, 'Bowling_Impact'] = (
            (1 - player_impact.loc[bowler_mask, 'Economy_Rate'] / max_econ) * 0.4 + 
            (1 - player_impact.loc[bowler_mask, 'Bowling_Average'] / max_avg) * 0.6
        )
    
    # Create overall impact score based on player type
    player_impact['Impact_Score'] = 0
    
    # Batsmen
    batsmen_mask = player_impact['Type'] == 'Batsman'
    player_impact.loc[batsmen_mask, 'Impact_Score'] = player_impact.loc[batsmen_mask, 'Batting_Impact']
    
    # Bowlers
    bowler_mask = player_impact['Type'] == 'Bowler'
    player_impact.loc[bowler_mask, 'Impact_Score'] = player_impact.loc[bowler_mask, 'Bowling_Impact']
    
    # All-rounders (weighted average)
    allrounder_mask = player_impact['Type'] == 'All-Rounder'
    player_impact.loc[allrounder_mask, 'Impact_Score'] = (
        player_impact.loc[allrounder_mask, 'Batting_Impact'] * 0.5 + 
        player_impact.loc[allrounder_mask, 'Bowling_Impact'] * 0.5
    )
    
    # Calculate team strength based on player impact
    team_strength = player_impact.groupby('Team')['Impact_Score'].agg(['mean', 'sum']).reset_index()
    team_strength.columns = ['Team', 'Average_Player_Impact', 'Total_Team_Impact']
    team_strength = team_strength.sort_values('Total_Team_Impact', ascending=False)
    
    # Get top players for each team
    top_players = {}
    for team in player_impact['Team'].unique():
        team_players = player_impact[player_impact['Team'] == team].sort_values('Impact_Score', ascending=False)
        top_players[team] = team_players.head(3)
    
    return player_impact, team_strength, top_players

# Analyze player impact
player_impact, team_strength, top_players = analyze_player_impact(players_df, alldata_df_clean)

print("\nTeam Strength Based on Player Impact:")
print(team_strength)


In [None]:

print("\nTop Players for Each Team:")
for team, players in top_players.items():
    print(f"\n{team}:")
    print(players[['Players', 'Type', 'Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Impact_Score']])

# Plot team strength
plt.figure(figsize=(14, 8))
plt.bar(team_strength['Team'], team_strength['Total_Team_Impact'], color='purple')
plt.xlabel('Teams')
plt.ylabel('Total Team Impact Score')
plt.title('IPL 2025 Team Strength Based on Player Impact')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('team_strength.png')
plt.close()


In [None]:

# Compare team strength with championship probability
comparison_df = pd.DataFrame({
    'Team': list(champion_prob.keys()),
    'Championship_Probability': [prob * 100 for prob in champion_prob.values()]
})
comparison_df = pd.merge(comparison_df, team_strength, on='Team', how='left')

print("\nComparison of Team Strength and Championship Probability:")
print(comparison_df.sort_values('Championship_Probability', ascending=False))

# Plot the relationship between team strength and championship probability
plt.figure(figsize=(12, 8))
plt.scatter(comparison_df['Total_Team_Impact'], comparison_df['Championship_Probability'])

for i, row in comparison_df.iterrows():
    plt.annotate(row['Team'], (row['Total_Team_Impact'], row['Championship_Probability']))

plt.xlabel('Total Team Impact Score')
plt.ylabel('Championship Probability (%)')
plt.title('Relationship Between Team Strength and Championship Probability')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('strength_vs_probability.png')
plt.show()


In [None]:

# Part 8: Summary and Conclusion
print("\nPart 8: Summary and Conclusion")
print("="*80)

# Print key findings
print("\nKey Findings from IPL Analysis:")
print(f"1. Predicted Champion for IPL 2025: {predicted_champion} with {championship_probability:.2%} probability")
print(f"2. Top contenders based on player impact: {', '.join(team_strength['Team'].head(3).tolist())}")
print(f"3. Teams with highest playoff probability: {', '.join(list(playoff_prob.keys())[:4])}")

print("\nModel Performance:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

print("\nMost Important Features for Prediction:")
for i, row in feature_importance.head(5).iterrows():
    print(f"{row['Feature']}: {row['Average_Importance']:.4f}")

print("\nTop Players Expected to Make an Impact in IPL 2025:")
for team, players in top_players.items():
    if team == predicted_champion:
        print(f"\nKey Players for Predicted Champion ({team}):")
        for i, player in players.iterrows():
            print(f"- {player['Players']} ({player['Type']}): Impact Score = {player['Impact_Score']:.4f}")

In [None]:

# Man of the Match Count Analysis
mom_counts = matches_df_clean['player_of_match'].value_counts().reset_index()
mom_counts.columns = ['Player', 'MoM_Count']
top_20_mom = mom_counts.head(20)

print("\nPlayers with Most Man of the Match Awards:")
print(top_20_mom)

plt.figure(figsize=(14, 8))
plt.bar(top_20_mom['Player'], top_20_mom['MoM_Count'], color='green')
plt.xlabel('Players')
plt.ylabel('Number of MoM Awards')
plt.title('Top 20 Players with Most Man of the Match Awards')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_mom_awards.png')
plt.show()


In [None]:

# K-Means Clustering for player role identification
def cluster_players_by_role(df):
    """Use K-Means clustering to identify player roles."""
    # Select players with both batting and bowling stats
    player_roles = df[(df['Matches_Batted'] > 10) & (df['Matches_Bowled'] > 10)].copy()
    
    # Select features for clustering
    features = ['Batting_Average', 'Economy_Rate']
    X = player_roles[features].copy()
    
    # Handle NaN values
    X = X.fillna(0)
    
    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Apply K-means clustering
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    player_roles['Cluster'] = kmeans.fit_predict(X_scaled)
    
    # Map cluster to role based on centroid values
    centroids = kmeans.cluster_centers_
    
    # Sort centroids by batting average (higher = more batting oriented)
    centroid_roles = {
        np.argmax(centroids[:, 0]): 'Batsman',
        np.argmin(centroids[:, 0]): 'Bowler'
    }
    
    # Remaining cluster is all-rounder
    for i in range(3):
        if i not in centroid_roles:
            centroid_roles[i] = 'All-Rounder'
    
    # Map cluster to role
    player_roles['Role'] = player_roles['Cluster'].map(centroid_roles)
    
    return player_roles, centroids, centroid_roles

# Apply clustering to identify player roles
player_roles, centroids, centroid_roles = cluster_players_by_role(alldata_df_clean)

print("\nPlayer Role Identification using K-Means Clustering:")
print(f"Cluster Centroids: {centroids}")
print(f"Cluster to Role Mapping: {centroid_roles}")
print("\nSample Player Role Distribution:")
print(player_roles['Role'].value_counts())


In [None]:

# Plot the clusters
plt.figure(figsize=(12, 10))
colors = ['red', 'green', 'blue']
roles = ['Batsman', 'Bowler', 'All-Rounder']

for i, role in enumerate(roles):
    cluster = player_roles[player_roles['Role'] == role]
    plt.scatter(cluster['Batting_Average'], cluster['Economy_Rate'],
                c=colors[i], label=role, alpha=0.6)

# Plot centroids
for i, role in enumerate(centroid_roles.values()):
    plt.scatter(centroids[i, 0], centroids[i, 1], c='black', marker='X', s=100,
                label=f"{role} Centroid")

plt.xlabel('Batting Average')
plt.ylabel('Economy Rate')
plt.title('K-Means Clustering of Players by Role')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('player_role_clustering.png')
plt.show()


In [None]:

# Identify Top Batsmen by Run Category
def identify_top_batsmen_by_category(deliveries_df, category, n=10):
    """Identify top batsmen in different run categories."""
    if category == 'sixes':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 6]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Sixes"
        col_name = "Sixes"
    elif category == 'fours':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 4]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Fours"
        col_name = "Fours"
    elif category == 'twos':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 2]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Twos"
        col_name = "Twos"
    elif category == 'singles':
        batsmen_runs = deliveries_df[deliveries_df['batsman_runs'] == 1]['batter'].value_counts().reset_index()
        title = "Top 10 Batsmen with Most Singles"
        col_name = "Singles"
    else:
        return None, ""
    
    batsmen_runs.columns = ['Batsman', col_name]
    return batsmen_runs.head(n), title, col_name

# Get top batsmen in each run category
top_sixes, sixes_title, sixes_col = identify_top_batsmen_by_category(deliveries_df_clean, 'sixes')
top_fours, fours_title, fours_col = identify_top_batsmen_by_category(deliveries_df_clean, 'fours')
top_twos, twos_title, twos_col = identify_top_batsmen_by_category(deliveries_df_clean, 'twos')
top_singles, singles_title, singles_col = identify_top_batsmen_by_category(deliveries_df_clean, 'singles')

print("\nTop 10 Batsmen with Most Sixes:")
print(top_sixes)

print("\nTop 10 Batsmen with Most Fours:")
print(top_fours)

print("\nTop 10 Batsmen with Most Twos:")
print(top_twos)

print("\nTop 10 Batsmen with Most Singles:")
print(top_singles)


In [None]:

# Plot top batsmen by run category
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot top sixes hitters
axes[0, 0].bar(top_sixes['Batsman'], top_sixes[sixes_col], color='red')
axes[0, 0].set_title(sixes_title)
axes[0, 0].set_xlabel('Batsmen')
axes[0, 0].set_ylabel('Number of Sixes')
axes[0, 0].tick_params(axis='x', rotation=45)

# Plot top fours hitters
axes[0, 1].bar(top_fours['Batsman'], top_fours[fours_col], color='blue')
axes[0, 1].set_title(fours_title)
axes[0, 1].set_xlabel('Batsmen')
axes[0, 1].set_ylabel('Number of Fours')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot top twos hitters
axes[1, 0].bar(top_twos['Batsman'], top_twos[twos_col], color='green')
axes[1, 0].set_title(twos_title)
axes[1, 0].set_xlabel('Batsmen')
axes[1, 0].set_ylabel('Number of Twos')
axes[1, 0].tick_params(axis='x', rotation=45)

# Plot top singles hitters
axes[1, 1].bar(top_singles['Batsman'], top_singles[singles_col], color='purple')
axes[1, 1].set_title(singles_title)
axes[1, 1].set_xlabel('Batsmen')
axes[1, 1].set_ylabel('Number of Singles')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('top_batsmen_by_run_category.png')
plt.show()


In [None]:

# 3.3 Seasonal Analysis
print("\n3.3 Seasonal Analysis")
print("-"*50)

# Calculate average runs per match per season
def calculate_avg_runs_per_season(matches_df, deliveries_df):
    """Calculate average runs per match per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store season statistics
    season_stats = pd.DataFrame(index=seasons)
    
    # Calculate average runs per match per season
    season_stats['Total_Matches'] = 0
    season_stats['Total_Runs'] = 0
    season_stats['Avg_Runs_Per_Match'] = 0
    season_stats['Targets_200_Plus'] = 0
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        season_stats.loc[season, 'Total_Matches'] = len(season_matches)
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        season_stats.loc[season, 'Total_Runs'] = season_deliveries['total_runs'].sum()
        
        # Calculate average runs per match
        if len(season_matches) > 0:
            season_stats.loc[season, 'Avg_Runs_Per_Match'] = (season_stats.loc[season, 'Total_Runs'] / 
                                                             (season_stats.loc[season, 'Total_Matches'] * 2)).round(2)
        
        # Calculate targets of 200+ runs
        innings_totals = season_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
        season_stats.loc[season, 'Targets_200_Plus'] = len(innings_totals[innings_totals >= 200])
    
    return season_stats

# Calculate seasonal statistics
season_stats = calculate_avg_runs_per_season(matches_df_clean, deliveries_df_clean)
print("\nSeasonal Statistics:")
print(season_stats)


In [None]:

# Plot average runs per match per season
plt.figure(figsize=(14, 8))
plt.plot(season_stats.index, season_stats['Avg_Runs_Per_Match'], marker='o', linestyle='-', color='blue')
plt.xlabel('Season')
plt.ylabel('Average Runs Per Match')
plt.title('Average Runs Per Match by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('avg_runs_per_season.png')
plt.show()


In [None]:

# Plot targets of 200+ runs per season
plt.figure(figsize=(14, 8))
plt.bar(season_stats.index, season_stats['Targets_200_Plus'], color='red')
plt.xlabel('Season')
plt.ylabel('Number of 200+ Totals')
plt.title('Number of 200+ Run Totals by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('200plus_totals_per_season.png')
plt.show()


In [None]:

# Calculate average score of each team per season
def calculate_team_score_per_season(matches_df, deliveries_df):
    """Calculate average score of each team per season."""
    # Get unique seasons and teams
    seasons = sorted(matches_df['season'].unique())
    teams = set(matches_df['team1'].unique()) | set(matches_df['team2'].unique())
    teams = [team for team in teams if pd.notnull(team)]
    
    # Initialize dataframe to store team scores per season
    team_season_scores = {}
    
    for season in seasons:
        team_season_scores[season] = {}
        
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get teams active in this season
        active_teams = set(season_matches['team1'].unique()) | set(season_matches['team2'].unique())
        
        for team in active_teams:
            if pd.isna(team):
                continue
                
            # Get deliveries for this team and season
            team_deliveries = deliveries_df[(deliveries_df['match_id'].isin(match_ids)) & 
                                           (deliveries_df['batting_team'] == team)]
            
            # Calculate team's innings totals
            innings_totals = team_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
            
            # Calculate average score
            if not innings_totals.empty:
                team_season_scores[season][team] = innings_totals.mean().round(2)
            else:
                team_season_scores[season][team] = 0
    
    # Convert to dataframe
    team_season_df = pd.DataFrame(team_season_scores)
    
    return team_season_df

# Calculate team scores per season
team_season_scores = calculate_team_score_per_season(matches_df_clean, deliveries_df_clean)
print("\nAverage Team Scores Per Season:")
print(team_season_scores)


In [None]:

# Find top performers per season (Orange Cap and Purple Cap)
def find_top_performers_per_season(matches_df, deliveries_df):
    """Find the Orange Cap (most runs) and Purple Cap (most wickets) holders per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store top performers
    top_performers = pd.DataFrame(index=seasons)
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find top run-scorer (Orange Cap)
        batsman_runs = season_deliveries.groupby('batter')['batsman_runs'].sum().reset_index()
        if not batsman_runs.empty:
            orange_cap = batsman_runs.sort_values('batsman_runs', ascending=False).iloc[0]
            top_performers.loc[season, 'Orange_Cap_Holder'] = orange_cap['batter']
            top_performers.loc[season, 'Orange_Cap_Runs'] = orange_cap['batsman_runs']
        
        # Find top wicket-taker (Purple Cap)
        bowler_wickets = season_deliveries[season_deliveries['is_wicket'] == 1].groupby('bowler').size().reset_index()
        if not bowler_wickets.empty:
            bowler_wickets.columns = ['bowler', 'wickets']
            purple_cap = bowler_wickets.sort_values('wickets', ascending=False).iloc[0]
            top_performers.loc[season, 'Purple_Cap_Holder'] = purple_cap['bowler']
            top_performers.loc[season, 'Purple_Cap_Wickets'] = purple_cap['wickets']
    
    return top_performers

# Find top performers per season
top_performers = find_top_performers_per_season(matches_df_clean, deliveries_df_clean)
print("\nOrange Cap (Top Run-scorer) and Purple Cap (Top Wicket-taker) Per Season:")
print(top_performers)


In [None]:

# Plot Orange Cap runs per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Orange_Cap_Runs'], color='orange')
plt.xlabel('Season')
plt.ylabel('Runs')
plt.title('Orange Cap Holder Runs Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Orange_Cap_Runs']):
    plt.text(i, v + 10, top_performers['Orange_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('orange_cap_runs.png')
plt.show()


In [None]:

# Plot Purple Cap wickets per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Purple_Cap_Wickets'], color='purple')
plt.xlabel('Season')
plt.ylabel('Wickets')
plt.title('Purple Cap Holder Wickets Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Purple_Cap_Wickets']):
    plt.text(i, v + 0.5, top_performers['Purple_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('purple_cap_wickets.png')
plt.show()


In [None]:

# Find top 10 bowlers per season
def find_top_bowlers_per_season(matches_df, deliveries_df, last_n_seasons=3):
    """Find the top 10 bowlers per season based on wickets taken."""
    # Get the last n seasons
    seasons = sorted(matches_df['season'].unique())[-last_n_seasons:]
    
    # Store data for all seasons
    all_season_bowlers = {}
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find wicket-taking bowlers
        wicket_deliveries = season_deliveries[season_deliveries['is_wicket'] == 1]
        bowler_wickets = wicket_deliveries.groupby('bowler').size().reset_index()
        bowler_wickets.columns = ['Bowler', 'Wickets']
        
        # Calculate economy rate
        bowler_runs = season_deliveries.groupby('bowler')['total_runs'].sum().reset_index()
        bowler_runs.columns = ['Bowler', 'Runs']
        
        bowler_balls = season_deliveries.groupby('bowler').size().reset_index()
        bowler_balls.columns = ['Bowler', 'Balls']
        
        # Merge wickets and economy data
        bowler_stats = pd.merge(bowler_wickets, bowler_runs, on='Bowler')
        bowler_stats = pd.merge(bowler_stats, bowler_balls, on='Bowler')
        
        # Calculate economy rate
        bowler_stats['Economy'] = (bowler_stats['Runs'] / (bowler_stats['Balls'] / 6)).round(2)
        
        # Calculate bowling average
        bowler_stats['Average'] = (bowler_stats['Runs'] / bowler_stats['Wickets']).round(2)
        
        # Calculate strike rate
        bowler_stats['Strike_Rate'] = (bowler_stats['Balls'] / bowler_stats['Wickets']).round(2)
        
        # Get top 10 bowlers
        top_10_bowlers = bowler_stats.sort_values('Wickets', ascending=False).head(10)
        
        # Store for this season
        all_season_bowlers[season] = top_10_bowlers
    
    return all_season_bowlers

# Find top bowlers for recent seasons
top_bowlers_per_season = find_top_bowlers_per_season(matches_df_clean, deliveries_df_clean)

# Print top bowlers for the most recent season
latest_season = max(top_bowlers_per_season.keys())
print(f"\nTop 10 Bowlers in {latest_season} Season:")
print(top_bowlers_per_season[latest_season][['Bowler', 'Wickets', 'Economy', 'Average', 'Strike_Rate']])


In [None]:

# Part 4: Feature Extraction
print("\nPart 4: Feature Extraction")
print("="*80)

def extract_features(matches_df, deliveries_df):
    """Extract features for machine learning models."""
    features_df = matches_df.copy()
    
    # Add team strength features
    # Calculate win percentage for each team
    teams = set(list(matches_df['team1'].unique()) + 
               list(matches_df['team2'].unique()))
    teams = [team for team in teams if pd.notnull(team)]
    
    team_win_pct = {}
    team_toss_win_pct = {}
    
    for team in teams:
        # Calculate overall win percentage
        team_matches = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_wins = matches_df[matches_df['winner'] == team]
        
        if len(team_matches) > 0:
            team_win_pct[team] = len(team_wins) / len(team_matches)
        else:
            team_win_pct[team] = 0
        
        # Calculate toss win percentage
        team_tosses = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_toss_wins = matches_df[matches_df['toss_winner'] == team]
        
        if len(team_tosses) > 0:
            team_toss_win_pct[team] = len(team_toss_wins) / len(team_tosses)
        else:
            team_toss_win_pct[team] = 0
    
    # Add team win percentage as feature
    features_df['team1_win_pct'] = features_df['team1'].map(team_win_pct)
    features_df['team2_win_pct'] = features_df['team2'].map(team_win_pct)
    
    # Add toss win percentage as feature
    features_df['team1_toss_win_pct'] = features_df['team1'].map(team_toss_win_pct)
    features_df['team2_toss_win_pct'] = features_df['team2'].map(team_toss_win_pct)
    
    # Add venue advantage feature
    venue_team_wins = {}
    
    for venue in features_df['venue'].unique():
        venue_matches = features_df[features_df['venue'] == venue]
        
        for team in teams:
            team_venue_matches = venue_matches[(venue_matches['team1'] == team) | 
                                              (venue_matches['team2'] == team)]
            team_venue_wins = venue_matches[venue_matches['winner'] == team]
            
            if len(team_venue_matches) > 0:
                venue_team_wins[(venue, team)] = len(team_venue_wins) / len(team_venue_matches)
            else:
                venue_team_wins[(venue, team)] = 0
    
    # Add venue advantage as feature
    features_df['team1_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team1']), 0), axis=1)
    features_df['team2_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team2']), 0), axis=1)
    
    # Add head-to-head advantage
    head_to_head_wins = {}
    
    for team1 in teams:
        for team2 in teams:
            if team1 != team2:
                h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                                        ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
                team1_wins = h2h_matches[h2h_matches['winner'] == team1]
                
                if len(h2h_matches) > 0:
                    head_to_head_wins[(team1, team2)] = len(team1_wins) / len(h2h_matches)
                else:
                    head_to_head_wins[(team1, team2)] = 0.5  # Neutral if no matches
    
    # Add head-to-head advantage as feature
    features_df['team1_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team1'], row['team2']), 0.5), axis=1)
    features_df['team2_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team2'], row['team1']), 0.5), axis=1)
    
    # Add recent form feature (last 5 matches)
    recent_form = {}
    
    for team in teams:
        team_matches = features_df[(features_df['team1'] == team) | 
                                   (features_df['team2'] == team)].sort_values('date')
        team_results = []
        
        for _, match in team_matches.iterrows():
            if match['winner'] == team:
                team_results.append(1)  # Win
            elif match['winner'] == 'No Result':
                team_results.append(0.5)  # Tie/No Result
            else:
                team_results.append(0)  # Loss
        
        # Calculate rolling average of last n matches
        n_matches = 5
        for i in range(len(team_results)):
            start_idx = max(0, i - n_matches + 1)
            recent_form[(team, i)] = sum(team_results[start_idx:i+1]) / min(n_matches, i+1)
    
    # Add team recent form as feature
    features_df['team1_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team1'], 
                                     len(features_df[(features_df['team1'] == row['team1']) | 
                                                    (features_df['team2'] == row['team1']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    features_df['team2_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team2'], 
                                     len(features_df[(features_df['team1'] == row['team2']) | 
                                                    (features_df['team2'] == row['team2']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    
    # Add toss decision feature
    features_df['toss_decision_bat'] = features_df['toss_decision'].apply(lambda x: 1 if x == 'bat' else 0)
    
    # Add binary target variable (1 if team1 wins, 0 if team2 wins)
    features_df['team1_win'] = features_df.apply(
        lambda row: 1 if row['winner'] == row['team1'] else 0 if row['winner'] == row['team2'] else None, axis=1)
    
    # Drop rows with no clear winner
    features_df = features_df.dropna(subset=['team1_win'])
    
    # Select relevant features for modeling
    selected_features = ['team1_win_pct', 'team2_win_pct', 
                        'team1_toss_win_pct', 'team2_toss_win_pct',
                        'team1_venue_advantage', 'team2_venue_advantage',
                        'team1_h2h_advantage', 'team2_h2h_advantage',
                        'team1_recent_form', 'team2_recent_form',
                        'toss_decision_bat', 'team1_win']
    
    model_data = features_df[selected_features].copy()
    
    return model_data, features_df

# Extract features for modeling
model_data, features_df = extract_features(matches_df_clean, deliveries_df_clean)
print("\nExtracted Features for Modeling:")
print(model_data.head())
print("\nFeature Summary Statistics:")
print(model_data.describe())


In [None]:

# Check for missing values
print("\nMissing Values in Model Data:")
print(model_data.isnull().sum())


In [None]:

# Handle any remaining missing values
model_data = model_data.fillna(0)


In [None]:

# Part 5: Model Building and Evaluation
print("\nPart 5: Model Building and Evaluation")
print("="*80)

# Split data into features and target
X = model_data.drop('team1_win', axis=1)
y = model_data['team1_win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\nRandom Forest Model:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, rf_pred))


In [None]:

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print("\nGradient Boosting Model:")
print(f"Accuracy: {gb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, gb_pred))


In [None]:

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print("\nXGBoost Model:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, xgb_pred))


In [None]:

# Create Ensemble Model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'
)

ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print("\nEnsemble Model:")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, ensemble_pred))


In [None]:

# Build Neural Network Model
def build_nn_model(input_dim):
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_dim=input_dim),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train Neural Network Model
nn_model = build_nn_model(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
nn_pred_prob = nn_model.predict(X_test)
nn_pred = (nn_pred_prob > 0.5).astype(int)
nn_accuracy = accuracy_score(y_test, nn_pred)

print("\nNeural Network Model:")
print(f"Accuracy: {nn_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, nn_pred))


In [None]:

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'RF_Importance': rf_model.feature_importances_,
    'GB_Importance': gb_model.feature_importances_,
    'XGB_Importance': xgb_model.feature_importances_
})

feature_importance['Average_Importance'] = feature_importance[['RF_Importance', 'GB_Importance', 'XGB_Importance']].mean(axis=1)
feature_importance = feature_importance.sort_values('Average_Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Average_Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Average across models)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()


In [None]:

# Part 6: 2025 IPL Prediction
print("\nPart 6: 2025 IPL Prediction")
print("="*80)

# Get unique teams in IPL 2025
ipl_2025_teams = players_df['Team'].unique()
print(f"IPL 2025 Teams: {len(ipl_2025_teams)}")
print(ipl_2025_teams)

# Function to prepare prediction data for a match
def prepare_match_prediction_data(team1, team2, venue, features_df, team_stats):
    """Prepare prediction data for a match."""
    # Create prediction data
    pred_data = {}
    
    # Add team strength features
    pred_data['team1_win_pct'] = team_stats.loc[team1, 'Winning_Percentage'] / 100 if team1 in team_stats.index else 0.5
    pred_data['team2_win_pct'] = team_stats.loc[team2, 'Winning_Percentage'] / 100 if team2 in team_stats.index else 0.5
    
    # Add toss win percentage (use average if not available)
    team1_toss_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)]
    team1_toss_wins = features_df[features_df['toss_winner'] == team1]
    
    team2_toss_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)]
    team2_toss_wins = features_df[features_df['toss_winner'] == team2]
    
    pred_data['team1_toss_win_pct'] = len(team1_toss_wins) / len(team1_toss_matches) if len(team1_toss_matches) > 0 else 0.5
    pred_data['team2_toss_win_pct'] = len(team2_toss_wins) / len(team2_toss_matches) if len(team2_toss_matches) > 0 else 0.5
    
    # Add venue advantage (use average if not available)
    team1_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team1) | (features_df['team2'] == team1))]
    team1_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team1)]
    
    team2_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team2) | (features_df['team2'] == team2))]
    team2_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team2)]
    
    pred_data['team1_venue_advantage'] = len(team1_venue_wins) / len(team1_venue_matches) if len(team1_venue_matches) > 0 else 0.5
    pred_data['team2_venue_advantage'] = len(team2_venue_wins) / len(team2_venue_matches) if len(team2_venue_matches) > 0 else 0.5
    
    # Add head-to-head advantage (use 0.5 if not available)
    h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                             ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
    team1_h2h_wins = h2h_matches[h2h_matches['winner'] == team1]
    
    pred_data['team1_h2h_advantage'] = len(team1_h2h_wins) / len(h2h_matches) if len(h2h_matches) > 0 else 0.5
    pred_data['team2_h2h_advantage'] = 1 - pred_data['team1_h2h_advantage']
    
    # Add recent form (use 0.5 if not available)
    team1_recent_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)].tail(5)
    team1_recent_wins = sum(1 for _, match in team1_recent_matches.iterrows() if match['winner'] == team1)
    
    team2_recent_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)].tail(5)
    team2_recent_wins = sum(1 for _, match in team2_recent_matches.iterrows() if match['winner'] == team2)
    
    pred_data['team1_recent_form'] = team1_recent_wins / len(team1_recent_matches) if len(team1_recent_matches) > 0 else 0.5
    pred_data['team2_recent_form'] = team2_recent_wins / len(team2_recent_matches) if len(team2_recent_matches) > 0 else 0.5
    
    # Add toss decision (assume batting first)
    pred_data['toss_decision_bat'] = 1
    
    return pd.DataFrame([pred_data])

# Simulate IPL 2025 tournament
def simulate_ipl_2025(fixtures_df, team_stats, features_df, ensemble_model, nn_model):
    """Simulate IPL 2025 tournament and predict the winner."""
    # Create copy of fixtures
    fixtures = fixtures_df.copy()
    
    # Initialize points table
    points_table = pd.DataFrame(index=ipl_2025_teams)
    points_table['Matches'] = 0
    points_table['Wins'] = 0
    points_table['Losses'] = 0
    points_table['Draws'] = 0
    points_table['Points'] = 0
    points_table['NRR'] = 0  # Net Run Rate (simplified)
    
    # Simulate league phase matches
    match_results = []
    
    for _, match in fixtures.iterrows():
        home_team = match['Home']
        away_team = match['Away']
        venue = match['Venue']
        
        # Prepare prediction data
        pred_data = prepare_match_prediction_data(home_team, away_team, venue, features_df, team_stats)
        
        # Make predictions
        ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
        nn_prob = nn_model.predict(pred_data)[0][0]
        
        # Combine predictions (ensemble gets more weight)
        combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
        
        # Determine winner
        if combined_prob > 0.5:
            winner = home_team
            loser = away_team
            win_prob = combined_prob
        else:
            winner = away_team
            loser = home_team
            win_prob = 1 - combined_prob
        
        # Update points table
        points_table.loc[winner, 'Matches'] += 1
        points_table.loc[winner, 'Wins'] += 1
        points_table.loc[winner, 'Points'] += 2
        
        points_table.loc[loser, 'Matches'] += 1
        points_table.loc[loser, 'Losses'] += 1
        
        # Simulate NRR (simplified)
        points_table.loc[winner, 'NRR'] += np.random.uniform(0.05, 0.2)
        points_table.loc[loser, 'NRR'] -= np.random.uniform(0.05, 0.2)
        
        # Record match result
        match_results.append({
            'Match_No': match['Match No'],
            'Home': home_team,
            'Away': away_team,
            'Winner': winner,
            'Win_Probability': win_prob
        })
    
    # Sort points table
    points_table = points_table.sort_values(['Points', 'NRR'], ascending=False)
    
    # Get top 4 teams for playoffs
    playoff_teams = points_table.head(4).index.tolist()
    
    # Simulate Qualifier 1 (1st vs 2nd)
    team1 = playoff_teams[0]
    team2 = playoff_teams[1]
    pred_data = prepare_match_prediction_data(team1, team2, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q1_winner = team1
        q1_loser = team2
    else:
        q1_winner = team2
        q1_loser = team1
    
    # Simulate Eliminator (3rd vs 4th)
    team3 = playoff_teams[2]
    team4 = playoff_teams[3]
    pred_data = prepare_match_prediction_data(team3, team4, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        eliminator_winner = team3
        eliminator_loser = team4
    else:
        eliminator_winner = team4
        eliminator_loser = team3
    
    # Simulate Qualifier 2 (Q1 loser vs Eliminator winner)
    pred_data = prepare_match_prediction_data(q1_loser, eliminator_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q2_winner = q1_loser
    else:
        q2_winner = eliminator_winner
    
    # Simulate Final (Q1 winner vs Q2 winner)
    pred_data = prepare_match_prediction_data(q1_winner, q2_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        champion = q1_winner
        runner_up = q2_winner
        final_win_prob = combined_prob
    else:
        champion = q2_winner
        runner_up = q1_winner
        final_win_prob = 1 - combined_prob
    
    return points_table, playoff_teams, champion, runner_up, final_win_prob, match_results

# Simulate IPL 2025 tournament multiple times
num_simulations = 100
simulation_results = []

for sim in range(num_simulations):
    points_table, playoff_teams, champion, runner_up, final_win_prob, match_results = simulate_ipl_2025(
        fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)
    
    simulation_results.append({
        'Champion': champion,
        'Runner_Up': runner_up,
        'Final_Win_Prob': final_win_prob,
        'Top_4': playoff_teams
    })

# Count championship wins for each team
champion_counts = {}
for result in simulation_results:
    champion = result['Champion']
    champion_counts[champion] = champion_counts.get(champion, 0) + 1

# Count top 4 appearances for each team
top4_counts = {}
for result in simulation_results:
    for team in result['Top_4']:
        top4_counts[team] = top4_counts.get(team, 0) + 1

# Calculate championship probability
champion_prob = {team: count / num_simulations for team, count in champion_counts.items()}
champion_prob = dict(sorted(champion_prob.items(), key=lambda x: x[1], reverse=True))

# Calculate playoff probability
playoff_prob = {team: count / num_simulations for team, count in top4_counts.items()}
playoff_prob = dict(sorted(playoff_prob.items(), key=lambda x: x[1], reverse=True))


In [None]:

print("\nChampionship Probability:")
for team, prob in champion_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

print("\nPlayoff Probability:")
for team, prob in playoff_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

# Plot championship probability
plt.figure(figsize=(14, 8))
plt.bar(champion_prob.keys(), [prob * 100 for prob in champion_prob.values()], color='gold')
plt.xlabel('Teams')
plt.ylabel('Championship Probability (%)')
plt.title('IPL 2025 Championship Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('championship_probability.png')
plt.show()


In [None]:

# Plot playoff probability
plt.figure(figsize=(14, 8))
plt.bar(playoff_prob.keys(), [prob * 100 for prob in playoff_prob.values()], color='blue')
plt.xlabel('Teams')
plt.ylabel('Playoff Probability (%)')
plt.title('IPL 2025 Playoff Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('playoff_probability.png')
plt.show()


In [None]:

# Predict the IPL 2025 winner
predicted_champion = max(champion_prob.items(), key=lambda x: x[1])[0]
championship_probability = champion_prob[predicted_champion]

print(f"\n🏆 Predicted IPL 2025 Champion: {predicted_champion} with {championship_probability:.2%} probability")

# Output final points table from a single simulation
final_points_table, _, _, _, _, _ = simulate_ipl_2025(
    fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)

print("\nPredicted Final Points Table:")
print(final_points_table[['Matches', 'Wins', 'Losses', 'Points', 'NRR']].sort_values(['Points', 'NRR'], ascending=False))


In [None]:

# Part 7: Player Impact Analysis for 2025
print("\nPart 7: Player Impact Analysis for 2025")
print("="*80)

# Analyze 2025 player impact
def analyze_player_impact(players_df, alldata_df):
    """Analyze player impact for 2025 season."""
    # Merge player data with team
    player_impact = pd.merge(players_df, alldata_df, left_on='Players', right_on='Player_Name', how='left')
    
    # Fill missing values
    numeric_cols = ['Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Bowling_Strike_Rate', 'Bowling_Average']
    player_impact[numeric_cols] = player_impact[numeric_cols].fillna(0)
    
    # Create impact score for batsmen
    player_impact['Batting_Impact'] = (player_impact['Batting_Average'] * 0.4 + 
                                      player_impact['Batting_Strike_Rate'] * 0.6) / 100
    
    # Create impact score for bowlers (lower is better for economy and average)
    player_impact['Bowling_Impact'] = 0
    bowler_mask = player_impact['Bowling_Average'] > 0
    
    if bowler_mask.any():
        max_econ = player_impact.loc[bowler_mask, 'Economy_Rate'].max()
        max_avg = player_impact.loc[bowler_mask, 'Bowling_Average'].max()
        
        player_impact.loc[bowler_mask, 'Bowling_Impact'] = (
            (1 - player_impact.loc[bowler_mask, 'Economy_Rate'] / max_econ) * 0.4 + 
            (1 - player_impact.loc[bowler_mask, 'Bowling_Average'] / max_avg) * 0.6
        )
    
    # Create overall impact score based on player type
    player_impact['Impact_Score'] = 0
    
    # Batsmen
    batsmen_mask = player_impact['Type'] == 'Batsman'
    player_impact.loc[batsmen_mask, 'Impact_Score'] = player_impact.loc[batsmen_mask, 'Batting_Impact']
    
    # Bowlers
    bowler_mask = player_impact['Type'] == 'Bowler'
    player_impact.loc[bowler_mask, 'Impact_Score'] = player_impact.loc[bowler_mask, 'Bowling_Impact']
    
    # All-rounders (weighted average)
    allrounder_mask = player_impact['Type'] == 'All-Rounder'
    player_impact.loc[allrounder_mask, 'Impact_Score'] = (
        player_impact.loc[allrounder_mask, 'Batting_Impact'] * 0.5 + 
        player_impact.loc[allrounder_mask, 'Bowling_Impact'] * 0.5
    )
    
    # Calculate team strength based on player impact
    team_strength = player_impact.groupby('Team')['Impact_Score'].agg(['mean', 'sum']).reset_index()
    team_strength.columns = ['Team', 'Average_Player_Impact', 'Total_Team_Impact']
    team_strength = team_strength.sort_values('Total_Team_Impact', ascending=False)
    
    # Get top players for each team
    top_players = {}
    for team in player_impact['Team'].unique():
        team_players = player_impact[player_impact['Team'] == team].sort_values('Impact_Score', ascending=False)
        top_players[team] = team_players.head(3)
    
    return player_impact, team_strength, top_players

# Analyze player impact
player_impact, team_strength, top_players = analyze_player_impact(players_df, alldata_df_clean)

print("\nTeam Strength Based on Player Impact:")
print(team_strength)

print("\nTop Players for Each Team:")
for team, players in top_players.items():
    print(f"\n{team}:")
    print(players[['Players', 'Type', 'Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Impact_Score']])


In [None]:

# Plot team strength
plt.figure(figsize=(14, 8))
plt.bar(team_strength['Team'], team_strength['Total_Team_Impact'], color='purple')
plt.xlabel('Teams')
plt.ylabel('Total Team Impact Score')
plt.title('IPL 2025 Team Strength Based on Player Impact')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('team_strength.png')
plt.show()


In [None]:

# Compare team strength with championship probability
comparison_df = pd.DataFrame({
    'Team': list(champion_prob.keys()),
    'Championship_Probability': [prob * 100 for prob in champion_prob.values()]
})
comparison_df = pd.merge(comparison_df, team_strength, on='Team', how='left')

print("\nComparison of Team Strength and Championship Probability:")
print(comparison_df.sort_values('Championship_Probability', ascending=False))

# Plot the relationship between team strength and championship probability
plt.figure(figsize=(12, 8))
plt.scatter(comparison_df['Total_Team_Impact'], comparison_df['Championship_Probability'])

for i, row in comparison_df.iterrows():
    plt.annotate(row['Team'], (row['Total_Team_Impact'], row['Championship_Probability']))

plt.xlabel('Total Team Impact Score')
plt.ylabel('Championship Probability (%)')
plt.title('Relationship Between Team Strength and Championship Probability')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('strength_vs_probability.png')
plt.show()


In [None]:

# Part 8: Summary and Conclusion
print("\nPart 8: Summary and Conclusion")
print("="*80)

# Print key findings
print("\nKey Findings from IPL Analysis:")
print(f"1. Predicted Champion for IPL 2025: {predicted_champion} with {championship_probability:.2%} probability")
print(f"2. Top contenders based on player impact: {', '.join(team_strength['Team'].head(3).tolist())}")
print(f"3. Teams with highest playoff probability: {', '.join(list(playoff_prob.keys())[:4])}")


In [None]:

print("\nModel Performance:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

print("\nMost Important Features for Prediction:")
for i, row in feature_importance.head(5).iterrows():
    print(f"{row['Feature']}: {row['Average_Importance']:.4f}")

print("\nTop Players Expected to Make an Impact in IPL 2025:")
for team, players in top_players.items():
    if team == predicted_champion:
        print(f"\nKey Players for Predicted Champion ({team}):")
        for i, player in players.iterrows():
            print(f"- {player['Players']} ({player['Type']}): Impact Score = {player['Impact_Score']:.4f}")

In [None]:

# Plot top fours hitters
axes[0, 1].bar(top_fours['Batsman'], top_fours[fours_col], color='blue')
axes[0, 1].set_title(fours_title)
axes[0, 1].set_xlabel('Batsmen')
axes[0, 1].set_ylabel('Number of Fours')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot top twos hitters
axes[1, 0].bar(top_twos['Batsman'], top_twos[twos_col], color='green')
axes[1, 0].set_title(twos_title)
axes[1, 0].set_xlabel('Batsmen')
axes[1, 0].set_ylabel('Number of Twos')
axes[1, 0].tick_params(axis='x', rotation=45)

# Plot top singles hitters
axes[1, 1].bar(top_singles['Batsman'], top_singles[singles_col], color='purple')
axes[1, 1].set_title(singles_title)
axes[1, 1].set_xlabel('Batsmen')
axes[1, 1].set_ylabel('Number of Singles')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('top_batsmen_by_run_category.png')
plt.show()


In [None]:

# 3.3 Seasonal Analysis
print("\n3.3 Seasonal Analysis")
print("-"*50)

# Calculate average runs per match per season
def calculate_avg_runs_per_season(matches_df, deliveries_df):
    """Calculate average runs per match per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store season statistics
    season_stats = pd.DataFrame(index=seasons)
    
    # Calculate average runs per match per season
    season_stats['Total_Matches'] = 0
    season_stats['Total_Runs'] = 0
    season_stats['Avg_Runs_Per_Match'] = 0
    season_stats['Targets_200_Plus'] = 0
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        season_stats.loc[season, 'Total_Matches'] = len(season_matches)
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        season_stats.loc[season, 'Total_Runs'] = season_deliveries['total_runs'].sum()
        
        # Calculate average runs per match
        if len(season_matches) > 0:
            season_stats.loc[season, 'Avg_Runs_Per_Match'] = (season_stats.loc[season, 'Total_Runs'] / 
                                                             (season_stats.loc[season, 'Total_Matches'] * 2)).round(2)
        
        # Calculate targets of 200+ runs
        innings_totals = season_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
        season_stats.loc[season, 'Targets_200_Plus'] = len(innings_totals[innings_totals >= 200])
    
    return season_stats

# Calculate seasonal statistics
season_stats = calculate_avg_runs_per_season(matches_df_clean, deliveries_df_clean)
print("\nSeasonal Statistics:")
print(season_stats)


In [None]:

# Plot average runs per match per season
plt.figure(figsize=(14, 8))
plt.plot(season_stats.index, season_stats['Avg_Runs_Per_Match'], marker='o', linestyle='-', color='blue')
plt.xlabel('Season')
plt.ylabel('Average Runs Per Match')
plt.title('Average Runs Per Match by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('avg_runs_per_season.png')
plt.show()


In [None]:

# Plot targets of 200+ runs per season
plt.figure(figsize=(14, 8))
plt.bar(season_stats.index, season_stats['Targets_200_Plus'], color='red')
plt.xlabel('Season')
plt.ylabel('Number of 200+ Totals')
plt.title('Number of 200+ Run Totals by Season')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('200plus_totals_per_season.png')
plt.show()


In [None]:

# Calculate average score of each team per season
def calculate_team_score_per_season(matches_df, deliveries_df):
    """Calculate average score of each team per season."""
    # Get unique seasons and teams
    seasons = sorted(matches_df['season'].unique())
    teams = set(matches_df['team1'].unique()) | set(matches_df['team2'].unique())
    teams = [team for team in teams if pd.notnull(team)]
    
    # Initialize dataframe to store team scores per season
    team_season_scores = {}
    
    for season in seasons:
        team_season_scores[season] = {}
        
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get teams active in this season
        active_teams = set(season_matches['team1'].unique()) | set(season_matches['team2'].unique())
        
        for team in active_teams:
            if pd.isna(team):
                continue
                
            # Get deliveries for this team and season
            team_deliveries = deliveries_df[(deliveries_df['match_id'].isin(match_ids)) & 
                                           (deliveries_df['batting_team'] == team)]
            
            # Calculate team's innings totals
            innings_totals = team_deliveries.groupby(['match_id', 'inning'])['total_runs'].sum()
            
            # Calculate average score
            if not innings_totals.empty:
                team_season_scores[season][team] = innings_totals.mean().round(2)
            else:
                team_season_scores[season][team] = 0
    
    # Convert to dataframe
    team_season_df = pd.DataFrame(team_season_scores)
    
    return team_season_df

# Calculate team scores per season
team_season_scores = calculate_team_score_per_season(matches_df_clean, deliveries_df_clean)
print("\nAverage Team Scores Per Season:")
print(team_season_scores)


In [None]:

# Find top performers per season (Orange Cap and Purple Cap)
def find_top_performers_per_season(matches_df, deliveries_df):
    """Find the Orange Cap (most runs) and Purple Cap (most wickets) holders per season."""
    # Get unique seasons
    seasons = sorted(matches_df['season'].unique())
    
    # Initialize dataframe to store top performers
    top_performers = pd.DataFrame(index=seasons)
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find top run-scorer (Orange Cap)
        batsman_runs = season_deliveries.groupby('batter')['batsman_runs'].sum().reset_index()
        if not batsman_runs.empty:
            orange_cap = batsman_runs.sort_values('batsman_runs', ascending=False).iloc[0]
            top_performers.loc[season, 'Orange_Cap_Holder'] = orange_cap['batter']
            top_performers.loc[season, 'Orange_Cap_Runs'] = orange_cap['batsman_runs']
        
        # Find top wicket-taker (Purple Cap)
        bowler_wickets = season_deliveries[season_deliveries['is_wicket'] == 1].groupby('bowler').size().reset_index()
        if not bowler_wickets.empty:
            bowler_wickets.columns = ['bowler', 'wickets']
            purple_cap = bowler_wickets.sort_values('wickets', ascending=False).iloc[0]
            top_performers.loc[season, 'Purple_Cap_Holder'] = purple_cap['bowler']
            top_performers.loc[season, 'Purple_Cap_Wickets'] = purple_cap['wickets']
    
    return top_performers

# Find top performers per season
top_performers = find_top_performers_per_season(matches_df_clean, deliveries_df_clean)
print("\nOrange Cap (Top Run-scorer) and Purple Cap (Top Wicket-taker) Per Season:")
print(top_performers)


In [None]:

# Plot Orange Cap runs per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Orange_Cap_Runs'], color='orange')
plt.xlabel('Season')
plt.ylabel('Runs')
plt.title('Orange Cap Holder Runs Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Orange_Cap_Runs']):
    plt.text(i, v + 10, top_performers['Orange_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('orange_cap_runs.png')
plt.show()


In [None]:

# Plot Purple Cap wickets per season
plt.figure(figsize=(14, 8))
plt.bar(top_performers.index, top_performers['Purple_Cap_Wickets'], color='purple')
plt.xlabel('Season')
plt.ylabel('Wickets')
plt.title('Purple Cap Holder Wickets Per Season')
plt.xticks(rotation=45)
for i, v in enumerate(top_performers['Purple_Cap_Wickets']):
    plt.text(i, v + 0.5, top_performers['Purple_Cap_Holder'].iloc[i], 
             fontsize=8, ha='center', rotation=90)
plt.tight_layout()
plt.savefig('purple_cap_wickets.png')
plt.show()


In [None]:

# Find top 10 bowlers per season
def find_top_bowlers_per_season(matches_df, deliveries_df, last_n_seasons=3):
    """Find the top 10 bowlers per season based on wickets taken."""
    # Get the last n seasons
    seasons = sorted(matches_df['season'].unique())[-last_n_seasons:]
    
    # Store data for all seasons
    all_season_bowlers = {}
    
    for season in seasons:
        # Get matches for this season
        season_matches = matches_df[matches_df['season'] == season]
        
        # Get match IDs for this season
        match_ids = season_matches['id'].tolist()
        
        # Get deliveries for these matches
        season_deliveries = deliveries_df[deliveries_df['match_id'].isin(match_ids)]
        
        # Find wicket-taking bowlers
        wicket_deliveries = season_deliveries[season_deliveries['is_wicket'] == 1]
        bowler_wickets = wicket_deliveries.groupby('bowler').size().reset_index()
        bowler_wickets.columns = ['Bowler', 'Wickets']
        
        # Calculate economy rate
        bowler_runs = season_deliveries.groupby('bowler')['total_runs'].sum().reset_index()
        bowler_runs.columns = ['Bowler', 'Runs']
        
        bowler_balls = season_deliveries.groupby('bowler').size().reset_index()
        bowler_balls.columns = ['Bowler', 'Balls']
        
        # Merge wickets and economy data
        bowler_stats = pd.merge(bowler_wickets, bowler_runs, on='Bowler')
        bowler_stats = pd.merge(bowler_stats, bowler_balls, on='Bowler')
        
        # Calculate economy rate
        bowler_stats['Economy'] = (bowler_stats['Runs'] / (bowler_stats['Balls'] / 6)).round(2)
        
        # Calculate bowling average
        bowler_stats['Average'] = (bowler_stats['Runs'] / bowler_stats['Wickets']).round(2)
        
        # Calculate strike rate
        bowler_stats['Strike_Rate'] = (bowler_stats['Balls'] / bowler_stats['Wickets']).round(2)
        
        # Get top 10 bowlers
        top_10_bowlers = bowler_stats.sort_values('Wickets', ascending=False).head(10)
        
        # Store for this season
        all_season_bowlers[season] = top_10_bowlers
    
    return all_season_bowlers

# Find top bowlers for recent seasons
top_bowlers_per_season = find_top_bowlers_per_season(matches_df_clean, deliveries_df_clean)

# Print top bowlers for the most recent season
latest_season = max(top_bowlers_per_season.keys())
print(f"\nTop 10 Bowlers in {latest_season} Season:")
print(top_bowlers_per_season[latest_season][['Bowler', 'Wickets', 'Economy', 'Average', 'Strike_Rate']])


In [None]:

# Part 4: Feature Extraction
print("\nPart 4: Feature Extraction")
print("="*80)

def extract_features(matches_df, deliveries_df):
    """Extract features for machine learning models."""
    features_df = matches_df.copy()
    
    # Add team strength features
    # Calculate win percentage for each team
    teams = set(list(matches_df['team1'].unique()) + 
               list(matches_df['team2'].unique()))
    teams = [team for team in teams if pd.notnull(team)]
    
    team_win_pct = {}
    team_toss_win_pct = {}
    
    for team in teams:
        # Calculate overall win percentage
        team_matches = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_wins = matches_df[matches_df['winner'] == team]
        
        if len(team_matches) > 0:
            team_win_pct[team] = len(team_wins) / len(team_matches)
        else:
            team_win_pct[team] = 0
        
        # Calculate toss win percentage
        team_tosses = matches_df[(matches_df['team1'] == team) | (matches_df['team2'] == team)]
        team_toss_wins = matches_df[matches_df['toss_winner'] == team]
        
        if len(team_tosses) > 0:
            team_toss_win_pct[team] = len(team_toss_wins) / len(team_tosses)
        else:
            team_toss_win_pct[team] = 0
    
    # Add team win percentage as feature
    features_df['team1_win_pct'] = features_df['team1'].map(team_win_pct)
    features_df['team2_win_pct'] = features_df['team2'].map(team_win_pct)
    
    # Add toss win percentage as feature
    features_df['team1_toss_win_pct'] = features_df['team1'].map(team_toss_win_pct)
    features_df['team2_toss_win_pct'] = features_df['team2'].map(team_toss_win_pct)
    
    # Add venue advantage feature
    venue_team_wins = {}
    
    for venue in features_df['venue'].unique():
        venue_matches = features_df[features_df['venue'] == venue]
        
        for team in teams:
            team_venue_matches = venue_matches[(venue_matches['team1'] == team) | 
                                              (venue_matches['team2'] == team)]
            team_venue_wins = venue_matches[venue_matches['winner'] == team]
            
            if len(team_venue_matches) > 0:
                venue_team_wins[(venue, team)] = len(team_venue_wins) / len(team_venue_matches)
            else:
                venue_team_wins[(venue, team)] = 0
    
    # Add venue advantage as feature
    features_df['team1_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team1']), 0), axis=1)
    features_df['team2_venue_advantage'] = features_df.apply(
        lambda row: venue_team_wins.get((row['venue'], row['team2']), 0), axis=1)
    
    # Add head-to-head advantage
    head_to_head_wins = {}
    
    for team1 in teams:
        for team2 in teams:
            if team1 != team2:
                h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                                        ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
                team1_wins = h2h_matches[h2h_matches['winner'] == team1]
                
                if len(h2h_matches) > 0:
                    head_to_head_wins[(team1, team2)] = len(team1_wins) / len(h2h_matches)
                else:
                    head_to_head_wins[(team1, team2)] = 0.5  # Neutral if no matches
    
    # Add head-to-head advantage as feature
    features_df['team1_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team1'], row['team2']), 0.5), axis=1)
    features_df['team2_h2h_advantage'] = features_df.apply(
        lambda row: head_to_head_wins.get((row['team2'], row['team1']), 0.5), axis=1)
    
    # Add recent form feature (last 5 matches)
    recent_form = {}
    
    for team in teams:
        team_matches = features_df[(features_df['team1'] == team) | 
                                   (features_df['team2'] == team)].sort_values('date')
        team_results = []
        
        for _, match in team_matches.iterrows():
            if match['winner'] == team:
                team_results.append(1)  # Win
            elif match['winner'] == 'No Result':
                team_results.append(0.5)  # Tie/No Result
            else:
                team_results.append(0)  # Loss
        
        # Calculate rolling average of last n matches
        n_matches = 5
        for i in range(len(team_results)):
            start_idx = max(0, i - n_matches + 1)
            recent_form[(team, i)] = sum(team_results[start_idx:i+1]) / min(n_matches, i+1)
    
    # Add team recent form as feature
    features_df['team1_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team1'], 
                                     len(features_df[(features_df['team1'] == row['team1']) | 
                                                    (features_df['team2'] == row['team1']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    features_df['team2_recent_form'] = features_df.apply(
        lambda row: recent_form.get((row['team2'], 
                                     len(features_df[(features_df['team1'] == row['team2']) | 
                                                    (features_df['team2'] == row['team2']) & 
                                                    (features_df['date'] < row['date'])])), 0.5), axis=1)
    
    # Add toss decision feature
    features_df['toss_decision_bat'] = features_df['toss_decision'].apply(lambda x: 1 if x == 'bat' else 0)
    
    # Add binary target variable (1 if team1 wins, 0 if team2 wins)
    features_df['team1_win'] = features_df.apply(
        lambda row: 1 if row['winner'] == row['team1'] else 0 if row['winner'] == row['team2'] else None, axis=1)
    
    # Drop rows with no clear winner
    features_df = features_df.dropna(subset=['team1_win'])
    
    # Select relevant features for modeling
    selected_features = ['team1_win_pct', 'team2_win_pct', 
                        'team1_toss_win_pct', 'team2_toss_win_pct',
                        'team1_venue_advantage', 'team2_venue_advantage',
                        'team1_h2h_advantage', 'team2_h2h_advantage',
                        'team1_recent_form', 'team2_recent_form',
                        'toss_decision_bat', 'team1_win']
    
    model_data = features_df[selected_features].copy()
    
    return model_data, features_df


In [None]:

# Extract features for modeling
model_data, features_df = extract_features(matches_df_clean, deliveries_df_clean)
print("\nExtracted Features for Modeling:")
print(model_data.head())
print("\nFeature Summary Statistics:")
print(model_data.describe())


In [None]:

# Check for missing values
print("\nMissing Values in Model Data:")
print(model_data.isnull().sum())

# Handle any remaining missing values
model_data = model_data.fillna(0)


In [None]:

# Part 5: Model Building and Evaluation
print("\nPart 5: Model Building and Evaluation")
print("="*80)

# Split data into features and target
X = model_data.drop('team1_win', axis=1)
y = model_data['team1_win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\nRandom Forest Model:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, rf_pred))


In [None]:

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print("\nGradient Boosting Model:")
print(f"Accuracy: {gb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, gb_pred))


In [None]:

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print("\nXGBoost Model:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, xgb_pred))


In [None]:

# Create Ensemble Model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'
)

ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print("\nEnsemble Model:")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, ensemble_pred))


In [None]:

# Build Neural Network Model
def build_nn_model(input_dim):
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_dim=input_dim),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train Neural Network Model
nn_model = build_nn_model(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
nn_pred_prob = nn_model.predict(X_test)
nn_pred = (nn_pred_prob > 0.5).astype(int)
nn_accuracy = accuracy_score(y_test, nn_pred)

print("\nNeural Network Model:")
print(f"Accuracy: {nn_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, nn_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'RF_Importance': rf_model.feature_importances_,
    'GB_Importance': gb_model.feature_importances_,
    'XGB_Importance': xgb_model.feature_importances_
})

feature_importance['Average_Importance'] = feature_importance[['RF_Importance', 'GB_Importance', 'XGB_Importance']].mean(axis=1)
feature_importance = feature_importance.sort_values('Average_Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


In [None]:

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Average_Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Average across models)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()


In [None]:

# Part 6: 2025 IPL Prediction
print("\nPart 6: 2025 IPL Prediction")
print("="*80)

# Get unique teams in IPL 2025
ipl_2025_teams = players_df['Team'].unique()
print(f"IPL 2025 Teams: {len(ipl_2025_teams)}")
print(ipl_2025_teams)

# Function to prepare prediction data for a match
def prepare_match_prediction_data(team1, team2, venue, features_df, team_stats):
    """Prepare prediction data for a match."""
    # Create prediction data
    pred_data = {}
    
    # Add team strength features
    pred_data['team1_win_pct'] = team_stats.loc[team1, 'Winning_Percentage'] / 100 if team1 in team_stats.index else 0.5
    pred_data['team2_win_pct'] = team_stats.loc[team2, 'Winning_Percentage'] / 100 if team2 in team_stats.index else 0.5
    
    # Add toss win percentage (use average if not available)
    team1_toss_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)]
    team1_toss_wins = features_df[features_df['toss_winner'] == team1]
    
    team2_toss_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)]
    team2_toss_wins = features_df[features_df['toss_winner'] == team2]
    
    pred_data['team1_toss_win_pct'] = len(team1_toss_wins) / len(team1_toss_matches) if len(team1_toss_matches) > 0 else 0.5
    pred_data['team2_toss_win_pct'] = len(team2_toss_wins) / len(team2_toss_matches) if len(team2_toss_matches) > 0 else 0.5
    
    # Add venue advantage (use average if not available)
    team1_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team1) | (features_df['team2'] == team1))]
    team1_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team1)]
    
    team2_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team2) | (features_df['team2'] == team2))]
    team2_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team2)]
    
    pred_data['team1_venue_advantage'] = len(team1_venue_wins) / len(team1_venue_matches) if len(team1_venue_matches) > 0 else 0.5
    pred_data['team2_venue_advantage'] = len(team2_venue_wins) / len(team2_venue_matches) if len(team2_venue_matches) > 0 else 0.5
    
    # Add head-to-head advantage (use 0.5 if not available)
    h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                             ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
    team1_h2h_wins = h2h_matches[h2h_matches['winner'] == team1]
    
    pred_data['team1_h2h_advantage'] = len(team1_h2h_wins) / len(h2h_matches) if len(h2h_matches) > 0 else 0.5
    pred_data['team2_h2h_advantage'] = 1 - pred_data['team1_h2h_advantage']
    
    # Add recent form (use 0.5 if not available)
    team1_recent_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)].tail(5)
    team1_recent_wins = sum(1 for _, match in team1_recent_matches.iterrows() if match['winner'] == team1)
    
    team2_recent_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)].tail(5)
    team2_recent_wins = sum(1 for _, match in team2_recent_matches.iterrows() if match['winner'] == team2)
    
    pred_data['team1_recent_form'] = team1_recent_wins / len(team1_recent_matches) if len(team1_recent_matches) > 0 else 0.5
    pred_data['team2_recent_form'] = team2_recent_wins / len(team2_recent_matches) if len(team2_recent_matches) > 0 else 0.5
    
    # Add toss decision (assume batting first)
    pred_data['toss_decision_bat'] = 1
    
    return pd.DataFrame([pred_data])

# Simulate IPL 2025 tournament
def simulate_ipl_2025(fixtures_df, team_stats, features_df, ensemble_model, nn_model):
    """Simulate IPL 2025 tournament and predict the winner."""
    # Create copy of fixtures
    fixtures = fixtures_df.copy()
    
    # Initialize points table
    points_table = pd.DataFrame(index=ipl_2025_teams)
    points_table['Matches'] = 0
    points_table['Wins'] = 0
    points_table['Losses'] = 0
    points_table['Draws'] = 0
    points_table['Points'] = 0
    points_table['NRR'] = 0  # Net Run Rate (simplified)
    
    # Simulate league phase matches
    match_results = []
    
    for _, match in fixtures.iterrows():
        home_team = match['Home']
        away_team = match['Away']
        venue = match['Venue']
        
        # Prepare prediction data
        pred_data = prepare_match_prediction_data(home_team, away_team, venue, features_df, team_stats)
        
        # Make predictions
        ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
        nn_prob = nn_model.predict(pred_data)[0][0]
        
        # Combine predictions (ensemble gets more weight)
        combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
        
        # Determine winner
        if combined_prob > 0.5:
            winner = home_team
            loser = away_team
            win_prob = combined_prob
        else:
            winner = away_team
            loser = home_team
            win_prob = 1 - combined_prob
        
        # Update points table
        points_table.loc[winner, 'Matches'] += 1
        points_table.loc[winner, 'Wins'] += 1
        points_table.loc[winner, 'Points'] += 2
        
        points_table.loc[loser, 'Matches'] += 1
        points_table.loc[loser, 'Losses'] += 1
        
        # Simulate NRR (simplified)
        points_table.loc[winner, 'NRR'] += np.random.uniform(0.05, 0.2)
        points_table.loc[loser, 'NRR'] -= np.random.uniform(0.05, 0.2)
        
        # Record match result
        match_results.append({
            'Match_No': match['Match No'],
            'Home': home_team,
            'Away': away_team,
            'Winner': winner,
            'Win_Probability': win_prob
        })
    
    # Sort points table
    points_table = points_table.sort_values(['Points', 'NRR'], ascending=False)
    
    # Get top 4 teams for playoffs
    playoff_teams = points_table.head(4).index.tolist()
    
    # Simulate Qualifier 1 (1st vs 2nd)
    team1 = playoff_teams[0]
    team2 = playoff_teams[1]
    pred_data = prepare_match_prediction_data(team1, team2, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q1_winner = team1
        q1_loser = team2
    else:
        q1_winner = team2
        q1_loser = team1
    
    # Simulate Eliminator (3rd vs 4th)
    team3 = playoff_teams[2]
    team4 = playoff_teams[3]
    pred_data = prepare_match_prediction_data(team3, team4, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        eliminator_winner = team3
        eliminator_loser = team4
    else:
        eliminator_winner = team4
        eliminator_loser = team3
    
    # Simulate Qualifier 2 (Q1 loser vs Eliminator winner)
    pred_data = prepare_match_prediction_data(q1_loser, eliminator_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q2_winner = q1_loser
    else:
        q2_winner = eliminator_winner
    
    # Simulate Final (Q1 winner vs Q2 winner)
    pred_data = prepare_match_prediction_data(q1_winner, q2_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        champion = q1_winner
        runner_up = q2_winner
        final_win_prob = combined_prob
    else:
        champion = q2_winner
        runner_up = q1_winner
        final_win_prob = 1 - combined_prob
    
    return points_table, playoff_teams, champion, runner_up, final_win_prob, match_results

# Simulate IPL 2025 tournament multiple times
num_simulations = 100
simulation_results = []

for sim in range(num_simulations):
    points_table, playoff_teams, champion, runner_up, final_win_prob, match_results = simulate_ipl_2025(
        fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)
    
    simulation_results.append({
        'Champion': champion,
        'Runner_Up': runner_up,
        'Final_Win_Prob': final_win_prob,
        'Top_4': playoff_teams
    })

# Count championship wins for each team
champion_counts = {}
for result in simulation_results:
    champion = result['Champion']
    champion_counts[champion] = champion_counts.get(champion, 0) + 1

# Count top 4 appearances for each team
top4_counts = {}
for result in simulation_results:
    for team in result['Top_4']:
        top4_counts[team] = top4_counts.get(team, 0) + 1

# Calculate championship probability
champion_prob = {team: count / num_simulations for team, count in champion_counts.items()}
champion_prob = dict(sorted(champion_prob.items(), key=lambda x: x[1], reverse=True))

# Calculate playoff probability
playoff_prob = {team: count / num_simulations for team, count in top4_counts.items()}
playoff_prob = dict(sorted(playoff_prob.items(), key=lambda x: x[1], reverse=True))


In [None]:

print("\nChampionship Probability:")
for team, prob in champion_prob.items():
    print(f"{team}: {prob:.2%}")

print("\nPlayoff Probability:")
for team, prob in playoff_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

# Plot championship probability
plt.figure(figsize=(14, 8))
plt.bar(champion_prob.keys(), [prob * 100 for prob in champion_prob.values()], color='gold')
plt.xlabel('Teams')
plt.ylabel('Championship Probability (%)')
plt.title('IPL 2025 Championship Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('championship_probability.png')
plt.show()


In [None]:

# Plot playoff probability
plt.figure(figsize=(14, 8))
plt.bar(playoff_prob.keys(), [prob * 100 for prob in playoff_prob.values()], color='blue')
plt.xlabel('Teams')
plt.ylabel('Playoff Probability (%)')
plt.title('IPL 2025 Playoff Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('playoff_probability.png')
plt.show()


In [None]:

# Predict the IPL 2025 winner
predicted_champion = max(champion_prob.items(), key=lambda x: x[1])[0]
championship_probability = champion_prob[predicted_champion]

print(f"\n🏆 Predicted IPL 2025 Champion: {predicted_champion} with {championship_probability:.2%} probability")

# Output final points table from a single simulation
final_points_table, _, _, _, _, _ = simulate_ipl_2025(
    fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)

print("\nPredicted Final Points Table:")
print(final_points_table[['Matches', 'Wins', 'Losses', 'Points', 'NRR']].sort_values(['Points', 'NRR'], ascending=False))


In [None]:

# Part 7: Player Impact Analysis for 2025
print("\nPart 7: Player Impact Analysis for 2025")
print("="*80)

# Analyze 2025 player impact
def analyze_player_impact(players_df, alldata_df):
    """Analyze player impact for 2025 season."""
    # Merge player data with team
    player_impact = pd.merge(players_df, alldata_df, left_on='Players', right_on='Player_Name', how='left')
    
    # Fill missing values
    numeric_cols = ['Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Bowling_Strike_Rate', 'Bowling_Average']
    player_impact[numeric_cols] = player_impact[numeric_cols].fillna(0)
    
    # Create impact score for batsmen
    player_impact['Batting_Impact'] = (player_impact['Batting_Average'] * 0.4 + 
                                      player_impact['Batting_Strike_Rate'] * 0.6) / 100
    
    # Create impact score for bowlers (lower is better for economy and average)
    player_impact['Bowling_Impact'] = 0
    bowler_mask = player_impact['Bowling_Average'] > 0
    
    if bowler_mask.any():
        max_econ = player_impact.loc[bowler_mask, 'Economy_Rate'].max()
        max_avg = player_impact.loc[bowler_mask, 'Bowling_Average'].max()
        
        player_impact.loc[bowler_mask, 'Bowling_Impact'] = (
            (1 - player_impact.loc[bowler_mask, 'Economy_Rate'] / max_econ) * 0.4 + 
            (1 - player_impact.loc[bowler_mask, 'Bowling_Average'] / max_avg) * 0.6
        )
    
    # Create overall impact score based on player type
    player_impact['Impact_Score'] = 0
    
    # Batsmen
    batsmen_mask = player_impact['Type'] == 'Batsman'
    player_impact.loc[batsmen_mask, 'Impact_Score'] = player_impact.loc[batsmen_mask, 'Batting_Impact']
    
    # Bowlers
    bowler_mask = player_impact['Type'] == 'Bowler'
    player_impact.loc[bowler_mask, 'Impact_Score'] = player_impact.loc[bowler_mask, 'Bowling_Impact']
    
    # All-rounders (weighted average)
    allrounder_mask = player_impact['Type'] == 'All-Rounder'
    player_impact.loc[allrounder_mask, 'Impact_Score'] = (
        player_impact.loc[allrounder_mask, 'Batting_Impact'] * 0.5 + 
        player_impact.loc[allrounder_mask, 'Bowling_Impact'] * 0.5
    )
    
    # Calculate team strength based on player impact
    team_strength = player_impact.groupby('Team')['Impact_Score'].agg(['mean', 'sum']).reset_index()
    team_strength.columns = ['Team', 'Average_Player_Impact', 'Total_Team_Impact']
    team_strength = team_strength.sort_values('Total_Team_Impact', ascending=False)
    
    # Get top players for each team
    top_players = {}
    for team in player_impact['Team'].unique():
        team_players = player_impact[player_impact['Team'] == team].sort_values('Impact_Score', ascending=False)
        top_players[team] = team_players.head(3)
    
    return player_impact, team_strength, top_players

# Analyze player impact
player_impact, team_strength, top_players = analyze_player_impact(players_df, alldata_df_clean)

print("\nTeam Strength Based on Player Impact:")
print(team_strength)

print("\nTop Players for Each Team:")
for team, players in top_players.items():
    print(f"\n{team}:")
    print(players[['Players', 'Type', 'Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Impact_Score']])


In [None]:

# Plot team strength
plt.figure(figsize=(14, 8))
plt.bar(team_strength['Team'], team_strength['Total_Team_Impact'], color='purple')
plt.xlabel('Teams')
plt.ylabel('Total Team Impact Score')
plt.title('IPL 2025 Team Strength Based on Player Impact')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('team_strength.png')
plt.show()


In [None]:

# Compare team strength with championship probability
comparison_df = pd.DataFrame({
    'Team': list(champion_prob.keys()),
    'Championship_Probability': [prob * 100 for prob in champion_prob.values()]
})
comparison_df = pd.merge(comparison_df, team_strength, on='Team', how='left')

print("\nComparison of Team Strength and Championship Probability:")
print(comparison_df.sort_values('Championship_Probability', ascending=False))

# Plot the relationship between team strength and championship probability
plt.figure(figsize=(12, 8))
plt.scatter(comparison_df['Total_Team_Impact'], comparison_df['Championship_Probability'])

for i, row in comparison_df.iterrows():
    plt.annotate(row['Team'], (row['Total_Team_Impact'], row['Championship_Probability']))

plt.xlabel('Total Team Impact Score')
plt.ylabel('Championship Probability (%)')
plt.title('Relationship Between Team Strength and Championship Probability')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('strength_vs_probability.png')
plt.show()


In [None]:

# Part 8: Summary and Conclusion
print("\nPart 8: Summary and Conclusion")
print("="*80)

# Print key findings
print("\nKey Findings from IPL Analysis:")
print(f"1. Predicted Champion for IPL 2025: {predicted_champion} with {championship_probability:.2%} probability")
print(f"2. Top contenders based on player impact: {', '.join(team_strength['Team'].head(3).tolist())}")
print(f"3. Teams with highest playoff probability: {', '.join(list(playoff_prob.keys())[:4])}")

print("\nModel Performance:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

print("\nMost Important Features for Prediction:")
for i, row in feature_importance.head(5).iterrows():
    print(f"{row['Feature']}: {row['Average_Importance']:.4f}")

print("\nTop Players Expected to Make an Impact in IPL 2025:")
for team, players in top_players.items():
    if team == predicted_champion:
        print(f"\nKey Players for Predicted Champion ({team}):")
        for i, player in players.iterrows():
            print(f"- {player['Players']} ({player['Type']}): Impact Score = {player['Impact_Score']:.4f}")

In [None]:

# Handle any remaining missing values
model_data = model_data.fillna(0)

# Part 5: Model Building and Evaluation
print("\nPart 5: Model Building and Evaluation")
print("="*80)

# Split data into features and target
X = model_data.drop('team1_win', axis=1)
y = model_data['team1_win']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\nRandom Forest Model:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, rf_pred))


In [None]:

# Train Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print("\nGradient Boosting Model:")
print(f"Accuracy: {gb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, gb_pred))


In [None]:

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print("\nXGBoost Model:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, xgb_pred))


In [None]:

# Create Ensemble Model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'
)

ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print("\nEnsemble Model:")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, ensemble_pred))


In [None]:

# Build Neural Network Model
def build_nn_model(input_dim):
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_dim=input_dim),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train Neural Network Model
nn_model = build_nn_model(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
nn_pred_prob = nn_model.predict(X_test)
nn_pred = (nn_pred_prob > 0.5).astype(int)
nn_accuracy = accuracy_score(y_test, nn_pred)

print("\nNeural Network Model:")
print(f"Accuracy: {nn_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, nn_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'RF_Importance': rf_model.feature_importances_,
    'GB_Importance': gb_model.feature_importances_,
    'XGB_Importance': xgb_model.feature_importances_
})

feature_importance['Average_Importance'] = feature_importance[['RF_Importance', 'GB_Importance', 'XGB_Importance']].mean(axis=1)
feature_importance = feature_importance.sort_values('Average_Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Average_Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Average across models)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()


In [None]:

# Part 6: 2025 IPL Prediction
print("\nPart 6: 2025 IPL Prediction")
print("="*80)

# Get unique teams in IPL 2025
ipl_2025_teams = players_df['Team'].unique()
print(f"IPL 2025 Teams: {len(ipl_2025_teams)}")
print(ipl_2025_teams)


In [None]:

# Function to prepare prediction data for a match
def prepare_match_prediction_data(team1, team2, venue, features_df, team_stats):
    """Prepare prediction data for a match."""
    # Create prediction data
    pred_data = {}
    
    # Add team strength features
    pred_data['team1_win_pct'] = team_stats.loc[team1, 'Winning_Percentage'] / 100 if team1 in team_stats.index else 0.5
    pred_data['team2_win_pct'] = team_stats.loc[team2, 'Winning_Percentage'] / 100 if team2 in team_stats.index else 0.5
    
    # Add toss win percentage (use average if not available)
    team1_toss_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)]
    team1_toss_wins = features_df[features_df['toss_winner'] == team1]
    
    team2_toss_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)]
    team2_toss_wins = features_df[features_df['toss_winner'] == team2]
    
    pred_data['team1_toss_win_pct'] = len(team1_toss_wins) / len(team1_toss_matches) if len(team1_toss_matches) > 0 else 0.5
    pred_data['team2_toss_win_pct'] = len(team2_toss_wins) / len(team2_toss_matches) if len(team2_toss_matches) > 0 else 0.5
    
    # Add venue advantage (use average if not available)
    team1_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team1) | (features_df['team2'] == team1))]
    team1_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team1)]
    
    team2_venue_matches = features_df[(features_df['venue'] == venue) & 
                                     ((features_df['team1'] == team2) | (features_df['team2'] == team2))]
    team2_venue_wins = features_df[(features_df['venue'] == venue) & (features_df['winner'] == team2)]
    
    pred_data['team1_venue_advantage'] = len(team1_venue_wins) / len(team1_venue_matches) if len(team1_venue_matches) > 0 else 0.5
    pred_data['team2_venue_advantage'] = len(team2_venue_wins) / len(team2_venue_matches) if len(team2_venue_matches) > 0 else 0.5
    
    # Add head-to-head advantage (use 0.5 if not available)
    h2h_matches = features_df[((features_df['team1'] == team1) & (features_df['team2'] == team2)) | 
                             ((features_df['team1'] == team2) & (features_df['team2'] == team1))]
    team1_h2h_wins = h2h_matches[h2h_matches['winner'] == team1]
    
    pred_data['team1_h2h_advantage'] = len(team1_h2h_wins) / len(h2h_matches) if len(h2h_matches) > 0 else 0.5
    pred_data['team2_h2h_advantage'] = 1 - pred_data['team1_h2h_advantage']
    
    # Add recent form (use 0.5 if not available)
    team1_recent_matches = features_df[(features_df['team1'] == team1) | (features_df['team2'] == team1)].tail(5)
    team1_recent_wins = sum(1 for _, match in team1_recent_matches.iterrows() if match['winner'] == team1)
    
    team2_recent_matches = features_df[(features_df['team1'] == team2) | (features_df['team2'] == team2)].tail(5)
    team2_recent_wins = sum(1 for _, match in team2_recent_matches.iterrows() if match['winner'] == team2)
    
    pred_data['team1_recent_form'] = team1_recent_wins / len(team1_recent_matches) if len(team1_recent_matches) > 0 else 0.5
    pred_data['team2_recent_form'] = team2_recent_wins / len(team2_recent_matches) if len(team2_recent_matches) > 0 else 0.5
    
    # Add toss decision (assume batting first)
    pred_data['toss_decision_bat'] = 1
    
    return pd.DataFrame([pred_data])


In [None]:

# Simulate IPL 2025 tournament
def simulate_ipl_2025(fixtures_df, team_stats, features_df, ensemble_model, nn_model):
    """Simulate IPL 2025 tournament and predict the winner."""
    # Create copy of fixtures
    fixtures = fixtures_df.copy()
    
    # Initialize points table
    points_table = pd.DataFrame(index=ipl_2025_teams)
    points_table['Matches'] = 0
    points_table['Wins'] = 0
    points_table['Losses'] = 0
    points_table['Draws'] = 0
    points_table['Points'] = 0
    points_table['NRR'] = 0  # Net Run Rate (simplified)
    
    # Simulate league phase matches
    match_results = []
    
    for _, match in fixtures.iterrows():
        home_team = match['Home']
        away_team = match['Away']
        venue = match['Venue']
        
        # Prepare prediction data
        pred_data = prepare_match_prediction_data(home_team, away_team, venue, features_df, team_stats)
        
        # Make predictions
        ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
        nn_prob = nn_model.predict(pred_data)[0][0]
        
        # Combine predictions (ensemble gets more weight)
        combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
        
        # Determine winner
        if combined_prob > 0.5:
            winner = home_team
            loser = away_team
            win_prob = combined_prob
        else:
            winner = away_team
            loser = home_team
            win_prob = 1 - combined_prob
        
        # Update points table
        points_table.loc[winner, 'Matches'] += 1
        points_table.loc[winner, 'Wins'] += 1
        points_table.loc[winner, 'Points'] += 2
        
        points_table.loc[loser, 'Matches'] += 1
        points_table.loc[loser, 'Losses'] += 1
        
        # Simulate NRR (simplified)
        points_table.loc[winner, 'NRR'] += np.random.uniform(0.05, 0.2)
        points_table.loc[loser, 'NRR'] -= np.random.uniform(0.05, 0.2)
        
        # Record match result
        match_results.append({
            'Match_No': match['Match No'],
            'Home': home_team,
            'Away': away_team,
            'Winner': winner,
            'Win_Probability': win_prob
        })
    
    # Sort points table
    points_table = points_table.sort_values(['Points', 'NRR'], ascending=False)
    
    # Get top 4 teams for playoffs
    playoff_teams = points_table.head(4).index.tolist()
    
    # Simulate Qualifier 1 (1st vs 2nd)
    team1 = playoff_teams[0]
    team2 = playoff_teams[1]
    pred_data = prepare_match_prediction_data(team1, team2, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q1_winner = team1
        q1_loser = team2
    else:
        q1_winner = team2
        q1_loser = team1
    
    # Simulate Eliminator (3rd vs 4th)
    team3 = playoff_teams[2]
    team4 = playoff_teams[3]
    pred_data = prepare_match_prediction_data(team3, team4, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        eliminator_winner = team3
        eliminator_loser = team4
    else:
        eliminator_winner = team4
        eliminator_loser = team3
    
    # Simulate Qualifier 2 (Q1 loser vs Eliminator winner)
    pred_data = prepare_match_prediction_data(q1_loser, eliminator_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        q2_winner = q1_loser
    else:
        q2_winner = eliminator_winner
    
    # Simulate Final (Q1 winner vs Q2 winner)
    pred_data = prepare_match_prediction_data(q1_winner, q2_winner, 'Neutral', features_df, team_stats)
    ensemble_prob = ensemble_model.predict_proba(pred_data)[0][1]
    nn_prob = nn_model.predict(pred_data)[0][0]
    combined_prob = 0.7 * ensemble_prob + 0.3 * nn_prob
    
    if combined_prob > 0.5:
        champion = q1_winner
        runner_up = q2_winner
        final_win_prob = combined_prob
    else:
        champion = q2_winner
        runner_up = q1_winner
        final_win_prob = 1 - combined_prob
    
    return points_table, playoff_teams, champion, runner_up, final_win_prob, match_results

# Simulate IPL 2025 tournament multiple times
num_simulations = 100
simulation_results = []

for sim in range(num_simulations):
    points_table, playoff_teams, champion, runner_up, final_win_prob, match_results = simulate_ipl_2025(
        fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)
    
    simulation_results.append({
        'Champion': champion,
        'Runner_Up': runner_up,
        'Final_Win_Prob': final_win_prob,
        'Top_4': playoff_teams
    })

# Count championship wins for each team
champion_counts = {}
for result in simulation_results:
    champion = result['Champion']
    champion_counts[champion] = champion_counts.get(champion, 0) + 1

# Count top 4 appearances for each team
top4_counts = {}
for result in simulation_results:
    for team in result['Top_4']:
        top4_counts[team] = top4_counts.get(team, 0) + 1

# Calculate championship probability
champion_prob = {team: count / num_simulations for team, count in champion_counts.items()}
champion_prob = dict(sorted(champion_prob.items(), key=lambda x: x[1], reverse=True))

# Calculate playoff probability
playoff_prob = {team: count / num_simulations for team, count in top4_counts.items()}
playoff_prob = dict(sorted(playoff_prob.items(), key=lambda x: x[1], reverse=True))


In [None]:

print("\nChampionship Probability:")
for team, prob in champion_prob.items():
    print(f"{team}: {prob:.2%}")

print("\nPlayoff Probability:")
for team, prob in playoff_prob.items():
    print(f"{team}: {prob:.2%}")


In [None]:

# Plot championship probability
plt.figure(figsize=(14, 8))
plt.bar(champion_prob.keys(), [prob * 100 for prob in champion_prob.values()], color='gold')
plt.xlabel('Teams')
plt.ylabel('Championship Probability (%)')
plt.title('IPL 2025 Championship Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('championship_probability.png')
plt.show()


In [None]:

# Plot playoff probability
plt.figure(figsize=(14, 8))
plt.bar(playoff_prob.keys(), [prob * 100 for prob in playoff_prob.values()], color='blue')
plt.xlabel('Teams')
plt.ylabel('Playoff Probability (%)')
plt.title('IPL 2025 Playoff Probability')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('playoff_probability.png')
plt.show()


In [None]:

# Predict the IPL 2025 winner
predicted_champion = max(champion_prob.items(), key=lambda x: x[1])[0]
championship_probability = champion_prob[predicted_champion]

print(f"\n🏆 Predicted IPL 2025 Champion: {predicted_champion} with {championship_probability:.2%} probability")

# Output final points table from a single simulation
final_points_table, _, _, _, _, _ = simulate_ipl_2025(
    fixtures_df_clean, team_stats, features_df, ensemble_model, nn_model)

print("\nPredicted Final Points Table:")
print(final_points_table[['Matches', 'Wins', 'Losses', 'Points', 'NRR']].sort_values(['Points', 'NRR'], ascending=False))


In [None]:

# Part 7: Player Impact Analysis for 2025
print("\nPart 7: Player Impact Analysis for 2025")
print("="*80)

# Analyze 2025 player impact
def analyze_player_impact(players_df, alldata_df):
    """Analyze player impact for 2025 season."""
    # Merge player data with team
    player_impact = pd.merge(players_df, alldata_df, left_on='Players', right_on='Player_Name', how='left')
    
    # Fill missing values
    numeric_cols = ['Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Bowling_Strike_Rate', 'Bowling_Average']
    player_impact[numeric_cols] = player_impact[numeric_cols].fillna(0)
    
    # Create impact score for batsmen
    player_impact['Batting_Impact'] = (player_impact['Batting_Average'] * 0.4 + 
                                      player_impact['Batting_Strike_Rate'] * 0.6) / 100
    
    # Create impact score for bowlers (lower is better for economy and average)
    player_impact['Bowling_Impact'] = 0
    bowler_mask = player_impact['Bowling_Average'] > 0
    
    if bowler_mask.any():
        max_econ = player_impact.loc[bowler_mask, 'Economy_Rate'].max()
        max_avg = player_impact.loc[bowler_mask, 'Bowling_Average'].max()
        
        player_impact.loc[bowler_mask, 'Bowling_Impact'] = (
            (1 - player_impact.loc[bowler_mask, 'Economy_Rate'] / max_econ) * 0.4 + 
            (1 - player_impact.loc[bowler_mask, 'Bowling_Average'] / max_avg) * 0.6
        )
    
    # Create overall impact score based on player type
    player_impact['Impact_Score'] = 0
    
    # Batsmen
    batsmen_mask = player_impact['Type'] == 'Batsman'
    player_impact.loc[batsmen_mask, 'Impact_Score'] = player_impact.loc[batsmen_mask, 'Batting_Impact']
    
    # Bowlers
    bowler_mask = player_impact['Type'] == 'Bowler'
    player_impact.loc[bowler_mask, 'Impact_Score'] = player_impact.loc[bowler_mask, 'Bowling_Impact']
    
    # All-rounders (weighted average)
    allrounder_mask = player_impact['Type'] == 'All-Rounder'
    player_impact.loc[allrounder_mask, 'Impact_Score'] = (
        player_impact.loc[allrounder_mask, 'Batting_Impact'] * 0.5 + 
        player_impact.loc[allrounder_mask, 'Bowling_Impact'] * 0.5
    )
    
    # Calculate team strength based on player impact
    team_strength = player_impact.groupby('Team')['Impact_Score'].agg(['mean', 'sum']).reset_index()
    team_strength.columns = ['Team', 'Average_Player_Impact', 'Total_Team_Impact']
    team_strength = team_strength.sort_values('Total_Team_Impact', ascending=False)
    
    # Get top players for each team
    top_players = {}
    for team in player_impact['Team'].unique():
        team_players = player_impact[player_impact['Team'] == team].sort_values('Impact_Score', ascending=False)
        top_players[team] = team_players.head(3)
    
    return player_impact, team_strength, top_players

# Analyze player impact
player_impact, team_strength, top_players = analyze_player_impact(players_df, alldata_df_clean)

print("\nTeam Strength Based on Player Impact:")
print(team_strength)

print("\nTop Players for Each Team:")
for team, players in top_players.items():
    print(f"\n{team}:")
    print(players[['Players', 'Type', 'Batting_Average', 'Batting_Strike_Rate', 'Economy_Rate', 'Impact_Score']])

# Plot team strength
plt.figure(figsize=(14, 8))
plt.bar(team_strength['Team'], team_strength['Total_Team_Impact'], color='purple')
plt.xlabel('Teams')
plt.ylabel('Total Team Impact Score')
plt.title('IPL 2025 Team Strength Based on Player Impact')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('team_strength.png')
plt.show()


In [None]:

# Compare team strength with championship probability
comparison_df = pd.DataFrame({
    'Team': list(champion_prob.keys()),
    'Championship_Probability': [prob * 100 for prob in champion_prob.values()]
})
comparison_df = pd.merge(comparison_df, team_strength, on='Team', how='left')

print("\nComparison of Team Strength and Championship Probability:")
print(comparison_df.sort_values('Championship_Probability', ascending=False))

# Plot the relationship between team strength and championship probability
plt.figure(figsize=(12, 8))
plt.scatter(comparison_df['Total_Team_Impact'], comparison_df['Championship_Probability'])

for i, row in comparison_df.iterrows():
    plt.annotate(row['Team'], (row['Total_Team_Impact'], row['Championship_Probability']))

plt.xlabel('Total Team Impact Score')
plt.ylabel('Championship Probability (%)')
plt.title('Relationship Between Team Strength and Championship Probability')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('strength_vs_probability.png')
plt.show()


In [None]:

# Part 8: Summary and Conclusion
print("\nPart 8: Summary and Conclusion")
print("="*80)

# Print key findings
print("\nKey Findings from IPL Analysis:")
print(f"1. Predicted Champion for IPL 2025: {predicted_champion} with {championship_probability:.2%} probability")
print(f"2. Top contenders based on player impact: {', '.join(team_strength['Team'].head(3).tolist())}")
print(f"3. Teams with highest playoff probability: {', '.join(list(playoff_prob.keys())[:4])}")

print("\nModel Performance:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")
print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

print("\nMost Important Features for Prediction:")
for i, row in feature_importance.head(5).iterrows():
    print(f"{row['Feature']}: {row['Average_Importance']:.4f}")

print("\nTop Players Expected to Make an Impact in IPL 2025:")
for team, players in top_players.items():
    if team == predicted_champion:
        print(f"\nKey Players for Predicted Champion ({team}):")
        for i, player in players.iterrows():
            print(f"- {player['Players']} ({player['Type']}): Impact Score = {player['Impact_Score']:.4f}")