In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')



In [None]:

# Notebook-wide plot appearance.
# Order matters: the stylesheet is applied first, then the seaborn palette,
# then the explicit rcParams overrides on top of both.
plt.style.use('fivethirtyeight')
sns.set_palette('bright')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})


In [None]:

## **DATA COLLECTION**

# 1. Data Loading
# Both CSV files are read by relative path, i.e. from the kernel's working directory.
print("Loading IPL datasets...")
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

# Report the (rows, columns) shape of each frame, one line per dataset.
print(
    f"Matches dataset shape: {matches.shape}",
    f"Deliveries dataset shape: {deliveries.shape}",
    sep="\n",
)

# Preview the first rows of the matches frame; as the cell's last expression
# it is rendered by the notebook.
print("\nSample data from matches dataset:")
matches.head(5)


In [None]:

## **DATA PRE-PROCESSING**


In [None]:
### 2. Data Cleaning and Preprocessing
print("\nChecking for missing values in matches dataset:")
print(matches.isnull().sum())

# Handle missing values.
# The frame-level fillna result is assigned back instead of calling
# `matches[col].fillna(..., inplace=True)`: that chained in-place form works on
# a temporary Series, is deprecated in pandas 2.x, and leaves `matches`
# unchanged once Copy-on-Write is active. Columns not listed here are untouched.
missing_value_defaults = {
    'city': 'Unknown',
    'player_of_match': 'Unknown',
    'winner': 'No Result',   # placeholder later code can filter on
    'result_margin': 0,
}
matches = matches.fillna(missing_value_defaults)

# Standardize team names: map each old franchise name to the name used for it
# throughout the rest of the notebook.
team_name_mapping = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Rising Pune Supergiants': 'Rising Pune Supergiant',
    'Kings XI Punjab': 'Punjab Kings'
}

# Apply the whole mapping in one replace per frame, limited to the columns
# that hold team names.
matches_team_columns = ['team1', 'team2', 'toss_winner', 'winner']
deliveries_team_columns = ['batting_team', 'bowling_team']
matches[matches_team_columns] = matches[matches_team_columns].replace(team_name_mapping)
deliveries[deliveries_team_columns] = deliveries[deliveries_team_columns].replace(team_name_mapping)

print("\nAfter cleaning, missing values in matches dataset:")
print(matches.isnull().sum())


In [None]:

## **EXPLORATORY DATA ANALYSIS (EDA)**


In [None]:

# pandas' `DataFrame.style` depends on jinja2; importing it here presumably
# surfaces a missing install before any styled table is built.
import jinja2

# 3.1 Team Performance Analysis
print("\n--------------------------------------- Team Performance Analysis ---------------------------------------")

# Matches played by each team: appearances as team1 plus appearances as team2.
team1_counts = matches['team1'].value_counts()
team2_counts = matches['team2'].value_counts()
team_matches = team1_counts.add(team2_counts, fill_value=0).astype(int)

# Matches won by each team.
# Missing winners are filled with the 'No Result' placeholder during cleaning;
# it is excluded here so it does not show up as a team in the table and chart.
team_wins = matches.loc[matches['winner'] != 'No Result', 'winner'].value_counts()

# Create a dataframe for team performance.
# Teams without any win get 0 (not NaN) so their win percentage is 0.0.
team_performance = pd.DataFrame({
    'Matches_Played': team_matches,
    'Matches_Won': team_wins.reindex(team_matches.index, fill_value=0)
})

team_performance['Win_Percentage'] = (
    team_performance['Matches_Won'] / team_performance['Matches_Played'] * 100
).round(2)
team_performance = team_performance.sort_values('Win_Percentage', ascending=False)

print("\nTeam Performance:")
# A bare expression is only rendered when it is the last statement of a cell,
# so the styled table is shown explicitly. `display` is available without an
# import inside a Jupyter/IPython kernel.
display(team_performance.style.background_gradient(cmap='gnuplot'))


# Plot matches won by teams
plt.figure(figsize=(10, 5))
plt.bar(team_wins.index, team_wins.values, color='blueviolet')
plt.title('Matches Won by Teams', fontsize=12)
plt.xlabel('Teams', fontsize=12)
plt.ylabel('Number of Matches Won', fontsize=10)
plt.xticks(rotation=40, ha='right', fontsize=10)
plt.tight_layout()
plt.savefig('matches_won_by_teams.png')
plt.show()


In [None]:

# Calculate run rates and economy rates.
# Totals come from groupby aggregations instead of a row-by-row `iterrows()`
# loop; the per-team sums and counts are the same.
# NOTE(review): every row of `deliveries` counts as one ball here, so any
# wides/no-balls in the data are presumably included in the ball counts - confirm.
batting_groups = deliveries.groupby('batting_team')['total_runs']
bowling_groups = deliveries.groupby('bowling_team')['total_runs']

runs_scored = batting_groups.sum()
balls_faced = batting_groups.size()
runs_conceded = bowling_groups.sum()
balls_bowled = bowling_groups.size()

# Per-team dictionaries kept under their original names for any later cell
# that reads them.
team_runs = runs_scored.to_dict()
team_balls_faced = balls_faced.to_dict()
team_runs_conceded = runs_conceded.to_dict()
team_balls_bowled = balls_bowled.to_dict()

# Calculate run rate and economy rate (runs per six balls), indexed by every
# team that appears as a batting or bowling side.
teams_union = sorted(set(team_runs).union(team_runs_conceded))
team_stats = pd.DataFrame(index=teams_union)
team_stats['Run_Rate'] = (runs_scored * 6 / balls_faced).round(2)
team_stats['Economy_Rate'] = (runs_conceded * 6 / balls_bowled).round(2)

print("\nTeam Run Rates and Economy Rates (fixed):")
# Mid-cell expressions are not rendered by the notebook, so display explicitly
# (`display` is available without an import inside a Jupyter/IPython kernel).
display(team_stats.style.background_gradient(cmap='ocean'))

# Plot run rates and economy rates.
# DataFrame.plot creates its own figure; a separate plt.figure() call would
# only leave an empty extra figure in the output.
ax = team_stats.plot(kind='bar', figsize=(12, 7))
ax.set_title('Team Run Rates vs Economy Rates', fontsize=14)
ax.set_xlabel('Teams', fontsize=15)
ax.set_ylabel('Rate', fontsize=15)
plt.xticks(rotation=40, ha='right')
ax.legend(['Run Rate (Batting)', 'Economy Rate (Bowling)'])
plt.tight_layout()
plt.savefig('team_run_rates_economy.png')
plt.show()


In [None]:

# Calculate highest and lowest scores for each team.
# One grouped pass replaces the per-match boolean filtering, which re-scanned
# the whole deliveries frame for every match id.

# Only deliveries whose match id is listed in `matches` are considered.
known_match_deliveries = deliveries[deliveries['match_id'].isin(matches['id'])]

# Total runs per (match, batting team).
# NOTE(review): this sums every delivery a team batted in a match; if the data
# has super-over or shortened innings they are presumably part of these
# totals - confirm.
match_team_totals = (
    known_match_deliveries
    .groupby(['match_id', 'batting_team'])['total_runs']
    .sum()
)

# Create dataframe for highest and lowest scores (one row per team).
team_scores = (
    match_team_totals
    .groupby(level='batting_team')
    .agg(Highest_Score='max', Lowest_Score='min')
    .rename_axis(None)
)

# Dictionaries kept under their original names for any later cell that reads them.
team_highest_scores = team_scores['Highest_Score'].to_dict()
team_lowest_scores = team_scores['Lowest_Score'].to_dict()

print("\nTeam Highest and Lowest Scores:")
team_scores.style.background_gradient(cmap='Greens')


In [None]:


# Plot highest and lowest scores.
# DataFrame.plot creates its own figure, so no plt.figure() call precedes it;
# that call would only leave an empty extra figure in the output.
ax = team_scores.plot(kind='bar', figsize=(10, 6))
ax.set_title('Team Highest and Lowest Scores', fontsize=16)
ax.set_xlabel('Teams', fontsize=14)
ax.set_ylabel('Score', fontsize=14)
plt.xticks(rotation=40, ha='right', fontsize=10)
ax.legend(['Highest Score', 'Lowest Score'])
plt.tight_layout()
plt.savefig('team_highest_lowest_scores.png')
plt.show()



In [None]:

# Calculate total 4s and 6s for each team.
# A delivery counts as a four/six when `batsman_runs` equals 4/6; the counts
# are summed per batting team in one vectorised pass instead of `iterrows()`.
runs_off_bat = deliveries['batsman_runs']

# Create dataframe for 4s and 6s (one row per batting team).
team_boundaries = (
    pd.DataFrame({
        'batting_team': deliveries['batting_team'],
        'Fours': runs_off_bat.eq(4),
        'Sixes': runs_off_bat.eq(6),
    })
    .groupby('batting_team')[['Fours', 'Sixes']]
    .sum()
    .astype(int)
    .rename_axis(None)
)

# Dictionaries kept under their original names for any later cell that reads them.
team_fours = team_boundaries['Fours'].to_dict()
team_sixes = team_boundaries['Sixes'].to_dict()

print("\nTeam Boundaries (4s and 6s):")
team_boundaries.style.background_gradient(cmap='Spectral')



In [None]:

# Plot 4s and 6s.
# DataFrame.plot creates its own figure, so no plt.figure() call precedes it.
ax = team_boundaries.plot(kind='barh', figsize=(10, 6))
ax.set_title('Team Boundaries (4s and 6s)', fontsize=18)
# Horizontal bars: the counts run along the x-axis and the teams sit on the
# y-axis, so the axis labels follow that orientation.
ax.set_xlabel('Count', fontsize=16)
ax.set_ylabel('Teams', fontsize=16)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.legend(['Fours', 'Sixes'])
plt.tight_layout()
plt.savefig('team_boundaries.png')
plt.show()

# Calculate powerplay and death overs scores
def is_powerplay(over, first_over=1):
    """Return whether `over` is one of the first six overs of an innings.

    Args:
        over: Over number of a delivery. Anything supporting `<=` works, so a
            pandas Series yields an element-wise boolean Series.
        first_over: Number the data uses for the first over of an innings.
            The default of 1 keeps the original `over <= 6` behaviour; pass 0
            for zero-based over numbering.

    Returns:
        True (or a boolean Series) when the over lies in the first six overs.
    """
    return over <= first_over + 5


def is_death_over(over, first_over=1):
    """Return whether `over` is the 16th over of an innings or later.

    Args:
        over: Over number of a delivery. Anything supporting `>=` works, so a
            pandas Series yields an element-wise boolean Series.
        first_over: Number the data uses for the first over of an innings.
            The default of 1 keeps the original `over >= 16` behaviour; pass 0
            for zero-based over numbering.

    Returns:
        True (or a boolean Series) when the over is the 16th or later.
    """
    return over >= first_over + 15

# Per-team runs and balls in the powerplay and death-over phases.
#
# The phase helpers use 1-based thresholds (`over <= 6`, `over >= 16`), which
# matches the 36-ball / 30-ball scaling below only when the first over is
# numbered 1. The over column is therefore shifted so its smallest value maps
# to over 1 before the helpers are applied.
# NOTE(review): assumes the smallest value in `deliveries['over']` is the first
# over of an innings (0 or 1 depending on the data source) - confirm.
first_over = int(deliveries['over'].min())
over_number = deliveries['over'] - first_over + 1

POWERPLAY_BALLS = 36   # 6 overs x 6 balls
DEATH_BALLS = 30       # 5 overs x 6 balls

teams = sorted(deliveries['batting_team'].dropna().unique())

powerplay_groups = deliveries[is_powerplay(over_number)].groupby('batting_team')['total_runs']
death_groups = deliveries[is_death_over(over_number)].groupby('batting_team')['total_runs']

# Every batting team gets an entry, with 0 when it has no delivery in a phase.
powerplay_runs = powerplay_groups.sum().reindex(teams, fill_value=0)
powerplay_balls = powerplay_groups.size().reindex(teams, fill_value=0)
death_runs = death_groups.sum().reindex(teams, fill_value=0)
death_balls = death_groups.size().reindex(teams, fill_value=0)

# Dictionaries kept under their original names for any later cell that reads them.
team_powerplay_runs = powerplay_runs.to_dict()
team_powerplay_balls = powerplay_balls.to_dict()
team_death_runs = death_runs.to_dict()
team_death_balls = death_balls.to_dict()

# Calculate average powerplay and death overs score: runs per ball scaled to
# the length of the phase. Teams with no balls in a phase get 0 (the zero
# count is masked to NaN before dividing, then filled back with 0).
team_phase_scores = pd.DataFrame({
    'Avg_Powerplay_Score': (
        powerplay_runs * POWERPLAY_BALLS / powerplay_balls.where(powerplay_balls > 0)
    ).fillna(0).round(2),
    'Avg_Death_Score': (
        death_runs * DEATH_BALLS / death_balls.where(death_balls > 0)
    ).fillna(0).round(2),
}, index=teams).rename_axis(None)

print("\nTeam Average Powerplay and Death Overs Score:")
# Mid-cell expressions are not rendered by the notebook, so display explicitly
# (`display` is available without an import inside a Jupyter/IPython kernel).
display(team_phase_scores.style.background_gradient(cmap='summer'))

# Plot average powerplay and death overs score.
# DataFrame.plot creates its own figure, so no plt.figure() call precedes it.
ax = team_phase_scores.plot(kind='bar', figsize=(10, 7))
ax.set_title('Team Average Powerplay and Death Overs Score', fontsize=16)
ax.set_xlabel('Teams', fontsize=14)
ax.set_ylabel('Average Score', fontsize=14)
plt.xticks(rotation=40, ha='right', fontsize=9)
ax.legend(['Avg Powerplay Score', 'Avg Death Overs Score'])
plt.tight_layout()
plt.savefig('team_phase_scores.png')
plt.show()


In [None]:

# 3.2 Player Performance Analysis
print("\n--- Player Performance Analysis ---")

# Calculate runs scored by each batsman (sum of `batsman_runs` per batter),
# highest first.
batsman_runs = deliveries.groupby('batter')['batsman_runs'].sum().sort_values(ascending=False)
top_run_scorers = batsman_runs.head(20)

print("\nTop 20 Run Scorers:")
# A bare expression is only rendered when it is the last statement of a cell,
# so the styled table is shown explicitly. `display` is available without an
# import inside a Jupyter/IPython kernel.
display(top_run_scorers.to_frame(name='Runs').style.background_gradient(cmap='viridis'))

# Plot top 20 run scorers
plt.figure(figsize=(14, 7))
plt.bar(top_run_scorers.index, top_run_scorers.values, color='darkorange')
plt.title('Top 20 Run Scorers in IPL', fontsize=16)
plt.xlabel('Batsman', fontsize=14)
plt.ylabel('Runs', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_run_scorers.png')
plt.show()


In [None]:

# Calculate batting average and strike rate for top run scorers.
# NOTE(review): dismissals are counted as wicket deliveries on which the
# player was the `batter`; if the data can record a wicket falling to someone
# other than the batter on that ball, this attribution is approximate - confirm.
# NOTE(review): balls faced counts every delivery row for the batter, so any
# wides/no-balls in the data are presumably included - confirm.
batsman_dismissals = deliveries[deliveries['is_wicket'] == 1].groupby('batter').size()
batsman_balls_faced = deliveries.groupby('batter').size()

batsman_stats = pd.DataFrame({
    'Runs': batsman_runs,
    'Dismissals': batsman_dismissals,
    'Balls_Faced': batsman_balls_faced
})

# Batters with no recorded dismissal have NaN dismissals; their average falls
# back to their total runs.
runs_per_dismissal = batsman_stats['Runs'] / batsman_stats['Dismissals']
batsman_stats['Batting_Average'] = runs_per_dismissal.fillna(batsman_stats['Runs']).round(2)
batsman_stats['Strike_Rate'] = (batsman_stats['Runs'] / batsman_stats['Balls_Faced'] * 100).round(2)

# Filter for top 20 run scorers
top_batsman_stats = batsman_stats.loc[top_run_scorers.index]

print("\nBatting Average and Strike Rate for Top 20 Run Scorers:")
# Mid-cell expressions are not rendered by the notebook, so display explicitly
# (`display` is available without an import inside a Jupyter/IPython kernel).
display(
    top_batsman_stats[['Batting_Average', 'Strike_Rate']]
    .style
    .format({'Batting_Average': '{:.2f}', 'Strike_Rate': '{:.2f}'})
    .background_gradient(cmap='cividis')
)

# Plot batting average vs strike rate for top 20 run scorers
plt.figure(figsize=(12, 8))
plt.scatter(top_batsman_stats['Strike_Rate'], top_batsman_stats['Batting_Average'], s=120, alpha=0.7)

# Label each point with the batter's name.
for batsman, strike_rate, batting_average in zip(
    top_batsman_stats.index,
    top_batsman_stats['Strike_Rate'],
    top_batsman_stats['Batting_Average'],
):
    plt.annotate(batsman, (strike_rate, batting_average), fontsize=9)

plt.title('Batting Average vs Strike Rate for Top 20 Run Scorers', fontsize=16)
plt.xlabel('Strike Rate', fontsize=14)
plt.ylabel('Batting Average', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('batting_avg_vs_strike_rate.png')
plt.show()


In [None]:

# Find highest average and strike rate for players with >50 matches.
# First, count matches played by each player: the number of distinct match ids
# in which the player appears as `batter`. One grouped pass replaces filtering
# the whole deliveries frame once per player.
matches_per_batter = deliveries.groupby('batter')['match_id'].nunique()
player_matches = matches_per_batter.to_dict()  # kept for any later cell that reads it

batsman_stats['Matches_Played'] = matches_per_batter

# Filter for players with >50 matches
qualified_players = batsman_stats[batsman_stats['Matches_Played'] > 50]

if qualified_players.empty:
    # idxmax raises on an empty selection; report it instead of failing the cell.
    highest_avg_player = highest_sr_player = None
    print("\nNo players with more than 50 matches found.")
else:
    highest_avg_player = qualified_players.loc[qualified_players['Batting_Average'].idxmax()]
    highest_sr_player = qualified_players.loc[qualified_players['Strike_Rate'].idxmax()]

    print(f"\nPlayer with Highest Average (>50 matches): {highest_avg_player.name} - {highest_avg_player['Batting_Average']}")
    print(f"Player with Highest Strike Rate (>50 matches): {highest_sr_player.name} - {highest_sr_player['Strike_Rate']}")


# Calculate wickets taken by each bowler: wicket deliveries per bowler, highest first.
# NOTE(review): every wicket delivery is credited to the bowler; if the data
# includes dismissals not normally credited to a bowler, they are counted
# here too - confirm.
bowler_wickets = deliveries[deliveries['is_wicket'] == 1].groupby('bowler').size().sort_values(ascending=False)
top_wicket_takers = bowler_wickets.head(20)

print("\nTop 20 Wicket Takers:")
# Mid-cell expressions are not rendered by the notebook, so display explicitly
# (`display` is available without an import inside a Jupyter/IPython kernel).
display(top_wicket_takers.to_frame(name='Wickets').style.background_gradient(cmap='viridis'))

# Plot top wicket takers
plt.figure(figsize=(14, 8))
plt.bar(top_wicket_takers.index, top_wicket_takers.values, color='purple')
plt.title('Top 20 Wicket Takers in IPL', fontsize=16)
plt.xlabel('Bowler', fontsize=14)
plt.ylabel('Wickets', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('top_wicket_takers.png')
plt.show()



In [None]:

# Find highest individual scores: runs off the bat per (match, batter).
batsman_match_runs = deliveries.groupby(['match_id', 'batter'])['batsman_runs'].sum().reset_index()
print("\nTop 20 Highest Individual Scores:")
# Select only the desired columns
highest_scores = batsman_match_runs.sort_values('batsman_runs', ascending=False).head(20)[['batter', 'batsman_runs']]
# Mid-cell expressions are not rendered by the notebook, so display explicitly
# (`display` is available without an import inside a Jupyter/IPython kernel).
display(highest_scores.style.background_gradient(cmap='viridis'))

# Plot highest individual scores.
# Bars are placed by position rather than by batter name: the same batter can
# appear more than once in the top 20, and matplotlib puts identical category
# labels at the same x position, which would stack those bars on top of each
# other. Names are applied as tick labels instead.
bar_positions = range(len(highest_scores))
plt.figure(figsize=(14, 8))
plt.bar(bar_positions, highest_scores['batsman_runs'], color='yellowgreen')
plt.title('Top 20 Highest Individual Scores in IPL', fontsize=16)
plt.xlabel('Batsman', fontsize=14)
plt.ylabel('Runs', fontsize=14)
plt.xticks(bar_positions, highest_scores['batter'], rotation=40, ha='right')
plt.tight_layout()
plt.savefig('highest_individual_scores.png')
plt.show()


In [None]:

# Man of the Match analysis: the 20 most frequent values of `player_of_match`.
mom_counts = matches['player_of_match'].value_counts().iloc[:20]

print("\nTop 20 Players with Most Man of the Match Awards:", mom_counts, sep="\n")

# Bar chart of the award counts, one bar per player.
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(mom_counts.index, mom_counts.values, color='brown')
ax.set_title('Top 20 Players with Most Man of the Match Awards', fontsize=16)
ax.set_xlabel('Player', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig('mom_counts.png')
plt.show()
