In [None]:
# IPL 2024 Data Analysis - Exploratory Data Analysis (EDA)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the match-level CSV (an absolute, Colab-style path).
DATA_PATH = '/content/ipl_2024_matches.csv'

# Read the file into the working frame that every later cell uses.
df = pd.read_csv(DATA_PATH)

In [None]:
# Initial data inspection: report the frame's dimensions, then show its
# leading rows with the notebook's rich renderer.
preview_rows = df.head(5)
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
display(preview_rows)

In [None]:
# Structural and statistical summary of the loaded frame.
print("\nData Summary:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in display() would render a stray "None".
df.info()
# include='all' makes describe() summarise non-numeric columns as well.
display(df.describe(include='all'))

In [None]:
# Data Cleaning
# Parse the 'date' column with pandas and store the parsed values back in place.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Handle missing values
# Missing margins are replaced with 0.
# NOTE(review): a 0 margin on a match that does have a winner will pull the
# average-margin figures reported later downwards - confirm this is intended.
df['margin'] = df['margin'].fillna(0)
# Keep only matches with a recorded winner. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following
# cells do not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['winning_team']).copy()

In [None]:
# Feature Engineering
# Row masks for the two result types and for "toss winner also won the match".
is_run_win = df['won_by'].eq('runs')
is_wicket_win = df['won_by'].eq('wickets')
toss_winner_won = df['toss_winner'].eq(df['winning_team'])

# Margin split into two columns: each carries the margin only for its own
# result type and 0 everywhere else.
df['win_by_runs'] = np.where(is_run_win, df['margin'], 0)
df['win_by_wickets'] = np.where(is_wicket_win, df['margin'], 0)
# 'Yes' / 'No' flag used by the toss-impact plots and insights.
df['toss_win_match_win'] = np.where(toss_winner_won, 'Yes', 'No')


In [None]:
# 1. Univariate Analysis
# All four panels are drawn in a single cell on one explicit 2x2 grid. With the
# inline backend a figure is rendered and closed when its cell finishes, so a
# plt.figure() in one cell is not the figure that a plt.subplot() in the next
# cell draws on; splitting the panels across cells loses the grid and figsize.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Team Performance: wins per team, ordered by win count (highest first).
ax_wins = axes[0, 0]
sns.countplot(y='winning_team', data=df,
              order=df['winning_team'].value_counts().index, ax=ax_wins)
ax_wins.set_title('Total Wins by Team')
ax_wins.set_xlabel('Number of Wins')

# Win Type Analysis: share of each 'won_by' value.
ax_result = axes[0, 1]
df['won_by'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_result)
ax_result.set_title('Match Results Distribution')
ax_result.set_ylabel('')  # drop the automatic y-label pandas puts on pie charts

# Toss Decision Preference: how often each toss decision was taken.
ax_toss = axes[1, 0]
sns.countplot(x='toss_decision', data=df, ax=ax_toss)
ax_toss.set_title('Toss Decision Preference')
ax_toss.set_xlabel('Decision')
ax_toss.set_ylabel('Count')

# Player of Match Awards: the ten most frequent award winners.
ax_awards = axes[1, 1]
top_players = df['player_of_the_match'].value_counts().head(10)
sns.barplot(x=top_players.values, y=top_players.index, ax=ax_awards)
ax_awards.set_title('Top 10 Player of Match Awards')
ax_awards.set_xlabel('Awards Count')

fig.tight_layout()
plt.show()

In [None]:
# 2. Bivariate Analysis
# The four panels share one explicit 2x2 grid created and filled in a single
# cell: the inline backend closes a figure when its cell finishes, so panels
# added from later cells would land on fresh, default-sized figures.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Win Type by Venue: result-type counts for every venue.
ax_venue = axes[0, 0]
sns.countplot(y='venue', hue='won_by', data=df, ax=ax_venue)
ax_venue.set_title('Win Type by Venue')
ax_venue.set_xlabel('Count')
ax_venue.set_ylabel('Venue')

# Toss Decision vs Match Result: did the toss winner go on to win?
ax_toss = axes[0, 1]
sns.countplot(x='toss_decision', hue='toss_win_match_win', data=df, ax=ax_toss)
ax_toss.set_title('Toss Decision vs Match Outcome')
ax_toss.set_xlabel('Toss Decision')
ax_toss.set_ylabel('Count')
ax_toss.legend(title='Toss Winner Won Match?')

# Average Score by Venue (venue_scores is reused by the insights section).
ax_scores = axes[1, 0]
venue_scores = df.groupby('venue')[['innings1_score', 'innings2_score']].mean()
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# would leave this panel empty - so the target Axes is passed explicitly.
venue_scores.plot(kind='bar', ax=ax_scores)
ax_scores.set_title('Average Scores by Venue')
ax_scores.set_xlabel('Venue')
ax_scores.set_ylabel('Average Score')
ax_scores.legend(['1st Innings', '2nd Innings'])

# Win Margin Distribution: margin spread per result type.
ax_margin = axes[1, 1]
sns.boxplot(x='won_by', y='margin', data=df, ax=ax_margin)
ax_margin.set_title('Win Margin Distribution')
ax_margin.set_xlabel('Win Type')
ax_margin.set_ylabel('Margin')

fig.tight_layout()
plt.show()

In [None]:
# 3. Multivariate Analysis
# Correlation Heatmap over the score, wicket and margin columns.
numeric_cols = [
    'innings1_score', 'innings1_wickets',
    'innings2_score', 'innings2_wickets',
    'margin', 'win_by_runs', 'win_by_wickets',
]
# Pairwise correlations (DataFrame.corr default method) of the selected columns.
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# annot=True writes each coefficient into its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Pairplot for Key Numerical Features
pairplot_cols = ['innings1_score', 'innings2_score', 'margin', 'win_by_runs']
key_metrics = df[pairplot_cols]
sns.pairplot(key_metrics)
# y=1.02 positions the figure-level title slightly above the top edge of the
# figure rather than inside it.
plt.suptitle('Pairplot of Key Match Metrics', y=1.02)
plt.show()

In [None]:
# 4. Time Series Analysis
# Wins Over Time
# 'date' was already parsed to datetime in the data-cleaning cell, so it is not
# converted a second time here.
fig, ax = plt.subplots(figsize=(12, 6))

# Wins per team on each match date; date/team pairs without a win become 0.
match_count = df.groupby(['date', 'winning_team']).size().unstack().fillna(0)

# DataFrame.plot creates its own figure unless given an Axes, which would leave
# the (12, 6) figure above blank - so the running totals are drawn on `ax`.
match_count.cumsum().plot(ax=ax)
ax.set_title('Cumulative Wins Over Tournament')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Wins')
ax.legend(title='Team')
plt.show()

In [None]:
# 5. Key Insights Summary
# Banner: a blank line, a 50-character rule, the heading, then the rule again.
rule = "=" * 50
print("\n" + rule)
print("KEY INSIGHTS SUMMARY")
print(rule)

In [None]:
# Insight 1: Toss Impact
# Percentage share of each value ('Yes' / 'No') of the toss_win_match_win flag.
toss_win_rate = df['toss_win_match_win'].value_counts(normalize=True) * 100
# value_counts only lists values that occur, so 'Yes' is absent when no toss
# winner won a match; .get() reports 0% in that case instead of raising KeyError.
toss_yes_pct = toss_win_rate.get('Yes', 0.0)
print(f"\n1. Toss Impact: {toss_yes_pct:.1f}% of matches were won by toss winners")

In [None]:
# Insight 2: Batting First Advantage
# Restrict to matches where the toss decision was 'bat', then take the
# percentage share of 'Yes' / 'No' outcomes for the toss winner.
# NOTE(review): assumes the column stores the literal value 'bat' - confirm
# against the data.
bat_first_wins = df[df['toss_decision'] == 'bat']['toss_win_match_win'].value_counts(normalize=True) * 100
# 'Yes' is missing from the index when no bat-first toss winner won (or when
# nobody chose to bat); .get() then yields 0% instead of raising KeyError.
bat_first_yes_pct = bat_first_wins.get('Yes', 0.0)
print(f"2. Batting First Advantage: When choosing to bat first, teams won {bat_first_yes_pct:.1f}% of matches")


In [None]:
# Insight 3: High Scoring Venues
# Average the two per-venue innings means (venue_scores comes from the
# bivariate-analysis section) into one figure per venue, then take the largest.
venue_avg = venue_scores.mean(axis=1)
highest_avg = venue_avg.idxmax()
print(f"3. Highest Scoring Venue: {highest_avg} with average score of {venue_avg.max():.1f}")

In [None]:
# Insight 4: Player Performance
# Award tally per player; value_counts sorts it by count, highest first, so
# position 0 is the player with the most awards.
award_counts = df['player_of_the_match'].value_counts()
top_player = award_counts.index[0]
print(f"4. Most Consistent Performer: {top_player} won {award_counts.iloc[0]} Player of Match awards")

In [None]:
# Insight 5: Win Margins
# Mean margin computed separately for each result type.
result_type = df['won_by']
wicket_wins = df.loc[result_type == 'wickets', 'margin'].mean()
run_wins = df.loc[result_type == 'runs', 'margin'].mean()
print(f"5. Average Win Margins: {run_wins:.1f} runs when batting first, {wicket_wins:.1f} wickets when chasing")

In [None]:
# Insight 6: Tournament Progression
# Win tally per team (sorted by count, highest first); keep the leading three.
team_wins = df['winning_team'].value_counts()
final_standings = team_wins.iloc[:3]
print("\n6. Top 3 Teams by Wins:")
display(final_standings)