# 📊 Flight Delay Data Exploration

This notebook provides comprehensive exploratory data analysis of flight delay data.

**Objectives:**
- Understand dataset structure and quality
- Analyze delay distributions and patterns
- Identify temporal trends
- Explore carrier and airport performance


## 1. Setup and Data Loading


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import sys
import os

# Notebook-wide configuration: import path, warning filter, pandas display
# options, matplotlib defaults and the shared colour palette.

# Add parent directory to path for imports (the notebook later imports from
# the `src` package, which is resolved relative to this directory).
sys.path.insert(0, os.path.abspath('..'))

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas/matplotlib — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

# Custom color palette (hex codes keyed by semantic role; reused by every plot)
COLORS = {
    'primary': '#2E86AB',
    'secondary': '#A23B72',
    'success': '#18A558',
    'warning': '#F18F01',
    'danger': '#C73E1D'
}

# The check mark was previously mis-encoded and printed as garbage.
print("✓ Libraries imported successfully")


In [None]:
# Load the flight dataset into `df`, generating a synthetic one (plus airport
# and carrier metadata files) when no CSV exists at `data_path`.
from src.sample_data import generate_sample_dataset, generate_airport_metadata, generate_carrier_metadata

# Check if data exists, otherwise generate it
data_path = '../data/raw/flights.csv'

if not os.path.exists(data_path):
    print("Generating sample flight data...")

    # Create the output directories up front: `to_csv` does not create
    # missing parent directories and would raise on a fresh checkout.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    os.makedirs('../data/external', exist_ok=True)

    df = generate_sample_dataset(
        n_flights=50000,
        start_date='2023-01-01',
        end_date='2023-12-31',
        save_path=data_path
    )

    # Save metadata
    airports_df = generate_airport_metadata()
    airports_df.to_csv('../data/external/airports.csv', index=False)

    carriers_df = generate_carrier_metadata()
    carriers_df.to_csv('../data/external/carriers.csv', index=False)
else:
    print(f"Loading data from {data_path}...")
    # Parse the four timestamp columns so the later `.dt` accessors work.
    df = pd.read_csv(data_path, parse_dates=['scheduled_departure', 'scheduled_arrival',
                                              'actual_departure', 'actual_arrival'])

print(f"\n✓ Loaded {len(df):,} flight records")


## 2. Dataset Overview


In [None]:
# Basic info: row count, scheduled-departure date range, column count and
# in-memory size of `df`, followed by a preview of the first rows.
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal flights: {len(df):,}")
print(f"Date range: {df['scheduled_departure'].min()} to {df['scheduled_departure'].max()}")
print(f"\nColumns: {len(df.columns)}")
# deep=True also counts the contents of object columns; bytes are converted to MB
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display first few rows (the emoji was previously mis-encoded)
print("\n📋 Sample Records:")
df.head(10)


In [None]:
# Column info and descriptive statistics
# (header emoji were previously mis-encoded and printed as garbage)
print("📊 Column Information:")
print(df.dtypes.to_string())
print("\n📈 Descriptive Statistics:")
# Left as the cell's last expression so the notebook renders the table.
df.describe()


## 3. Missing Values Analysis


In [None]:
# Missing values analysis: per-column null count and percentage, shown as a
# table and as a horizontal bar chart.
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing Count', ascending=False)

# Columns that actually contain nulls; computed once and reused for both the
# table and the chart.
missing_cols = missing_df[missing_df['Missing Count'] > 0]

print("🔍 Missing Values:")
display(missing_cols)

# Visualize missing values
fig, ax = plt.subplots(figsize=(10, 4))
if not missing_cols.empty:
    # Columns with more than 5% missing are highlighted in the danger colour.
    colors = [COLORS['danger'] if pct > 5 else COLORS['warning'] for pct in missing_cols['Missing %']]
    ax.barh(missing_cols.index, missing_cols['Missing %'], color=colors)
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Missing Values by Column')
else:
    # Nothing to plot: show a centred confirmation message instead of empty axes.
    ax.text(0.5, 0.5, '✓ No Missing Values!', ha='center', va='center', fontsize=20, color=COLORS['success'])
    ax.axis('off')
plt.tight_layout()
plt.show()


## 4. Delay Distribution Analysis


In [None]:
# Filter out cancelled flights for delay analysis.
# `.copy()` gives an independent frame so later cells can add columns to it
# without touching `df`.
df_active = df[df['is_cancelled'] == 0].copy()

print(f"Active flights: {len(df_active):,}")
print(f"Cancelled flights: {df['is_cancelled'].sum():,} ({df['is_cancelled'].mean()*100:.2f}%)")

# Delay statistics (column looked up once instead of on every line)
arrival_delay = df_active['arrival_delay']

print("\n📊 Arrival Delay Statistics:")
print(f"Mean: {arrival_delay.mean():.2f} minutes")
print(f"Median: {arrival_delay.median():.2f} minutes")
print(f"Std Dev: {arrival_delay.std():.2f} minutes")
print(f"Min: {arrival_delay.min():.2f} minutes")
print(f"Max: {arrival_delay.max():.2f} minutes")
print(f"\n25th percentile: {arrival_delay.quantile(0.25):.2f} minutes")
print(f"75th percentile: {arrival_delay.quantile(0.75):.2f} minutes")
print(f"90th percentile: {arrival_delay.quantile(0.90):.2f} minutes")


In [None]:
# Delay distribution plots: histogram of arrival delays, departure-vs-arrival
# box plot, and a pie chart of delay categories. Also adds the `delay_cat`
# column to `df_active` and saves the figure under ../reports/figures.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

arrival_delay = df_active['arrival_delay']
mean_delay = arrival_delay.mean()

# Histogram
ax1 = axes[0]
# Clip extreme values so the long tails collapse into the outermost bins
# instead of stretching the x-axis.
data = arrival_delay.clip(-60, 180)
ax1.hist(data, bins=60, color=COLORS['primary'], edgecolor='white', alpha=0.7)
ax1.axvline(x=0, color='black', linestyle='--', linewidth=1.5, label='On-Time')
ax1.axvline(x=15, color=COLORS['danger'], linestyle='--', linewidth=1.5, label='Delay Threshold')
# The mean is taken from the unclipped data.
ax1.axvline(x=mean_delay, color=COLORS['warning'], linestyle='-', linewidth=2, label=f'Mean ({mean_delay:.1f})')
ax1.set_xlabel('Delay (minutes)')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Arrival Delays')
ax1.legend()

# Box plot
ax2 = axes[1]
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`
# for boxplot — confirm against the pinned Matplotlib version before changing.
bp = ax2.boxplot([df_active['departure_delay'].dropna(), arrival_delay.dropna()],
                  labels=['Departure', 'Arrival'], patch_artist=True)
colors_box = [COLORS['primary'], COLORS['secondary']]
for patch, color in zip(bp['boxes'], colors_box):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax2.set_ylabel('Delay (minutes)')
ax2.set_title('Departure vs Arrival Delay')
# Fixed y-range; points outside it are not visible.
ax2.set_ylim(-50, 150)

# Delay categories pie chart
ax3 = axes[2]
delay_labels = ['Early', 'On-Time', 'Minor', 'Moderate', 'Severe']
# right=False makes the bins left-closed: [0, 15) is 'On-Time' and a delay of
# exactly 15 minutes falls in 'Minor', matching the `arrival_delay >= 15`
# rule used for `is_delayed` elsewhere in this notebook.
df_active['delay_cat'] = pd.cut(arrival_delay,
                                bins=[-float('inf'), 0, 15, 30, 60, float('inf')],
                                labels=delay_labels,
                                right=False)
# value_counts() orders by frequency; reindex back to category order so each
# wedge is paired with the colour intended for its category.
delay_counts = df_active['delay_cat'].value_counts().reindex(delay_labels)
colors_cat = [COLORS['success'], COLORS['success'], COLORS['warning'], COLORS['danger'], '#8B0000']
ax3.pie(delay_counts, labels=delay_labels, autopct='%1.1f%%', colors=colors_cat, startangle=90)
ax3.set_title('Delay Categories')

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/delay_distribution.png', dpi=150, bbox_inches='tight')
plt.show()


## 5. Temporal Pattern Analysis


In [None]:
# Extract temporal features from the scheduled departure timestamp and plot
# delay patterns by hour, weekday, month and hour-by-weekday.
df_active['departure_hour'] = df_active['scheduled_departure'].dt.hour
df_active['departure_month'] = df_active['scheduled_departure'].dt.month
df_active['day_of_week'] = df_active['scheduled_departure'].dt.dayofweek
df_active['day_name'] = df_active['scheduled_departure'].dt.day_name()
# A flight counts as delayed when it arrives 15 or more minutes late.
# NOTE: rows with a missing arrival_delay compare False and become 0.
df_active['is_delayed'] = (df_active['arrival_delay'] >= 15).astype(int)

# Temporal patterns visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# By hour
ax1 = axes[0, 0]
hourly_stats = df_active.groupby('departure_hour')['arrival_delay'].agg(['mean', 'std']).reset_index()
ax1.plot(hourly_stats['departure_hour'], hourly_stats['mean'], marker='o', linewidth=2,
         color=COLORS['primary'], markersize=8)
# Shaded band spans half a standard deviation either side of the mean.
ax1.fill_between(hourly_stats['departure_hour'],
                 hourly_stats['mean'] - hourly_stats['std']/2,
                 hourly_stats['mean'] + hourly_stats['std']/2,
                 alpha=0.2, color=COLORS['primary'])
ax1.axhline(y=0, color='black', linestyle='--', alpha=0.3)
ax1.axhline(y=15, color=COLORS['danger'], linestyle='--', alpha=0.5, label='Delay Threshold')
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Average Delay (minutes)')
ax1.set_title('Average Delay by Hour of Day')
ax1.set_xticks(range(0, 24))
ax1.legend()

# By day of week
ax2 = axes[0, 1]
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_abbrev = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# groupby sorts day names alphabetically; reindex restores calendar order.
daily_delay = df_active.groupby('day_name')['arrival_delay'].mean().reindex(day_order)
# Days above the across-day average are drawn in the danger colour.
colors_day = [COLORS['danger'] if x > daily_delay.mean() else COLORS['success'] for x in daily_delay.values]
ax2.bar(range(7), daily_delay.values, color=colors_day, edgecolor='white')
ax2.set_xticks(range(7))
ax2.set_xticklabels(day_abbrev)
ax2.axhline(y=daily_delay.mean(), color='black', linestyle='--', alpha=0.5)
ax2.set_xlabel('Day of Week')
ax2.set_ylabel('Average Delay (minutes)')
ax2.set_title('Average Delay by Day of Week')

# By month
ax3 = axes[1, 0]
# Reindex to all 12 months so the bar call below always receives 12 heights;
# without it, data covering fewer months raises a shape mismatch. Months with
# no flights become NaN and draw no bar.
monthly_delay = df_active.groupby('departure_month')['arrival_delay'].mean().reindex(range(1, 13))
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
colors_month = plt.cm.coolwarm(np.linspace(0, 1, 12))
ax3.bar(range(1, 13), monthly_delay.values, color=colors_month, edgecolor='white')
ax3.set_xticks(range(1, 13))
ax3.set_xticklabels(month_names)
ax3.set_xlabel('Month')
ax3.set_ylabel('Average Delay (minutes)')
ax3.set_title('Average Delay by Month (Seasonal Pattern)')

# Delay rate heatmap
ax4 = axes[1, 1]
heatmap_data = df_active.pivot_table(values='is_delayed', index='day_of_week',
                                      columns='departure_hour', aggfunc='mean') * 100
# Force one row per weekday (0=Mon .. 6=Sun) so the seven tick labels below
# always match the rows, even if some weekday has no flights.
heatmap_data = heatmap_data.reindex(range(7))
sns.heatmap(heatmap_data, cmap='YlOrRd', ax=ax4, cbar_kws={'label': 'Delay Rate (%)'})
ax4.set_yticklabels(day_abbrev)
ax4.set_xlabel('Hour of Day')
ax4.set_ylabel('Day of Week')
# The multiplication sign was previously mis-encoded.
ax4.set_title('Delay Rate Heatmap (Hour × Day)')

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/temporal_patterns.png', dpi=150, bbox_inches='tight')
plt.show()


## 6. Carrier and Airport Analysis


In [None]:
# Carrier performance analysis: per-carrier delay statistics as a table and
# as two horizontal bar charts (mean delay and delay rate).
carrier_stats = df_active.groupby('carrier').agg({
    'arrival_delay': ['mean', 'median', 'std'],
    'is_delayed': 'mean',
    'flight_id': 'count'
}).reset_index()
# Flatten the MultiIndex columns produced by the multi-function aggregation.
carrier_stats.columns = ['Carrier', 'Mean Delay', 'Median Delay', 'Std Delay', 'Delay Rate', 'Flight Count']
# Convert the delayed fraction to a percentage.
carrier_stats['Delay Rate'] = (carrier_stats['Delay Rate'] * 100).round(2)
# Worst carrier first; the summary cell reads row 0 of this frame.
carrier_stats = carrier_stats.sort_values('Mean Delay', ascending=False)

# The emoji was previously mis-encoded and printed as garbage.
print("✈️ Carrier Performance Summary:")
display(carrier_stats)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Mean delay by carrier
ax1 = axes[0]
carrier_sorted = carrier_stats.sort_values('Mean Delay')
# Danger above the 15-minute threshold, warning for any positive mean delay.
colors_carrier = [COLORS['danger'] if x > 15 else COLORS['warning'] if x > 0 else COLORS['success']
                  for x in carrier_sorted['Mean Delay']]
ax1.barh(carrier_sorted['Carrier'], carrier_sorted['Mean Delay'], color=colors_carrier, edgecolor='white')
ax1.axvline(x=15, color=COLORS['danger'], linestyle='--', linewidth=1.5, alpha=0.7, label='15 min threshold')
ax1.set_xlabel('Average Delay (minutes)')
ax1.set_title('Average Delay by Carrier')
ax1.legend()

# Delay rate by carrier
ax2 = axes[1]
carrier_by_rate = carrier_stats.sort_values('Delay Rate')
# Colour bands: above 30% danger, above 20% warning, otherwise success.
colors_rate = [COLORS['danger'] if x > 30 else COLORS['warning'] if x > 20 else COLORS['success']
               for x in carrier_by_rate['Delay Rate']]
ax2.barh(carrier_by_rate['Carrier'], carrier_by_rate['Delay Rate'], color=colors_rate, edgecolor='white')
ax2.set_xlabel('Delay Rate (%)')
ax2.set_title('Delay Rate by Carrier')

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/carrier_analysis.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Airport analysis: per-origin delay statistics, a ranking of the airports
# with the highest average departure delay, and a volume-vs-delay scatter.
origin_stats = df_active.groupby('origin').agg({
    'departure_delay': 'mean',
    'arrival_delay': 'mean',
    'is_delayed': 'mean',
    'flight_id': 'count'
}).reset_index()
origin_stats.columns = ['Airport', 'Avg Dep Delay', 'Avg Arr Delay', 'Delay Rate', 'Departures']
# Convert the delayed fraction to a percentage.
origin_stats['Delay Rate'] = (origin_stats['Delay Rate'] * 100).round(2)

# The emoji was previously mis-encoded and printed as garbage.
print("🏢 Top 10 Airports with Highest Average Delay:")
display(origin_stats.nlargest(10, 'Avg Dep Delay'))

# Airport visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Top 15 airports by delay
ax1 = axes[0]
# Sort ascending after selecting the top 15 so the largest bar is drawn at the top.
top_delay = origin_stats.nlargest(15, 'Avg Dep Delay').sort_values('Avg Dep Delay')
# Colour bands: above 15 min danger, above 5 min warning, otherwise success.
colors_apt = [COLORS['danger'] if x > 15 else COLORS['warning'] if x > 5 else COLORS['success']
              for x in top_delay['Avg Dep Delay']]
ax1.barh(top_delay['Airport'], top_delay['Avg Dep Delay'], color=colors_apt, edgecolor='white')
ax1.axvline(x=15, color=COLORS['danger'], linestyle='--', alpha=0.7)
ax1.set_xlabel('Average Departure Delay (minutes)')
ax1.set_title('Top 15 Airports by Average Delay')

# Volume vs Delay scatter
ax2 = axes[1]
# Point colour encodes the delay rate; the reversed colormap puts red at the high end.
scatter = ax2.scatter(origin_stats['Departures'], origin_stats['Avg Dep Delay'],
                      c=origin_stats['Delay Rate'], cmap='RdYlGn_r',
                      s=100, alpha=0.7, edgecolors='white')
plt.colorbar(scatter, ax=ax2, label='Delay Rate (%)')
ax2.set_xlabel('Number of Departures')
ax2.set_ylabel('Average Delay (minutes)')
ax2.set_title('Airport Volume vs Delay')

# Label major airports (the five with the most departures)
for _, row in origin_stats.nlargest(5, 'Departures').iterrows():
    ax2.annotate(row['Airport'], (row['Departures'], row['Avg Dep Delay']), fontsize=9)

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/airport_analysis.png', dpi=150, bbox_inches='tight')
plt.show()


## 7. Summary and Next Steps


In [None]:
# Summary: print headline figures from the analysis and persist the enriched
# `df_active` frame for the next notebook.
# Relies on `hourly_stats` and `carrier_stats` built by earlier cells.
# Emoji and bullet characters were previously mis-encoded; they are restored here.
print("=" * 70)
print("📋 EXPLORATORY DATA ANALYSIS SUMMARY")
print("=" * 70)

print("\n📊 Dataset Overview:")
print(f"   • Total flights analyzed: {len(df_active):,}")
print(f"   • Unique carriers: {df_active['carrier'].nunique()}")
print(f"   • Unique airports: {df_active['origin'].nunique()}")

print("\n⏱️ Delay Statistics:")
print(f"   • Overall delay rate: {df_active['is_delayed'].mean()*100:.1f}%")
print(f"   • Average arrival delay: {df_active['arrival_delay'].mean():.1f} minutes")
print(f"   • Median arrival delay: {df_active['arrival_delay'].median():.1f} minutes")

print("\n📈 Key Patterns Identified:")
# Row of hourly_stats with the highest mean arrival delay.
worst_hour = hourly_stats.loc[hourly_stats['mean'].idxmax()]
print(f"   • Worst time for delays: {int(worst_hour['departure_hour'])}:00")
# carrier_stats is sorted by mean delay descending, so row 0 is the worst carrier.
worst_carrier = carrier_stats.iloc[0]['Carrier']
print(f"   • Highest delay carrier: {worst_carrier}")

print("\n" + "=" * 70)

# Save for next notebook (to_csv does not create missing directories)
os.makedirs('../data/processed', exist_ok=True)
df_active.to_csv('../data/processed/flights_explored.csv', index=False)
print("\n✓ Data saved for next notebook: data/processed/flights_explored.csv")
