# Flight Data Analysis Notebook

This notebook provides examples of analyzing flight data from the Airport Flight Data Collector system.

## Setup

First, let's import the necessary libraries and configure an authenticated session for the API.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from datetime import datetime, timedelta
import json
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so deprecation notices from the libraries above will not be shown.
warnings.filterwarnings('ignore')

# Set up plotting: matplotlib style sheet, seaborn colour palette, and the
# IPython magic that renders figures inline in the notebook output.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

In [None]:
# API Configuration
# Environment variables take precedence so real credentials never have to be
# written into (and saved with) the notebook; the literals are only fallbacks.
import os

API_BASE_URL = os.environ.get('FLIGHT_API_BASE_URL', 'http://localhost:3001/api/v2')
API_TOKEN = os.environ.get('FLIGHT_API_TOKEN', 'your-api-token-here')  # Replace with your actual token

# Create session with authentication; every request made through `session`
# carries these headers.
session = requests.Session()
session.headers.update({
    'Authorization': f'Bearer {API_TOKEN}',
    'Content-Type': 'application/json'
})

# NOTE: no request has been sent at this point; this only reports the
# configured base URL.
print(f"Connected to API at {API_BASE_URL}")

## 1. Fetch Flight Data

Let's fetch recent flight data from our monitored airports.

In [None]:
def fetch_flight_data(airport='SFO', days=30, limit=10000, timeout=30):
    """Fetch flight records for one airport from the ``/flights`` endpoint.

    Args:
        airport: Airport code sent as the ``airport`` query parameter.
        days: Size of the look-back window in days, ending at the current time.
        limit: Maximum number of records requested from the API.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A DataFrame built from the ``data`` field of the JSON response, or
        ``None`` when the request fails or the server answers with a
        non-200 status.
    """
    # NOTE(review): naive local timestamps are sent; confirm whether the API
    # expects UTC.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days)

    params = {
        'airport': airport,
        'startDate': start_date.isoformat(),
        'endDate': end_date.isoformat(),
        'limit': limit,
    }

    try:
        response = session.get(f'{API_BASE_URL}/flights', params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: message + None.
        print(f"Error fetching data: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return None

    payload = response.json()
    return pd.DataFrame(payload['data'])

# Fetch data for all airports
airports = ['SFO', 'YYZ', 'YVR']
flights_data = {}

for airport in airports:
    print(f"Fetching data for {airport}...")
    airport_flights = fetch_flight_data(airport)
    if airport_flights is None:
        # fetch_flight_data returns None on failure; skip the airport instead
        # of crashing on len(None).
        print(f"  No data fetched for {airport}; skipping")
        continue
    flights_data[airport] = airport_flights
    print(f"  Fetched {len(airport_flights)} flights")

# Combine all data; pd.concat raises on an empty collection, so fall back to
# an empty frame when every fetch failed.
if flights_data:
    all_flights = pd.concat(flights_data.values(), ignore_index=True)
else:
    all_flights = pd.DataFrame()
print(f"\nTotal flights: {len(all_flights)}")

## 2. Data Exploration

Let's explore the structure and basic statistics of our flight data.

In [None]:
# Overview of the combined dataset: shape, per-column dtypes, then a preview
dataset_shape = all_flights.shape
print("Dataset Shape:", dataset_shape)
print("\nColumn Types:", all_flights.dtypes, sep="\n")
print("\nFirst few rows:")
all_flights.head(n=5)

In [None]:
# Convert date columns to datetime; values that cannot be parsed become NaT
date_columns = ['scheduledTime', 'actualTime', 'collectedAt']
for col in date_columns:
    if col in all_flights.columns:
        all_flights[col] = pd.to_datetime(all_flights[col], errors='coerce')

# Calculate delay in minutes (negative values mean actualTime precedes scheduledTime)
if 'scheduledTime' in all_flights.columns and 'actualTime' in all_flights.columns:
    all_flights['delayMinutes'] = (
        (all_flights['actualTime'] - all_flights['scheduledTime']).dt.total_seconds() / 60
    )
    # A flight whose delay is unknown (NaN) must not be counted as "on time":
    # keep it as NaN so that .mean() based delay rates only consider flights
    # with a known delay. Stored as 1.0 / 0.0 / NaN.
    delay_known = all_flights['delayMinutes'].notna()
    all_flights['isDelayed'] = (
        (all_flights['delayMinutes'] > 15).astype(float).where(delay_known)
    )

# Basic statistics (guarded like the date columns, in case a field is absent)
if 'status' in all_flights.columns:
    print("\nFlight Status Distribution:")
    print(all_flights['status'].value_counts())
if 'airline' in all_flights.columns:
    print("\nAirline Distribution (Top 10):")
    print(all_flights['airline'].value_counts().head(10))

## 3. Flight Volume Analysis

Analyze flight volumes across different dimensions.

In [None]:
# Flights per calendar day, one line per airport
if 'scheduledTime' in all_flights.columns:
    all_flights['date'] = all_flights['scheduledTime'].dt.date
    daily_volume = (
        all_flights.groupby(['date', 'airport'])
        .size()
        .reset_index(name='flights')
    )

    # One line per configured airport, drawn on a shared axis
    fig, ax = plt.subplots(figsize=(15, 6))
    for airport in airports:
        is_current_airport = daily_volume['airport'] == airport
        per_airport = daily_volume[is_current_airport]
        ax.plot(per_airport['date'], per_airport['flights'], label=airport, linewidth=2)

    ax.set_title('Daily Flight Volume by Airport', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Number of Flights', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Flights per scheduled hour of day, shown as an hour x airport heatmap
if 'scheduledTime' in all_flights.columns:
    all_flights['hour'] = all_flights['scheduledTime'].dt.hour
    hourly_dist = (
        all_flights.groupby(['hour', 'airport'])
        .size()
        .reset_index(name='flights')
    )

    # Rows are hours, columns are airports, cells are flight counts
    hourly_pivot = hourly_dist.pivot(index='hour', columns='airport', values='flights')

    fig, ax = plt.subplots(figsize=(12, 8))
    heatmap_options = {
        'cmap': 'YlOrRd',
        'annot': True,
        'fmt': 'g',
        'cbar_kws': {'label': 'Number of Flights'},
    }
    # Drawn on the current axes, i.e. the `ax` created just above
    sns.heatmap(hourly_pivot, **heatmap_options)
    ax.set_title('Flight Distribution by Hour and Airport', fontsize=14, fontweight='bold')
    ax.set_xlabel('Airport', fontsize=12)
    ax.set_ylabel('Hour of Day', fontsize=12)
    plt.tight_layout()
    plt.show()

## 4. Delay Analysis

Analyze flight delays and their patterns.

In [None]:
# Delay statistics by airport
if 'delayMinutes' in all_flights.columns:
    delay_stats = all_flights.groupby('airport').agg({
        'delayMinutes': ['mean', 'median', 'std'],
        'isDelayed': 'mean'
    })

    delay_stats.columns = ['Mean Delay (min)', 'Median Delay (min)', 'Std Dev', 'Delay Rate']
    # Convert the rate to percent BEFORE rounding; rounding the fraction to two
    # decimals first would limit the percentage to whole numbers.
    delay_stats['Delay Rate'] = delay_stats['Delay Rate'] * 100
    delay_stats = delay_stats.round({
        'Mean Delay (min)': 2,
        'Median Delay (min)': 2,
        'Std Dev': 2,
        'Delay Rate': 1,
    })

    print("Delay Statistics by Airport:")
    print(delay_stats)

    # Plot delay distribution: one panel per configured airport.
    # squeeze=False keeps `axes` two-dimensional for any number of airports.
    fig, axes = plt.subplots(
        1, len(airports), figsize=(5 * len(airports), 5), squeeze=False
    )

    for idx, airport in enumerate(airports):
        panel = axes[0, idx]
        airport_delays = all_flights[all_flights['airport'] == airport]['delayMinutes'].dropna()
        # Delays above two hours are left out of the histogram so the bulk of
        # the distribution stays readable.
        panel.hist(airport_delays[airport_delays <= 120], bins=30, edgecolor='black', alpha=0.7)
        panel.axvline(15, color='red', linestyle='--', label='15 min threshold')
        panel.set_xlabel('Delay (minutes)')
        panel.set_ylabel('Frequency')
        panel.set_title(f'{airport} Delay Distribution')
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.suptitle('Flight Delay Distributions by Airport', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 5. Airline Performance Analysis

Compare performance metrics across different airlines.

In [None]:
# Airline performance metrics
def _plot_ranked_bars(ax, frame, column, green_below, orange_below, xlabel, title):
    """Draw a horizontal bar chart of ``column`` sorted ascending, coloured by threshold."""
    ranked = frame.sort_values(column)
    bar_colors = [
        'green' if value < green_below else 'orange' if value < orange_below else 'red'
        for value in ranked[column]
    ]
    positions = range(len(ranked))
    ax.barh(positions, ranked[column], color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(ranked.index)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='x')


airline_performance = all_flights.groupby('airline').agg({
    'flightNumber': 'count',
    'isDelayed': 'mean',
    'delayMinutes': 'mean'
})

airline_performance.columns = ['Total Flights', 'Delay Rate', 'Avg Delay (min)']
# Convert the rate to percent BEFORE rounding; rounding the fraction to two
# decimals first would limit the percentage to whole numbers.
airline_performance['Delay Rate'] = airline_performance['Delay Rate'] * 100
airline_performance = airline_performance.round({'Delay Rate': 1, 'Avg Delay (min)': 2})

# Filter top airlines (by number of flights)
top_airlines = airline_performance.nlargest(15, 'Total Flights')

# Create performance chart: delay rate on the left, average delay on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

_plot_ranked_bars(
    axes[0], top_airlines, 'Delay Rate', 20, 30,
    'Delay Rate (%)', 'Airline Delay Rates (Top 15 Airlines by Volume)',
)
_plot_ranked_bars(
    axes[1], top_airlines, 'Avg Delay (min)', 15, 30,
    'Average Delay (minutes)', 'Average Delay by Airline (Top 15 Airlines by Volume)',
)

plt.suptitle('Airline Performance Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# NOTE(review): top_airlines is ordered by flight volume, so this prints the
# ten busiest airlines; confirm whether a ranking by delay was intended.
print("\nTop 10 Airlines by Performance:")
print(top_airlines.head(10))

## 6. Advanced Analytics

Use the advanced analytics API endpoints for deeper insights.

In [None]:
# Fetch comprehensive analytics
def get_analytics(endpoint, params=None, timeout=30):
    """Fetch the ``data`` payload of an ``/analytics/<endpoint>`` API call.

    Args:
        endpoint: Path segment appended after ``/analytics/``.
        params: Optional dict of query parameters; ``None`` means no
            parameters (a fresh dict is used instead of a shared default).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``data`` field of the JSON response, or ``None`` when the request
        fails or the server answers with a non-200 status.
    """
    query = {} if params is None else params
    try:
        response = session.get(
            f'{API_BASE_URL}/analytics/{endpoint}', params=query, timeout=timeout
        )
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: message + None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    return response.json()['data']

# Request 30-day volume trends for every configured airport
trend_query = {
    'airports': ','.join(airports),
    'startDate': (datetime.now() - timedelta(days=30)).isoformat(),
    'endDate': datetime.now().isoformat(),
}
trends = get_analytics('trends', trend_query)

if trends:
    print("Flight Volume Trends:")
    for airport in airports:
        volume_trends = trends['volume']
        if airport not in volume_trends:
            # No trend entry for this airport in the response
            continue
        trend = volume_trends[airport]
        print(f"  {airport}: {trend['direction']} (slope: {trend['slope']:.2f})")

In [None]:
# Get anomalies
anomalies = get_analytics('anomalies', {
    'airports': ','.join(airports),
    'severity': 'high'
})

# get_analytics returns None on failure. An empty result is a valid answer
# ("nothing found") and must still reach the report below, so test for None
# explicitly instead of relying on truthiness.
if anomalies is not None:
    anomaly_df = pd.DataFrame(anomalies)
    if anomaly_df.empty:
        print("No high-severity anomalies detected")
    else:
        print(f"Found {len(anomaly_df)} high-severity anomalies:")
        # Show only the report columns that are actually present, so a missing
        # field does not raise KeyError.
        report_columns = ['airport', 'date', 'type', 'value', 'expected', 'zScore']
        available_columns = [col for col in report_columns if col in anomaly_df.columns]
        print(anomaly_df[available_columns].head(10))

In [None]:
# Get seasonality patterns
seasonality = get_analytics('seasonality', {
    'airports': ','.join(airports)
})

if seasonality:
    # Plot day of week patterns: one panel per configured airport.
    # squeeze=False keeps `axes` two-dimensional for any number of airports.
    fig, axes = plt.subplots(
        1, len(airports), figsize=(5 * len(airports), 5), squeeze=False
    )

    for idx, airport in enumerate(airports):
        # Airports absent from the response keep an empty panel
        if airport in seasonality:
            pattern = seasonality[airport]['dayOfWeek']['pattern']
            # presumably a list of records with 'day' and 'count' fields;
            # verify against the API response
            df = pd.DataFrame(pattern)

            panel = axes[0, idx]
            panel.bar(df['day'], df['count'], color='skyblue', edgecolor='black')
            panel.set_xlabel('Day of Week')
            panel.set_ylabel('Flight Count')
            panel.set_title(f'{airport} Weekly Pattern')
            panel.tick_params(axis='x', rotation=45)

    plt.suptitle('Flight Volume by Day of Week', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 7. Forecasting

Fetch forecasts of future flight volumes and delays from the analytics API and plot them.

In [None]:
# Get forecasts
days_ahead = 7  # forecast horizon; also used in the figure title
forecasts = get_analytics('forecasts', {
    'airports': ','.join(airports),
    'daysAhead': days_ahead
})

if forecasts:
    # Create forecast visualization: volume on the top row, delays on the
    # bottom row, one column per configured airport.
    # squeeze=False keeps `axes` two-dimensional for any number of airports.
    fig, axes = plt.subplots(
        2, len(airports), figsize=(5 * len(airports), 10), squeeze=False
    )

    for idx, airport in enumerate(airports):
        # Airports absent from the response keep empty panels
        if airport in forecasts:
            forecast = forecasts[airport]
            volume_values = forecast['volume']['values']
            delay_values = forecast['delays']['values']

            # Volume forecast
            # NOTE: the shaded band is a fixed +/-10% of each value, drawn for
            # visual context only; it is not a statistical interval.
            ax1 = axes[0, idx]
            volume_days = range(1, len(volume_values) + 1)
            ax1.plot(volume_days, volume_values, 'b-', marker='o', linewidth=2)
            ax1.fill_between(volume_days,
                             [v * 0.9 for v in volume_values],
                             [v * 1.1 for v in volume_values],
                             alpha=0.3)
            ax1.set_xlabel('Days Ahead')
            ax1.set_ylabel('Predicted Flights')
            ax1.set_title(f'{airport} Volume Forecast')
            ax1.grid(True, alpha=0.3)

            # Delay forecast (x-range derived from its own series, so a length
            # mismatch with the volume series cannot break the plot)
            ax2 = axes[1, idx]
            delay_days = range(1, len(delay_values) + 1)
            ax2.plot(delay_days, delay_values, 'r-', marker='s', linewidth=2)
            ax2.fill_between(delay_days,
                             [v * 0.9 for v in delay_values],
                             [v * 1.1 for v in delay_values],
                             alpha=0.3, color='red')
            ax2.set_xlabel('Days Ahead')
            ax2.set_ylabel('Predicted Avg Delay (min)')
            ax2.set_title(f'{airport} Delay Forecast')
            ax2.grid(True, alpha=0.3)

    plt.suptitle(f'{days_ahead}-Day Flight Forecasts', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 8. Export Results

Export analysis results for further use.

In [None]:
# Export to CSV
import os

output_dir = './analysis_results/'
os.makedirs(output_dir, exist_ok=True)


def _number_or_none(value):
    """Return ``value`` as a plain float, or None when it is missing/NaN."""
    return None if pd.isna(value) else float(value)


def _iso_or_none(timestamp):
    """Return the ISO-8601 string of ``timestamp``, or None when it is missing/NaT."""
    return None if pd.isna(timestamp) else timestamp.isoformat()


# Export flight data
flight_csv_path = os.path.join(output_dir, 'flight_data.csv')
all_flights.to_csv(flight_csv_path, index=False)
print(f"Flight data exported to {flight_csv_path}")

# Export airline performance
airline_csv_path = os.path.join(output_dir, 'airline_performance.csv')
top_airlines.to_csv(airline_csv_path)
print(f"Airline performance exported to {airline_csv_path}")

# Export to Parquet for efficient storage. pandas needs an optional engine
# for this; a missing engine should not prevent the summary from being written.
parquet_path = os.path.join(output_dir, 'flight_data.parquet')
try:
    all_flights.to_parquet(parquet_path, compression='snappy')
except ImportError as exc:
    print(f"Skipping Parquet export: {exc}")
else:
    print(f"Flight data exported to {parquet_path}")

# Create summary report. Missing values are written as null rather than
# NaN/NaT, which are not valid JSON.
has_schedule = 'scheduledTime' in all_flights.columns
if 'airline' in all_flights.columns:
    airline_counts = all_flights['airline'].value_counts()
    top_airline = airline_counts.index[0] if not airline_counts.empty else None
else:
    top_airline = None

summary = {
    'analysis_date': datetime.now().isoformat(),
    'total_flights': len(all_flights),
    'airports': airports,
    'date_range': {
        'start': _iso_or_none(all_flights['scheduledTime'].min()) if has_schedule else None,
        'end': _iso_or_none(all_flights['scheduledTime'].max()) if has_schedule else None
    },
    'overall_metrics': {
        'avg_delay': (
            _number_or_none(all_flights['delayMinutes'].mean())
            if 'delayMinutes' in all_flights.columns else None
        ),
        'delay_rate': (
            _number_or_none(all_flights['isDelayed'].mean() * 100)
            if 'isDelayed' in all_flights.columns else None
        ),
        'top_airline': top_airline
    }
}

summary_path = os.path.join(output_dir, 'analysis_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\nAnalysis summary exported to {summary_path}")
print("\nAnalysis complete!")