# Using Aviation Edge and OpenSky APIs for historical data analysis

## Phase 1: Data Collection

### Real-Time Flight Data

In [34]:
# Importing modules

import requests
import pandas as pd
import numpy as np
import nest_asyncio
import asyncio
from datetime import datetime, timedelta
import time
import json
import random

In [36]:
# Setting up OpenSky

from python_opensky import OpenSky
nest_asyncio.apply()

class AirCanadaDataCollector:
    def __init__(self):
        self.flight_data = []
        self.historical_data = pd.DataFrame()   #Dataframe for storing Air Canada Flights
    
    async def collect_realtime_flights(self):
        """Collects current Air Canada flight data"""
        async with OpenSky() as opensky:
            
            # Getting most recent flight data
            states = await opensky.get_states()
            
            ac_flights = []

            # Filtering to get Air Canada flight data only
            for state in states.states:
                if state.callsign and state.callsign.strip().startswith('ACA'):
                    flight_info = {
                        'timestamp': datetime.now(),
                        'callsign': state.callsign.strip(),
                        'icao24': state.icao24,
                        'longitude': state.longitude,
                        'latitude': state.latitude,
                        'altitude': state.barometric_altitude,
                        'velocity': state.velocity,
                        'heading': state.true_track,
                        'vertical_rate': state.vertical_rate,
                        'on_ground': state.on_ground
                    }

                    # Adds the current Air Canada flight details to the list
                    ac_flights.append(flight_info)
            
            return pd.DataFrame(ac_flights)
    
    async def collect_historical_flights(self, start_time, end_time):
        """Collects historical flight data for a specific time range"""
        async with OpenSky() as opensky:
            # OpenSky provides historical data access
            flights = await opensky.get_flights_by_aircraft(
                begin=int(start_time.timestamp()),
                end=int(end_time.timestamp())
            )
            return flights

# Initializes collector object
collector = AirCanadaDataCollector()

# Collectes current flights
current_flights = await collector.collect_realtime_flights()
print(f"Collected {len(current_flights)} current Air Canada flights")
print(current_flights)

Collected 40 current Air Canada flights
                    timestamp callsign  icao24  longitude  latitude  altitude  \
0  2025-07-01 04:19:47.948007     ACA7  c01049   113.7602   22.1908    883.92   
1  2025-07-01 04:19:47.948007   ACA894  c06a39     3.5506   48.4770  11887.20   
2  2025-07-01 04:19:47.948007   ACA900  c058c0   -10.9907   54.4862  12496.80   
3  2025-07-01 04:19:47.948007  ACA1074  c07b09   -73.7495   45.4592       NaN   
4  2025-07-01 04:19:47.948007   ACA878  c038c8    -1.0440   46.0665  10660.38   
5  2025-07-01 04:19:47.948007   ACA918  c038a7    -6.2498   53.4310       NaN   
6  2025-07-01 04:19:47.948007   ACA800  c023c5    -7.2842   53.7803   5158.74   
7  2025-07-01 04:19:47.948007   ACA794  c010ea  -110.6108   37.3617   9448.80   
8  2025-07-01 04:19:47.948007   ACA129  c010cb  -123.1735   49.1862    -15.24   
9  2025-07-01 04:19:47.948007   ACA128  c051e2  -121.7690   49.1506   6911.34   
10 2025-07-01 04:19:47.948007   ACA314  c05067  -118.0506   49.3892  

### Air Canada Route and Schedule Data

In [39]:
# Creating a class processing airport data and identifying high traffic routes

class AirCanadaRoute:
    def __init__(self):
        self.major_hubs = ["YUL", "YYZ", "YYC", "YVR"]  # Major Hubs: Montreal, Toronto & Vancouver
        self.route_data = []

    def get_airport_data(self):
        """Gets airport traffic data from reliable sources (yyc.com, admtl.com, internationalairportreview.com and torontopearson.com)"""
        
        airport_data = {
            "YUL" : {"name": "Montreal Trudeau", "annual_passengers": "22400000"},
            "YYZ" : {"name": "Toronto Pearson", "annual_passengers": "46800000"},
            "YVR" : {"name": "Vancouver International", "annual_passengers": "26200000"}
        }
        return airport_data

    def identify_high_traffic_routes(self, flight_data):
        """Identifies highest traffic routes for analysis"""
        route_frequency = {}
        
        for _, flight in flight_data.iterrows():

            # Extracts route information from callsign patterns
            callsign = flight["callsign"]
            
            # Air Canada uses specific callsign patterns for different routes
            if callsign.startswith('ACA'):
                flight_number = callsign[3:]

                # Key: flight_number, value: route_frequency
                route_frequency[flight_number] = route_frequency.get(flight_number, 0) + 1
        return dict(sorted(route_frequency.items(), key=lambda x:x[1], reverse=True))

In [41]:
#  Creating AirCanadaRoute objects

route_collector = AirCanadaRoute()
airport_data = route_collector.get_airport_data()
high_traffic_routes = route_collector.identify_high_traffic_routes(current_flights)

In [43]:
print(high_traffic_routes)

{'7': 1, '894': 1, '900': 1, '1074': 1, '878': 1, '918': 1, '800': 1, '794': 1, '129': 1, '128': 1, '314': 1, '780': 1, '746': 1, '7201': 1, '7252': 1, '902': 1, '816': 1, '347': 1, '154': 1, '842': 1, '540': 1, '126': 1, '91': 1, '892': 1, '313': 1, '364': 1, '': 1, '519': 1, '327': 1, '156': 1, '890': 1, '42': 1, '820': 1, '870': 1, '298': 1, '644': 1, '876': 1, '159': 1, '884': 1, '228': 1}


### External Data Collection (Weather, Holidays & Seasonal Factors)

In [84]:
class ExternalDataCollector:
    def __init__(self):
        # Processes weather data from 'api.weather.gc.ca' and public holidays for later use
        
        self.weather_api = "https://api.weather.gc.ca"
        self.geomet_api = "https://api.weather.gc.ca/collections"

        self.major_hubs = {
            'YUL': {'name': 'Montreal Trudeau', 'lat': 45.4706, 'lon': -73.7408, 'passengers': 22400000},
            'YYZ': {'name': 'Toronto Pearson', 'lat': 43.6777, 'lon': -79.6248, 'passengers': 46800000},
            'YVR': {'name': 'Vancouver International', 'lat': 49.1967, 'lon': -123.1815, 'passengers': 26200000}
        }
        
        self.holidays_2025 = [
            "2025-01-01",  # New Year's Day
            "2025-04-18",  # Good Friday
            "2025-05-19",  # Victoria Day
            "2025-07-01",  # Canada Day
            "2025-09-01",  # Labour Day
            "2025-10-13",  # Thanksgiving
            "2025-11-11",  # Remembrance Day
            "2025-12-25",  # Christmas Day
            "2025-12-26"   # Boxing Day
        ]
    
    def get_weather_data(self, airport_code, date):
        """Gets weather data for specific airports for specific dates"""

        if airport_code not in self.major_hubs:
            return None

        hub_info = self.major_hubs[airport_code]

        # Uses GeoMet API for current weather observation based on real-time weather data
        weather_url = f"{self.geomet_api}/weather-observations/items"
        params = {
            'bbox': f"{hub_info['lon']-0.1},{hub_info['lat']-0.1},{hub_info['lon']+0.1},{hub_info['lat']+0.1}",
            'datetime': date.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'limit': 1 
        }
        
        response = requests.get(weather_url, params=params, timeout=10)

        # Only proceeds if the weather data is obtained
        if response.status_code == 200:
            data = response.json()  #Correcting response format

            if data.get('features'):
                properties = data['features'][0]['properties']
                
                return {
                    'airport': airport_code,
                    'date': date,
                    'temperature': properties.get('air_temperature'),
                    'precipitation': properties.get('precipitation'),
                    'visibility': properties.get('visibility'),
                    'wind_speed': properties.get('wind_speed'),
                    'conditions': properties.get('weather_condition'),
                    'pressure': properties.get('pressure')
                }

    
    def is_holiday(self, date):
        """Checks if the specific date is a holiday or not"""
        
        date_str = date.strftime("%Y-%m-%d")
        return date_str in self.holidays_2025

    def get_seasonal_factors(self, date):
        """Gets seasonal adjustment factors"""
        
        month = date.month
        seasons = {
            "winter_months": [12, 1, 2],
            "spring_months": [3, 4, 5],
            "summer_months": [6, 7, 8],
            "fall_months": [9, 10, 11]
        }

        for season, months in seasons.items():
            if month in months:
                return season
            else:
                return "Unknown"

In [86]:
# Creating ExternalDataCollector object

external_collector = ExternalDataCollector()

### Passenger Booking Pattern Simulator

In [89]:
# Realistic Passenger Booking Simulation since I don't have access to real passenger data immediately

class PassengerBookingSimulator:
    def __init__(self):
        # Passenger booking features that affects no-show probability
        
        self.fare_class = ["Basic", "Standard", "Flex", "Premium", "Business"]
        self.advance_purchase_window = [1, 7, 14, 30, 60, 90]  # Time in days
        self.passenger_types = ["Business", "Leisure", "VFR"]

    def generate_booking_data(self, flight_info, num_passengers = 180):
        """Generates realistic passenger booking data for a flight based on available research"""
        
        passengers = []

        for i in range(0, num_passengers):
            # Generates passenger features that research shows affect no-show rates

            advance_purchase = random.choice(self.advance_purchase_window)
            fare_class = random.choice(self.fare_class)
            passenger_type = random.choice(self.passenger_types)

            # Factors that co-relate with no-show probability according to research (https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=d3572abdaf3d12bd8e85471b1b217c495f7662d9)
            base_no_show_prob = 0.12  # According to researches, the industry average is between 5%-15% with 12% being a commonly cited average

            if advance_purchase <= 7:
                base_no_show_prob *= 0.7  # Last-minute bookings are less likely to be a no-show. Based on 30% reduction for last minute booking is based on general industry logic

            if fare_class in ["Premium", "Business"]:
                base_no_show_prob *= 0.5  # Premium and Business bookings are less likely to be a no-show. More of an assumption so 50/50 chances

            if passenger_type == "Business":
                base_no_show_prob *= 0.6  # Passengers travelling for business are less likely to be a no-show. 40% reduction for business travelers is based on general industry logic

            passenger = {
                "passenger_id": f"PAX_{flight_info['callsign']}_{i:03d}",
                "flight_callsign": flight_info["callsign"],
                "booking_data": datetime.now() - timedelta(days= advance_purchase),
                "advance_purchase_days": advance_purchase,
                "fare_class": fare_class,
                "passenger_type": passenger_type,
                "predicted_no_show_prob": min(base_no_show_prob, 0.15),  # Caps at 15%
                "actual_no_show": random.random() > base_no_show_prob  # Returns boolean
            }
            passengers.append(passenger)

        return pd.DataFrame(passengers)

In [91]:
# Creating PassengerBookingSimulation object

passenger_simulation = PassengerBookingSimulator()

### Building a data collection pipeline

In [94]:
# AirCanadaPipeline class collects real-time flight details, analyzes the routes, collects external data for each flight and predicts the no show rate

In [96]:
class AirCanadaPipeline:

    def __init__(self):
        
        # Initializing all the objects
        self.flight_collector = AirCanadaDataCollector()
        self.route_collector = AirCanadaRoute()
        self.external_data_collector = ExternalDataCollector()
        self.passenger_simulator = PassengerBookingSimulator()

    async def run_complete_collection(self):
        """Runs the complete data collection"""
        print("Starting Air Canada Data Collection...")

        # Collects real-time Air Canada flight data
        print("Collecting real-time flight data...")
        current_flights = await self.flight_collector.collect_realtime_flights()
        
        # Analyze the routes for the flights
        print("Analyzing flight route patterns...")
        high_traffic_routes = self.route_collector.identify_high_traffic_routes(current_flights)
        
        # Collect external data for each flights
        print("Collecting external factors...")
        tuned_flights = []

        # Iterates through each flight
        for _, flight, in current_flights.iterrows():
            
            # Adds weather data
            weather = self.external_data_collector.get_weather_data("YYZ", datetime.now())

            # Adds holiday data
            is_holiday = self.external_data_collector.is_holiday(datetime.now())

            # Adds seasonal factor
            season = self.external_data_collector.get_seasonal_factors(datetime.now())

            # Runs passenger booking simulation for this flight
            passengers = self.passenger_simulator.generate_booking_data(flight)

            tuned_flight = {
                **flight.to_dict(),  # Converts the flight data to dictionary representation
                "weather_conditions": weather,
                "is_holiday": is_holiday,
                "season": season,
                "passenger_count": len(passengers),
                "predicted_no_shows": passengers["predicted_no_show_prob"].sum(),   # Predicted number of no-show passengers
                "historical_no_show_rate": passengers["predicted_no_show_prob"].mean()  # Average no-show rate for flight
            }

            tuned_flights.append(tuned_flight)  # Adds the simulation data to the tuned_flights list

        tuned_df = pd.DataFrame(tuned_flights)  # Creates DataFrame for the tuned_flights

        print(f"Complete! Collected data for {len(tuned_df)} flights.")
        print(f"Average predicted no-show rate: {tuned_df['historical_no_show_rate'].mean():.3f}")

        return tuned_df, high_traffic_routes

In [98]:
# Runs the Pipeline

pipeline = AirCanadaPipeline()
flight_data, route_analysis = await pipeline.run_complete_collection()
        
# Displays the output of pipeline

print("\n*** Air Canada Flight Analysis ***")
print(flight_data[['callsign', 'passenger_count', 'predicted_no_shows', 'historical_no_show_rate']].head())

print(f"\n*** Top 5 Air Canada Routes by Frequency: ***")
for route, frequency in list(route_analysis.items())[:5]:
    print(f"AC{route}: {frequency} flights observed.")
    

Starting Air Canada Data Collection...
Collecting real-time flight data...
Analyzing flight route patterns...
Collecting external factors...
Complete! Collected data for 36 flights.
Average predicted no-show rate: 0.075

*** Air Canada Flight Analysis ***
  callsign  passenger_count  predicted_no_shows  historical_no_show_rate
0     ACA7              180             13.3092                 0.073940
1   ACA894              180             14.2512                 0.079173
2   ACA900              180             13.6332                 0.075740
3  ACA1074              180             13.7808                 0.076560
4   ACA878              180             12.9648                 0.072027

*** Top 5 Air Canada Routes by Frequency: ***
AC7: 1 flights observed.
AC894: 1 flights observed.
AC900: 1 flights observed.
AC1074: 1 flights observed.
AC878: 1 flights observed.
