# Using Aviation Edge and OpenSky APIs for historical data analysis

## Phase 1: Data Collection

### Real-Time Flight Data

In [92]:
# Importing modules

import requests
import pandas as pd
import nest_asyncio
import asyncio
from datetime import datetime, timedelta
import time
import json
import random

In [94]:
# Setting up OpenSky

from python_opensky import OpenSky
nest_asyncio.apply()

class AirCanadaDataCollector:
    """Collects Air Canada flight data through the OpenSky client.

    ``flight_data`` and ``historical_data`` are initialised empty; none of
    the methods in this class write to them, results are handed back to the
    caller as return values instead.
    """

    # Column order of the frame returned by ``collect_realtime_flights``.
    # Declared explicitly so that a snapshot without any Air Canada flight
    # still produces a frame with the expected columns.
    _REALTIME_COLUMNS = [
        'timestamp', 'callsign', 'icao24', 'longitude', 'latitude',
        'altitude', 'velocity', 'heading', 'vertical_rate', 'on_ground',
    ]

    def __init__(self):
        self.flight_data = []
        self.historical_data = pd.DataFrame()   # DataFrame for storing Air Canada flights

    async def collect_realtime_flights(self):
        """Collects current Air Canada flight data.

        Returns:
            pd.DataFrame: one row per state vector whose stripped callsign
            starts with 'ACA'. Every row carries the same ``timestamp``
            (local, naive ``datetime.now()``) taken once per snapshot.
            The frame has the ``_REALTIME_COLUMNS`` columns even when no
            flight matched.
        """
        async with OpenSky() as opensky:

            # Getting most recent flight data
            states = await opensky.get_states()

            # One timestamp for the whole snapshot: all rows come from the
            # same response, so they should not differ by loop latency.
            snapshot_time = datetime.now()

            # Tolerate a missing/empty state list instead of failing in the loop
            state_vectors = states.states or []

            ac_flights = []

            # Filtering to get Air Canada flight data only
            for state in state_vectors:
                callsign = state.callsign.strip() if state.callsign else ''
                if not callsign.startswith('ACA'):
                    continue

                # Adds the current Air Canada flight details to the list
                ac_flights.append({
                    'timestamp': snapshot_time,
                    'callsign': callsign,
                    'icao24': state.icao24,
                    'longitude': state.longitude,
                    'latitude': state.latitude,
                    'altitude': state.barometric_altitude,
                    'velocity': state.velocity,
                    'heading': state.true_track,
                    'vertical_rate': state.vertical_rate,
                    'on_ground': state.on_ground
                })

            return pd.DataFrame(ac_flights, columns=self._REALTIME_COLUMNS)

    async def collect_historical_flights(self, start_time, end_time):
        """Collects historical flight data for a specific time range.

        Args:
            start_time: datetime marking the start of the range; converted
                to whole epoch seconds.
            end_time: datetime marking the end of the range; converted to
                whole epoch seconds.

        Returns:
            Whatever the client's ``get_flights_by_aircraft`` call returns.
        """
        # NOTE(review): no aircraft identifier (icao24) is passed here, although
        # the method name suggests a per-aircraft query - confirm the expected
        # arguments against the python_opensky client before relying on this.
        async with OpenSky() as opensky:
            # OpenSky provides historical data access
            flights = await opensky.get_flights_by_aircraft(
                begin=int(start_time.timestamp()),
                end=int(end_time.timestamp())
            )
            return flights

# Initializes collector object
collector = AirCanadaDataCollector()

# Collects current flights (top-level `await`: this runs as a notebook cell)
current_flights = await collector.collect_realtime_flights()
print(f"Collected {len(current_flights)} current Air Canada flights")
print(current_flights)

Collected 86 current Air Canada flights
                    timestamp callsign  icao24  longitude  latitude  altitude  \
0  2025-06-29 19:07:51.882994   ACA772  c07c7b   -72.0295   42.6124   5372.10   
1  2025-06-29 19:07:51.882994   ACA733  c07e33   -73.7552   45.4576       NaN   
2  2025-06-29 19:07:51.882994   ACA030  c0104f  -123.1813   49.1955       NaN   
3  2025-06-29 19:07:51.882994    ACA17  c01049   123.3402   16.0001  12192.00   
4  2025-06-29 19:07:51.882994     ACA7  c01074  -159.8760   57.0108  10363.20   
..                        ...      ...     ...        ...       ...       ...   
81 2025-06-29 19:07:51.891554   ACA667  c04fbb   -70.0264   45.8539  10363.20   
82 2025-06-29 19:07:51.891554   ACA344  c04ff7  -113.8961   48.9480  11269.98   
83 2025-06-29 19:07:51.891554  ACA1058  c00700  -122.6374   48.4150   6614.16   
84 2025-06-29 19:07:51.891554   ACA115  c0192e  -110.5517   51.0190  10378.44   
85 2025-06-29 19:07:51.891554   ACA585  c00821   -94.5676   48.4619  

### Air Canada Route and Schedule Data

In [96]:
# Creating a class processing airport data and identifying high traffic routes

class AirCanadaRoute:
    """Hub airport reference data plus a frequency ranking of observed flight numbers."""

    def __init__(self):
        # Major hubs: Montreal, Toronto, Calgary & Vancouver
        self.major_hubs = ["YUL", "YYZ", "YYC", "YVR"]
        self.route_data = []

    def get_airport_data(self):
        """Returns hard-coded traffic figures for the four hub airports.

        The numbers are stated to come from yyc.com, admtl.com,
        internationalairportreview.com and torontopearson.com.

        Returns:
            dict: airport code -> {"name", "annual_passengers"}. Note that
            ``annual_passengers`` is stored as a string, not a number.
        """
        hub_table = (
            ("YUL", "Montreal Trudeau", "22400000"),
            ("YYZ", "Toronto Pearson", "46800000"),
            ("YYC", "Calgary International", "18900000"),
            ("YVR", "Vancouver International", "26200000"),
        )
        return {
            code: {"name": airport_name, "annual_passengers": passengers}
            for code, airport_name, passengers in hub_table
        }

    def identify_high_traffic_routes(self, flight_data):
        """Counts how often each Air Canada flight number occurs in ``flight_data``.

        Args:
            flight_data: DataFrame whose rows expose a "callsign" string.

        Returns:
            dict: flight number (the callsign without its 'ACA' prefix) ->
            number of rows seen, ordered from most to least frequent; ties
            keep the order in which they first appeared.
        """
        counts = {}

        for _, row in flight_data.iterrows():
            sign = row["callsign"]

            # Anything that is not an Air Canada callsign is ignored
            if not sign.startswith('ACA'):
                continue

            number = sign[3:]
            if number in counts:
                counts[number] += 1
            else:
                counts[number] = 1

        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked)

In [97]:
# Creating the AirCanadaRoute object, then loading the hub airport data and
# counting flight numbers in the real-time snapshot collected earlier

route_collector = AirCanadaRoute()
airport_data = route_collector.get_airport_data()
high_traffic_routes = route_collector.identify_high_traffic_routes(current_flights)

In [98]:
# Flight number -> number of occurrences in the snapshot, most frequent first
print(high_traffic_routes)

{'772': 1, '733': 1, '030': 1, '17': 1, '7': 1, '068': 1, '2334': 1, '788': 1, '740': 1, '2114': 1, '793': 1, '3': 1, '8': 1, '23': 1, '062': 1, '33': 1, '526': 1, '921': 1, '376': 1, '148': 1, '1092': 1, '306': 1, '742': 1, '1396': 1, '266': 1, '110': 1, '791': 1, '581': 1, '1034': 1, '554': 1, '305': 1, '566': 1, '1311': 1, '715': 1, '7263': 1, '840': 1, '694': 1, '145': 1, '777': 1, '9XC': 1, '1282': 1, '990': 1, '857': 1, '326': 1, '60': 1, '461': 1, '668': 1, '323': 1, '308': 1, '1115': 1, '763': 1, '324': 1, '556': 1, '615': 1, '599': 1, '617': 1, '267': 1, '619': 1, '778': 1, '894': 1, '418': 1, '150': 1, '304': 1, '861': 1, '119': 1, '164': 1, '108': 1, '1040': 1, '773': 1, '2179': 1, '1029': 1, '464': 1, '421': 1, '7225': 1, '146': 1, '171': 1, '113': 1, '151': 1, '184': 1, '555': 1, '527': 1, '667': 1, '344': 1, '1058': 1, '115': 1, '585': 1}


### External Data Collection (Weather, Holidays & Seasonal Factors)

In [104]:
class ExternalDataCollector:
    """Provides external factors for a flight: weather, public holidays and season."""

    def __init__(self):
        # Base URL of the weather service; nothing in this class requests it yet.
        self.weather_api = "https://api.weather.gc.ca"

        # Public holidays for 2025 only, as "YYYY-MM-DD" strings
        self.holidays_2025 = [
            "2025-01-01",  # New Year's Day
            "2025-04-18",  # Good Friday
            "2025-05-19",  # Victoria Day
            "2025-07-01",  # Canada Day
            "2025-09-01",  # Labour Day
            "2025-10-13",  # Thanksgiving
            "2025-11-11",  # Remembrance Day
            "2025-12-25",  # Christmas Day
            "2025-12-26"   # Boxing Day
        ]

    def get_weather_data(self, airport_code, date):
        """Gets weather data for a specific airport on a specific date.

        Placeholder: only ``airport`` and ``date`` are filled in; every
        measurement field is returned as ``None``.

        Args:
            airport_code: airport identifier, stored as given.
            date: date of interest, stored as given.

        Returns:
            dict: a new dict on every call.
        """
        weather_data = {
            "airport": airport_code,
            "date": date,
            "temperature": None,
            # NOTE(review): key is misspelled ("percipitation"); left as is
            # because consumers of this dict may already rely on it.
            "percipitation": None,
            "visibility": None,
            "wind_speed": None,
            "conditions": None
        }
        return weather_data

    def is_holiday(self, date):
        """Checks whether ``date`` is one of the listed 2025 public holidays.

        Args:
            date: object with ``strftime`` (e.g. ``datetime`` or ``date``).

        Returns:
            bool: True if the date is in ``holidays_2025``. Dates in any
            other year are never reported as holidays.
        """
        date_str = date.strftime("%Y-%m-%d")
        return date_str in self.holidays_2025

    def get_seasonal_factors(self, date):
        """Gets the season label for ``date``.

        Args:
            date: object exposing a ``month`` attribute.

        Returns:
            str: "winter_months", "spring_months", "summer_months" or
            "fall_months"; "Unknown" only if the month matches none of them.
        """
        month = date.month
        seasons = {
            "winter_months": [12, 1, 2],
            "spring_months": [3, 4, 5],
            "summer_months": [6, 7, 8],
            "fall_months": [9, 10, 11]
        }

        for season, months in seasons.items():
            if month in months:
                return season

        # Fallback must sit after the loop: returning it from inside the loop
        # reported "Unknown" for every month that was not a winter month.
        return "Unknown"

In [106]:
# Creating the ExternalDataCollector object (weather placeholder, holiday and season lookups)

external_collector = ExternalDataCollector()

### Passenger Booking Pattern Simulator

In [109]:
# Simulates realistic passenger bookings, since real passenger data is not available to me yet

class PassengerBookingSimulator:
    """Generates synthetic passenger bookings, including a simulated no-show outcome."""

    def __init__(self):
        # Passenger booking features that affect no-show probability

        self.fare_class = ["Basic", "Standard", "Flex", "Premium", "Business"]
        self.advance_purchase_window = [1, 7, 14, 30, 60, 90]  # Time in days
        self.passenger_types = ["Business", "Leisure", "VFR"]

    def generate_booking_data(self, flight_info, num_passengers = 180):
        """Generates simulated passenger booking data for a flight.

        Uses the module-level ``random`` generator, so results are only
        reproducible if the caller seeds it.

        Args:
            flight_info: mapping (dict or pandas row) with a "callsign" entry.
            num_passengers: number of bookings to generate.

        Returns:
            pd.DataFrame: one row per passenger with the columns
            passenger_id, flight_callsign, booking_data,
            advance_purchase_days, fare_class, passenger_type,
            predicted_no_show_prob and actual_no_show.
        """
        passengers = []

        # Single reference time so every booking date is measured from the same instant
        generated_at = datetime.now()

        for i in range(num_passengers):
            # Generates passenger features that research shows affect no-show rates

            advance_purchase = random.choice(self.advance_purchase_window)
            fare_class = random.choice(self.fare_class)
            passenger_type = random.choice(self.passenger_types)

            # Factors that correlate with no-show probability according to research (https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=d3572abdaf3d12bd8e85471b1b217c495f7662d9)
            no_show_prob = 0.1  # According to industry average (5%-15%)

            if advance_purchase <= 7:
                no_show_prob *= 0.7  # Last-minute bookings are less likely to be a no-show

            if fare_class in ["Premium", "Business"]:
                no_show_prob *= 0.5  # Premium and Business bookings are less likely to be a no-show

            if passenger_type == "Business":
                no_show_prob *= 0.6  # Passengers travelling for business are less likely to be a no-show

            no_show_prob = min(no_show_prob, 0.15)  # Caps at 15%

            passenger = {
                "passenger_id": f"PAX_{flight_info['callsign']}_{i:03d}",
                "flight_callsign": flight_info["callsign"],
                # NOTE(review): key reads "booking_data" but holds the booking
                # date; kept unchanged so the output schema stays the same.
                "booking_data": generated_at - timedelta(days=advance_purchase),
                "advance_purchase_days": advance_purchase,
                "fare_class": fare_class,
                "passenger_type": passenger_type,
                "predicted_no_show_prob": no_show_prob,
                # A no-show happens WITH probability no_show_prob, hence `<`.
                # The previous `>` marked roughly 94% of passengers as no-shows.
                "actual_no_show": random.random() < no_show_prob  # Returns boolean
            }
            passengers.append(passenger)

        return pd.DataFrame(passengers)

In [111]:
# Creating the PassengerBookingSimulator object

passenger_simulation = PassengerBookingSimulator()

### Building a data collection pipeline

In [114]:
# The AirCanadaPipeline class collects real-time flight details, analyzes the routes, attaches external data to each flight and estimates the no-show rate from simulated bookings

In [116]:
class AirCanadaPipeline:
    """Runs flight collection, route counting, external factors and booking simulation end to end."""

    def __init__(self):

        # Initializing all the objects
        self.flight_collector = AirCanadaDataCollector()
        self.route_collector = AirCanadaRoute()
        self.external_data_collector = ExternalDataCollector()
        self.passenger_simulator = PassengerBookingSimulator()

    async def run_complete_collection(self):
        """Runs the complete data collection.

        Progress is reported with ``print``.

        Returns:
            tuple: ``(tuned_df, high_traffic_routes)`` where ``tuned_df`` has
            one row per collected flight (the flight's own fields plus
            weather_conditions, is_holiday, season, passenger_count,
            predicted_no_shows and historical_no_show_rate) and
            ``high_traffic_routes`` is the route collector's result. When no
            flight was collected ``tuned_df`` is an empty frame.
        """
        print("Starting Air Canada Data Collection...")

        # Collects real-time Air Canada flight data
        print("Collecting real-time flight data...")
        current_flights = await self.flight_collector.collect_realtime_flights()

        # Analyze the routes for the flights
        print("Analyzing flight route patterns...")
        high_traffic_routes = self.route_collector.identify_high_traffic_routes(current_flights)

        # Collect external data for each flight
        print("Collecting external factors...")

        # Holiday and season depend only on the collection time, not on the
        # flight, so they are evaluated once against a single timestamp.
        collected_at = datetime.now()
        is_holiday = self.external_data_collector.is_holiday(collected_at)
        season = self.external_data_collector.get_seasonal_factors(collected_at)

        tuned_flights = []

        # Iterates through each flight
        for _, flight in current_flights.iterrows():

            # Adds weather data (fetched per flight so every row owns its dict).
            # NOTE(review): the airport is hard-coded to "YYZ" for all flights.
            weather = self.external_data_collector.get_weather_data("YYZ", collected_at)

            # Runs passenger booking simulation for this flight
            passengers = self.passenger_simulator.generate_booking_data(flight)
            no_show_probs = passengers["predicted_no_show_prob"]

            tuned_flights.append({
                **flight.to_dict(),  # Converts the flight data to dictionary representation
                "weather_conditions": weather,
                "is_holiday": is_holiday,
                "season": season,
                "passenger_count": len(passengers),
                "predicted_no_shows": no_show_probs.sum(),
                # Mean of the simulated probabilities, not an observed rate
                "historical_no_show_rate": no_show_probs.mean()
            })

        tuned_df = pd.DataFrame(tuned_flights)  # Creates DataFrame for the tuned_flights

        print(f"Complete! Collected data for {len(tuned_df)} flights.")
        if tuned_df.empty:
            # Without flights the rate column does not exist; looking it up would raise KeyError
            print("Average predicted no-show rate: n/a (no flights collected)")
        else:
            print(f"Average predicted no-show rate: {tuned_df['historical_no_show_rate'].mean():.3f}")

        return tuned_df, high_traffic_routes

In [118]:
# Runs the Pipeline (top-level `await`: this runs as a notebook cell)

pipeline = AirCanadaPipeline()
flight_data, route_analysis = await pipeline.run_complete_collection()
        
# Displays the output of pipeline: first rows of the per-flight no-show columns

print("\n*** Air Canada Flight Analysis ***")
print(flight_data[['callsign', 'passenger_count', 'predicted_no_shows', 'historical_no_show_rate']].head())

# route_analysis is keyed by flight number and ordered most frequent first,
# so the first five items are the five most frequently observed flight numbers
print(f"\nTop 5 Air Canada Routes by Frequency:")
for route, frequency in list(route_analysis.items())[:5]:
    print(f"AC{route}: {frequency} flights observed.")

Starting Air Canada Data Collection...
Collecting real-time flight data...
Analyzing flight route patterns...
Collecting external factors...
Complete! Collected data for 86 flights.
Average predicted no-show rate: 0.062

*** Air Canada Flight Analysis ***
  callsign  passenger_count  predicted_no_shows  historical_no_show_rate
0   ACA772              180              11.415                 0.063417
1   ACA733              180              11.648                 0.064711
2   ACA030              180              11.262                 0.062567
3    ACA17              180              11.210                 0.062278
4     ACA7              180              10.971                 0.060950

Top 5 Air Canada Routes by Frequency:
AC772: 1 flights observed.
AC733: 1 flights observed.
AC030: 1 flights observed.
AC17: 1 flights observed.
AC7: 1 flights observed.
