# Using Aviation Edge and OpenSky APIs for historical data analysis

## Phase 1: Data Collection

### Real-Time Flight Data

In [4]:
# Importing modules

import requests
import pandas as pd
import nest_asyncio
import asyncio
from datetime import datetime, timedelta
import time
import json
import random

In [5]:
# Setting up OpenSky

# Async OpenSky client from the third-party `python_opensky` package
from python_opensky import OpenSky
# nest_asyncio patches asyncio so an event loop that is already running
# (presumably the notebook kernel's loop) can be re-entered by later cells
nest_asyncio.apply()

class AirCanadaDataCollector:
    """Collects Air Canada flight data through the OpenSky client.

    Air Canada flights are recognised by callsigns starting with ``ACA``.
    """

    # Callsign prefix used to pick Air Canada flights out of the state vectors
    CALLSIGN_PREFIX = 'ACA'

    # Column order of the frame returned by collect_realtime_flights(); passing it
    # explicitly gives an empty result the same schema as a populated one
    REALTIME_COLUMNS = [
        'timestamp', 'callsign', 'icao24', 'longitude', 'latitude',
        'altitude', 'velocity', 'heading', 'vertical_rate', 'on_ground',
    ]

    def __init__(self):
        self.flight_data = []
        self.historical_data = pd.DataFrame()   # DataFrame for storing Air Canada flights

    async def collect_realtime_flights(self):
        """Collects current Air Canada flight data.

        Fetches the current state vectors, keeps those whose stripped callsign
        starts with ``CALLSIGN_PREFIX`` and returns them as a DataFrame.

        Returns:
            pandas.DataFrame with the columns listed in ``REALTIME_COLUMNS``
            ('altitude' is the barometric altitude, 'heading' the true track).
            The frame has zero rows, but still those columns, when no Air
            Canada flight is present.
        """
        async with OpenSky() as opensky:

            # Getting most recent flight data
            states = await opensky.get_states()

            # One timestamp for the whole snapshot: calling datetime.now() per
            # row gave rows of the same snapshot slightly different timestamps
            snapshot_time = datetime.now()

            ac_flights = []

            # Filtering to get Air Canada flight data only; `or []` guards
            # against a response that carries no state list at all
            for state in (states.states or []):
                callsign = (state.callsign or '').strip()
                if not callsign.startswith(self.CALLSIGN_PREFIX):
                    continue

                # Adds the current Air Canada flight details to the list
                ac_flights.append({
                    'timestamp': snapshot_time,
                    'callsign': callsign,
                    'icao24': state.icao24,
                    'longitude': state.longitude,
                    'latitude': state.latitude,
                    'altitude': state.barometric_altitude,
                    'velocity': state.velocity,
                    'heading': state.true_track,
                    'vertical_rate': state.vertical_rate,
                    'on_ground': state.on_ground
                })

            return pd.DataFrame(ac_flights, columns=self.REALTIME_COLUMNS)

    async def collect_historical_flights(self, start_time, end_time):
        """Collects historical flight data for a specific time range.

        Args:
            start_time: start of the range; must provide ``.timestamp()``
                (e.g. a ``datetime``).
            end_time: end of the range; must provide ``.timestamp()``.

        Returns:
            Whatever the client's ``get_flights_by_aircraft`` call returns.
        """
        async with OpenSky() as opensky:
            # NOTE(review): no aircraft identifier (icao24) is passed even though
            # the call is named "by aircraft" - confirm against the python_opensky
            # API that this method exists and accepts only begin/end.
            flights = await opensky.get_flights_by_aircraft(
                begin=int(start_time.timestamp()),
                end=int(end_time.timestamp())
            )
            return flights

# Initializes collector object
collector = AirCanadaDataCollector()

# Collects current flights (top-level `await` only works where an event loop is already running, e.g. in the notebook)
current_flights = await collector.collect_realtime_flights()
print(f"Collected {len(current_flights)} current Air Canada flights")
print(current_flights)

Collected 78 current Air Canada flights
                    timestamp callsign  icao24  longitude  latitude  altitude  \
0  2025-06-29 17:37:11.787163   ACA760  c07c7a   -77.2774   46.2611  10668.00   
1  2025-06-29 17:37:11.787163   ACA506  c07e2c   -86.2804   42.3094   7985.76   
2  2025-06-29 17:37:11.787163  ACA1186  c07e32   -73.4456   43.4302  11277.60   
3  2025-06-29 17:37:11.787163   ACA733  c07e33   -73.6448   41.0390   3268.98   
4  2025-06-29 17:37:11.787163   ACA112  c01040  -122.0853   49.1494   6111.24   
..                        ...      ...     ...        ...       ...       ...   
73 2025-06-29 17:37:11.789625   ACA460  c005c5   -79.6346   43.6938       NaN   
74 2025-06-29 17:37:11.789625   ACA115  c0192e   -94.9141   47.9360   9136.38   
75 2025-06-29 17:37:11.789625   ACA585  c00821   -80.7518   43.6274  10370.82   
76 2025-06-29 17:37:11.789625   ACA238  c006ec  -121.7724   50.0930   7338.06   
77 2025-06-29 17:37:11.789625   ACA356  c00757  -112.5554   51.0126  

### Air Canada Route and Schedule Data

In [9]:
# Creating a class that processes airport data and identifies high-traffic routes

class AirCanadaRoute:
    """Holds hub-airport reference data and ranks flight numbers by frequency."""

    # Callsign prefix that marks an Air Canada flight; the remainder of the
    # callsign is treated as the flight number
    CALLSIGN_PREFIX = 'ACA'

    def __init__(self):
        self.major_hubs = ["YUL", "YYZ", "YYC", "YVR"]  # Major Hubs: Montreal, Toronto, Calgary & Vancouver
        self.route_data = []

    def get_airport_data(self):
        """Returns hard-coded annual passenger figures for the four major hubs.

        Figures were taken from yyc.com, admtl.com,
        internationalairportreview.com and torontopearson.com.

        Returns:
            dict keyed by airport code; each value is a dict with the keys
            "name" and "annual_passengers". Note that "annual_passengers" is
            stored as a string of digits, so convert with ``int()`` before
            doing arithmetic on it.
        """
        airport_data = {
            "YUL" : {"name": "Montreal Trudeau", "annual_passengers": "22400000"},
            "YYZ" : {"name": "Toronto Pearson", "annual_passengers": "46800000"},
            "YYC" : {"name": "Calgary International", "annual_passengers": "18900000"},
            "YVR" : {"name": "Vancouver International", "annual_passengers": "26200000"}
        }
        return airport_data

    def identify_high_traffic_routes(self, flight_data):
        """Counts how often each Air Canada flight number appears.

        The "route" is approximated by the flight number, i.e. the part of the
        callsign after the ``ACA`` prefix; no origin/destination is looked up.

        Args:
            flight_data: pandas.DataFrame with a "callsign" column. Missing
                callsigns (None/NaN) and callsigns of other airlines are
                skipped; surrounding whitespace is stripped.

        Returns:
            dict mapping flight number (str) to its count, ordered from the
            most to the least frequent. Ties keep first-seen order. An empty
            frame yields an empty dict.
        """
        # An empty frame may not even have a "callsign" column
        if flight_data.empty:
            return {}

        prefix = self.CALLSIGN_PREFIX
        route_frequency = {}

        # Iterating the column directly avoids building a Series per row
        for callsign in flight_data["callsign"]:

            # Missing values arrive as None/NaN, which have no string methods
            if not isinstance(callsign, str):
                continue

            callsign = callsign.strip()
            if callsign.startswith(prefix):
                flight_number = callsign[len(prefix):]

                # Key: flight_number, value: number of occurrences
                route_frequency[flight_number] = route_frequency.get(flight_number, 0) + 1

        return dict(sorted(route_frequency.items(), key=lambda item: item[1], reverse=True))

In [11]:
# Creating the AirCanadaRoute object, loading the hub reference data and
# counting flight numbers in the snapshot collected above (current_flights)

route_collector = AirCanadaRoute()
airport_data = route_collector.get_airport_data()
high_traffic_routes = route_collector.identify_high_traffic_routes(current_flights)

In [13]:
# In the recorded output every count is 1: one snapshot lists each flight
# number once, so the ranking presumably needs several snapshots to be useful
print(high_traffic_routes)

{'760': 1, '506': 1, '1186': 1, '733': 1, '112': 1, '895': 1, '095': 1, '068': 1, '931': 1, '788': 1, '740': 1, '1358': 1, '575': 1, '3': 1, '905': 1, '063': 1, '168': 1, '983': 1, '526': 1, '540': 1, '741': 1, '771': 1, '266': 1, '265': 1, '217': 1, '626': 1, '361': 1, '110': 1, '791': 1, '1310': 1, '548': 1, '1034': 1, '554': 1, '1300': 1, '648': 1, '305': 1, '145': 1, '416': 1, '142': 1, '1171': 1, '567': 1, '1282': 1, '487': 1, '990': 1, '111': 1, '668': 1, '323': 1, '1176': 1, '324': 1, '1337': 1, '029': 1, '778': 1, '738': 1, '6': 1, '891': 1, '897': 1, '2': 1, '304': 1, '1178': 1, '108': 1, '1040': 1, '770': 1, '2179': 1, '592': 1, '940': 1, '7225': 1, '171': 1, '113': 1, '576': 1, '555': 1, '774': 1, '877': 1, '7243': 1, '460': 1, '115': 1, '585': 1, '238': 1, '356': 1}


### External Data Collection (Weather, Holidays & Seasonal Factors)

In [16]:
class ExternalDataCollector:
    """Provides external factors for a flight date: weather, holidays and season."""

    def __init__(self):
        # Base URL of the weather service and the 2025 public holidays, kept for later use
        self.weather_api = "https://api.weather.gc.ca"
        self.holidays_2025 = [
            "2025-01-01",  # New Year's Day
            "2025-04-18",  # Good Friday
            "2025-05-19",  # Victoria Day
            "2025-07-01",  # Canada Day
            "2025-09-01",  # Labour Day
            "2025-10-13",  # Thanksgiving
            "2025-11-11",  # Remembrance Day
            "2025-12-25",  # Christmas Day
            "2025-12-26"   # Boxing Day
        ]

    def get_weather_data(self, airport_code, date):
        """Returns a weather record for an airport on a given date.

        This is currently a placeholder: no request is made to
        ``self.weather_api`` and every measurement is ``None``.

        Args:
            airport_code: airport identifier, stored under "airport".
            date: the date of interest, stored unchanged under "date".

        Returns:
            dict with the keys "airport", "date", "temperature",
            "percipitation", "visibility", "wind_speed" and "conditions".
        """
        weather_data = {
            "airport": airport_code,
            "date": date,
            "temperature": None,
            # NOTE(review): key is misspelt ("precipitation"); left unchanged
            # because other cells may already read it under this name
            "percipitation": None,
            "visibility": None,
            "wind_speed": None,
            "conditions": None
        }
        return weather_data

    def is_holiday(self, date):
        """Checks whether the given date is one of the listed 2025 holidays.

        Args:
            date: a ``date``/``datetime`` (anything with ``strftime``).

        Returns:
            True when the date, formatted as YYYY-MM-DD, is in
            ``self.holidays_2025``; dates in other years are never holidays.
        """
        date_str = date.strftime("%Y-%m-%d")
        return date_str in self.holidays_2025

    def get_seasonal_factors(self, date):
        """Returns the season label for the month of the given date.

        Args:
            date: object with a ``month`` attribute (1-12).

        Returns:
            One of "winter_months", "spring_months", "summer_months" or
            "fall_months"; "Unknown" if the month matches no season.
        """
        month = date.month
        seasons = {
            "winter_months": [12, 1, 2],
            "spring_months": [3, 4, 5],
            "summer_months": [6, 7, 8],
            "fall_months": [9, 10, 11]
        }

        for season, months in seasons.items():
            if month in months:
                return season

        # Only reached after every season was checked. The fallback used to sit
        # in an `else` inside the loop, so any non-winter month gave "Unknown".
        return "Unknown"

In [18]:
# Creating the ExternalDataCollector object (2025 holiday list, season lookup and placeholder weather records)

external_collector = ExternalDataCollector()

### Passenger Booking Pattern Simulator

In [21]:
# Simulating realistic passenger bookings, since real passenger data is not available to me yet

class PassengerBookingSimulator:
    """Generates synthetic passenger bookings with a simulated no-show outcome."""

    # Starting no-show probability, according to the industry average (5%-15%)
    BASE_NO_SHOW_PROBABILITY = 0.1

    # Upper cap applied to the predicted no-show probability
    MAX_NO_SHOW_PROBABILITY = 0.15

    # Column order of the generated frame; passing it explicitly gives an empty
    # result the same schema as a populated one
    BOOKING_COLUMNS = [
        "passenger_id", "flight_callsign", "booking_data",
        "advance_purchase_days", "fare_class", "passenger_type",
        "predicted_no_show_prob", "actual_no_show",
    ]

    def __init__(self):
        # Passenger booking features that affect no-show probability

        self.fare_class = ["Basic", "Standard", "Flex", "Premium", "Business"]
        self.advance_purchase_window = [1, 7, 14, 30, 60, 90]  # Time in days
        self.passenger_types = ["Business", "Leisure", "VFR"]

    def generate_booking_data(self, flight_info, num_of_passengers=200, rng=None):
        """Generates realistic passenger booking data for a flight.

        Each passenger gets a random advance-purchase window, fare class and
        passenger type. The base no-show probability is scaled down for
        last-minute bookings (x0.7), Premium/Business fares (x0.5) and
        business travellers (x0.6), then capped, and the actual outcome is
        drawn with that probability.

        Args:
            flight_info: mapping (dict or DataFrame row) with a "callsign" entry.
            num_of_passengers: number of passengers to generate (default 200).
            rng: optional ``random.Random`` instance for reproducible output;
                the module-level ``random`` generator is used when omitted.

        Returns:
            pandas.DataFrame with one row per passenger and the columns listed
            in ``BOOKING_COLUMNS``. "booking_data" holds the simulated booking
            date (generation time minus the advance-purchase days).
        """
        rng = random if rng is None else rng
        callsign = flight_info["callsign"]

        # One reference time for the whole batch keeps booking dates consistent
        generated_at = datetime.now()

        passengers = []

        for i in range(num_of_passengers):
            # Generates passenger features that research shows affect no-show rates

            advance_purchase = rng.choice(self.advance_purchase_window)
            fare_class = rng.choice(self.fare_class)
            passenger_type = rng.choice(self.passenger_types)

            # Factors that correlate with no-show probability according to research (https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=d3572abdaf3d12bd8e85471b1b217c495f7662d9)
            no_show_prob = self.BASE_NO_SHOW_PROBABILITY

            if advance_purchase <= 7:
                no_show_prob *= 0.7  # Last-minute bookings are less likely to be a no-show

            if fare_class in ["Premium", "Business"]:
                no_show_prob *= 0.5  # Premium and Business bookings are less likely to be a no-show

            if passenger_type == "Business":
                no_show_prob *= 0.6  # Passengers travelling for business are less likely to be a no-show

            no_show_prob = min(no_show_prob, self.MAX_NO_SHOW_PROBABILITY)  # Caps at 15%

            passengers.append({
                "passenger_id": f"PAX_{callsign}_{i:03d}",
                "flight_callsign": callsign,
                # NOTE(review): key reads like a typo for "booking_date"; kept
                # unchanged because other cells may refer to it by this name
                "booking_data": generated_at - timedelta(days=advance_purchase),
                "advance_purchase_days": advance_purchase,
                "fare_class": fare_class,
                "passenger_type": passenger_type,
                "predicted_no_show_prob": no_show_prob,
                # A no-show happens with probability no_show_prob, so the draw
                # must fall BELOW it (the comparison used to be inverted)
                "actual_no_show": rng.random() < no_show_prob
            })

        return pd.DataFrame(passengers, columns=self.BOOKING_COLUMNS)

In [23]:
# Creating the PassengerBookingSimulator object

passenger_simulation = PassengerBookingSimulator()

### Building a data collection pipeline

In [26]:
# The AirCanadaPipeline class collects real-time flight details, analyzes the routes, collects external data for each flight and predicts the no-show rate

In [37]:
class AirCanadaPipeline:

    def __init__(self):
        """Creates one instance of each component the pipeline works with."""

        # Initializing all the objects
        self.flight_collector = AirCanadaDataCollector()
        self.route_collector = AirCanadaRoute()
        self.external_data_collector = ExternalDataCollector()
        # The simulator class defined in this notebook is PassengerBookingSimulator;
        # the previously used name PassengerDataSimulator is not defined in the
        # visible cells and would raise NameError
        self.passenger_simulator = PassengerBookingSimulator()

    # Collect real-time flight details
    # Analyze the routes for the flights
    # Collect external data for each flight