# Process blog post:

Like so many others out there this quarantine has instilled a certain level of wanderlust in me. And obviously before I commit to any location I wanted to know that I'm not getting scalped on the airfare. To put my (and possibly your) mind at ease I've looked at historical airfare prices across X popular routes to get a view on the price fluctiations you can expect if you're travelling.

## Table of Contents:
0. Exec summary
1. Data collection
    - Schema and collection method
    - Sources
        - Airports & Routes
        - Flights & Prices
2. Data preparation and feature engineering
    - TBC
3. Model generation
4. Model validation
5. Output and visualizations
6. Extensions

## To do:
Updates for streamlining:
- collect regional mapping table of ICAO codes (e.g first letter K = USA)
- collect airline IATA codes
- possible additional data collection
    - aircraft codes
    - public aviation registers (# planes and type)

## Re-structuring airport and supplements

In [110]:
import numpy as np
import pandas as pd
import requests
import json
import sqlite3

In [134]:
# open all the csv files
airports = pd.read_csv("data/airports.csv")
destinations = pd.read_csv("data/destinations.csv")
basics = pd.read_csv("data/basics.csv")
frequency = pd.read_csv("data/frequency.csv")
runways = pd.read_csv("data/runway.csv")

In [135]:
# data integrity checks
airports["country_comparison"] = airports["Country"].str.replace(" ", "-").str.lower()
airports["url_country"] = airports["URL"].str.split("/").str[1]
airports["check"] = airports["country_comparison"] != airports["url_country"]

# discrepancies to check
airports[airports["check"]].groupby(["url_country", "country_comparison"]).count()

# duplicate URLs
print(f"There are { sum(airports.duplicated('URL')) } duplicated URLS")
airports[airports.duplicated("URL")]

There are 0 duplicated URLS


Unnamed: 0,Airport,Type,City,Country,IATA,ICAO,FAA,URL,country_comparison,url_country,check


In [150]:
# join key
join_key = ["Country", "Airport"]

# basics joins into main table to add long/lat/timezone
basics_t = basics.set_index(join_key + ["Metric"])['Value'].unstack().reset_index()
pd.merge(airports, basics_t, how="left", on=join_key)

# check IATA, ICAO, FAA codes are the same

# drop extra columns

# flag duplicate IATA codes in airports
dup_index = airports.groupby("IATA").count()[airports.groupby("IATA").count()["Airport"]>1]
pd.merge(airports[airports["Type"]!="Closed"], dup_index, how="inner", on="IATA")#.to_csv("debug.csv")

# make clean version without nan and dups in IATA for joins


Unnamed: 0,Airport,Type,City,Country,IATA,ICAO,FAA,URL,country_comparison,url_country,check,FAA Code,IATA Code,ICAO Code,Latitude,Longitude,Time Zone
0,Ajrestan,Small airport,Ajrestan,Afghanistan,,,,/afghanistan/ajrestan-57879.html,afghanistan,afghanistan,False,,,,33.4852982,67.1440964,Asia/Kabul (GMT +4.5:00)
1,Andkhoi,Small airport,Andkhoi,Afghanistan,,OAAK,,/afghanistan/andkhoi-74127.html,afghanistan,afghanistan,False,,,OAAK,36.9433222,65.2069667,Asia/Samarkand (GMT +5:00)
2,Bagram Air Base,Medium airport,Bagram,Afghanistan,OAI,OAIX,,/afghanistan/bagram-air-base-74128.html,afghanistan,afghanistan,False,,OAI,OAIX,34.9460983,69.2649994,Asia/Kabul (GMT +4.5:00)
3,Bamiyan,Small airport,Bamiyan,Afghanistan,BIN,OABN,,/afghanistan/bamiyan-566.html,afghanistan,afghanistan,False,,BIN,OABN,34.8170013,67.8170013,Asia/Kabul (GMT +4.5:00)
4,Bost,Small airport,Bost,Afghanistan,BST,OABT,,/afghanistan/bost-976.html,afghanistan,afghanistan,False,,BST,OABT,31.5597,64.3649979,Asia/Kabul (GMT +4.5:00)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44097,Victoria Falls International,Medium airport,Victoria Falls,Zimbabwe,VFA,FVFA,,/zimbabwe/victoria-falls-7409.html,zimbabwe,zimbabwe,False,,,,,,
44098,Wedza,Small airport,Wedza,Zimbabwe,,FVWD,,/zimbabwe/wedza-64686.html,zimbabwe,zimbabwe,False,,,,,,
44099,Zaka,Small airport,Zaka,Zimbabwe,,FVZK,,/zimbabwe/zaka-64689.html,zimbabwe,zimbabwe,False,,,,,,
44100,Zisco,Small airport,,Zimbabwe,,FVSC,,/zimbabwe/zisco-64673.html,zimbabwe,zimbabwe,False,,,,,,


In [176]:
# reshaping the routes table
routes = pd.merge(destinations, 
                  airports[join_key + ["IATA", "ICAO"]], 
                  how="left", 
                  on=join_key, 
                  suffixes=["", "_Source"])

routes = pd.merge(routes, 
                  airports[join_key + ["IATA", "ICAO"]],
                  how="left", 
                  left_on=["IATA"], # remember to change later to add city
                  right_on=["IATA"], 
                  suffixes=["", "_Dest"])
routes
# routes.rename(columns={'IATA': 'IATA_Dest', 'ICAO': 'ICAO_Source'}, inplace=True)

Unnamed: 0,Destination,IATA,Airlines Flying Route,Airport,Country,City,IATA_Source,ICAO,Country_Dest,Airport_Dest,ICAO_Dest
0,Indira Gandhi International,DEL,Safi Airlines,Herat,Afghanistan,Afghanistan,HEA,OAHR,India,Indira Gandhi International,VIDP
1,Benazir Bhutto International,,Safi Airlines,Herat,Afghanistan,Afghanistan,HEA,OAHR,Afghanistan,Ajrestan,
2,Benazir Bhutto International,,Safi Airlines,Herat,Afghanistan,Afghanistan,HEA,OAHR,Afghanistan,Andkhoi,OAAK
3,Benazir Bhutto International,,Safi Airlines,Herat,Afghanistan,Afghanistan,HEA,OAHR,Afghanistan,Charikar,
4,Benazir Bhutto International,,Safi Airlines,Herat,Afghanistan,Afghanistan,HEA,OAHR,Afghanistan,Dehdadi,
...,...,...,...,...,...,...,...,...,...,...,...
103952,Touat Cheikh Sidi Mohamed Belkebir,AZR,Air Algerie,Bordj Badji Mokhtar,Algeria,Algeria,BMW,DATM,Algeria,Touat Cheikh Sidi Mohamed Belkebir,DAUA
103953,Houari Boumediene,ALG,Air Algerie,Bou Chekif,Algeria,Algeria,TID,DAOB,Algeria,Houari Boumediene,DAAG
103954,Houari Boumediene,ALG,Air Algerie,Cheikh Larbi Tébessi,Algeria,Algeria,TEE,DABS,Algeria,Houari Boumediene,DAAG
103955,Ain el Beida,OGX,Air Algerie,Djanet Inedbirene,Algeria,Algeria,DJG,DAAJ,Algeria,Ain el Beida,DAUU


In [27]:
dest_counts = destinations.loc[:,["Source", "Country", "Destination"]].groupby(["Source", "Country"]).count()
df = pd.merge(airports, dest_counts, how="left", left_on=["Airport", "Country"], right_on=["Source", "Country"])

# theres a fair number of airports without routes. Imagine this is due to being small so no regular commerical flights
# implication there is that there are charter flights or flights run by small operators going to these airports (OOS)
df.head(2)

Unnamed: 0,Airport,Type,City,Country,IATA,ICAO,FAA,URL,Destination
0,Ajrestan,Small airport,Ajrestan,Afghanistan,,,,/afghanistan/ajrestan-57879.html,
1,Andkhoi,Small airport,Andkhoi,Afghanistan,,OAAK,,/afghanistan/andkhoi-74127.html,


## Skyscanner flights

Purpose of this section is to explore the data structure of the Skyscanner API and create an ETL pipeline that will periodically (daily) add historical prices to an SQLlite DB.

Taking the airports and routes gather from the previous section, I now build up an updating view of the prices for those routes by airline. The value of the time series is to be able to look at how the time-till-flight (TTF) affects the price of the ticket.

In [None]:
# Query SQLite for 

Now that the routes have been structured for mass API quering, I iterate through the combinations to find the day's flight prices. 

N.B. Automate this to happen every day at X time.

In [40]:
def create_url(origin, dest, date_outbound, date_inbound, country, currency, locale):
    return f"https://skyscanner-skyscanner-flight-search-v1.p.rapidapi.com/apiservices/browsedates/v1.0/{country}/{currency}/{locale}/{origin}/{dest}/{date_outbound}"


def make_call(origin, dest, date_outbound, date_inbound, country = "UK", currency = "GBP", locale = "en-UK"):
    url = create_url(origin, dest, date_outbound, date_inbound, country, currency, locale)
    
    querystring = {"inboundpartialdate":date_inbound}

    headers = {
        'x-rapidapi-host': "skyscanner-skyscanner-flight-search-v1.p.rapidapi.com",
        'x-rapidapi-key': "c877ec170amsh5f0eebf89a83bd2p13283cjsnced6d7aa5856"
        }

    response = requests.request("GET", url, headers=headers, params=querystring)
    response_json = json.loads(response.text)
    print(json.dumps(response_json, indent=2))
    return response_json

def flatten_json(quotes):
    for i, _ in enumerate(quotes['Quotes']):
        
        for key, val in quotes['Quotes'][i]['OutboundLeg'].items():
            quotes['Quotes'][i][key] = val
            
        del(quotes['Quotes'][i]['OutboundLeg'])
        
    # json to DataFrames
    # for each then need to enforce data types as well as rename any relevant columns 
    carriers = pd.DataFrame.from_dict(quotes['Carriers'])
    
    places = pd.DataFrame.from_dict(quotes['Places'])
            
    quotes = pd.DataFrame.from_dict(quotes['Quotes'])
    
    # construct routes from unique quotes start/end destinations
    routes = pd.DataFrame.from_dict(quotes['Quotes'])


### Data schema

    1. Carrier level
        - Carrier ID
        - Carrier Name
        - Carrier Parent?
        - Carrier Country
        - Other flags ...
        
    2. Route level (includes layovers)
        - Route ID
        - Start airport
        - End airport
        
    3. Airport level
        - Airport ID
        - Airport Name
        - Airport Country
        - Geo tag? lat and long?
            - possibly important if want to look at catchment analysis later on
           
    4. Quote level (route & carrier & time level)
        - Departure time
        - Landing time / duration??
        - Carrier ID
        - Route id
        - Price
        - Direct or not flag
        - Layover time?
        - Layover cities (how do I store this?)
        - Currency - GBP
        - Quote time - when was this queried
        - Sourcing
    

In [11]:
# create SQLLite DB and make schema

#!/usr/bin/python
# API Call key and meta data

# # quotes = make_call("LHR-sky", "LAX- sky", "2020-10-01", "2020-11-01")
# df = pd.DataFrame.from_dict(quotes)
# head(df)


conn = sqlite3.connect('test.db')

field = ["Carrier_ID", "Carrier_Name", "Country"]

conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';")

conn.execute('''CREATE TABLE IF NOT EXISTS CARRIER
                (CARRIER_ID INT PRIMARY KEY     NOT NULL,
                 CARRIER_NAME           TEXT    NOT NULL,
                 COUNTRY                TEXT    NOT NULL);''')

conn.execute("INSERT INTO CARRIER (CARRIER_ID,CARRIER_NAME,COUNTRY) \
      VALUES (2, 'Virgin Airways', 'UK')");

conn.commit()

cursor = conn.execute("SELECT carrier_id, carrier_name, country from CARRIER")
for row in cursor:
    for i, val in enumerate(row):
        print(f"{field[i]} = ", val)

conn.close()
# back up function that recreates the schema on deletion

# take json format and put into relational database

# append to SQLLite database

Carrier_ID =  1
Carrier_Name =  British Airways
Country =  UK
Carrier_ID =  2
Carrier_Name =  Virgin Airways
Country =  UK


In [16]:
conn = sqlite3.connect('test.db')

# print out all tables in database
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
for row in cursor:
    print(row)

('CARRIER',)


In [None]:
def main(): 
    # to get new quotes for each combination of date, origin, dest
    for origin, dest, date_outbound, date_inbound in search_params:
        quotes = make_call(origin, dest, date_outbound, date_inbound)
        
        quotes = prepare(quotes)
        connection = make_connection()
        update_database(connection, quotes)


        
def prepare():
    pass
        
def make_connection():
    pass
        
def update_database(connection, json):
    '''
    Update SQLite tables with new json
    '''
    if not json:
        # add to data validation (date check?)
        print("No new data, exiting update")
        
    updates = [update_carriers, update_routes, update_airports, update_quotes]
    for update in updates:
        print(update(connection, json))
        
        
def update_carriers():
    # connect to table
    # insert
    # try catch?
    pass