# Data Collection

Purspose of this notebook is to explore the data structure of the Skyscanner API and create an ETL pipeline that will periodically (daily) add to an SQLlite DB.

In [40]:
def create_url(origin, dest, date_outbound, date_inbound, country, currency, locale):
    return f"https://skyscanner-skyscanner-flight-search-v1.p.rapidapi.com/apiservices/browsedates/v1.0/{country}/{currency}/{locale}/{origin}/{dest}/{date_outbound}"


def make_call(origin, dest, date_outbound, date_inbound, country = "UK", currency = "GBP", locale = "en-UK"):
    url = create_url(origin, dest, date_outbound, date_inbound, country, currency, locale)
    
    querystring = {"inboundpartialdate":date_inbound}

    headers = {
        'x-rapidapi-host': "skyscanner-skyscanner-flight-search-v1.p.rapidapi.com",
        'x-rapidapi-key': "XXX"
        }

    response = requests.request("GET", url, headers=headers, params=querystring)
    response_json = json.loads(response.text)
    print(json.dumps(response_json, indent=2))
    return response_json


In [42]:
# API Call key and meta data

import requests
import json
import numpy as np
import pandas as pd

# quotes = make_call("LHR-sky", "LAX- sky", "2020-10-01", "2020-11-01")
df = pd.DataFrame.from_dict(quotes)
head(df)


{
  "Dates": {
    "OutboundDates": [
      {
        "PartialDate": "2020-10-01",
        "QuoteIds": [
          1,
          2
        ],
        "Price": 514.0,
        "QuoteDateTime": "2020-09-12T12:30:00"
      }
    ]
  },
  "Quotes": [
    {
      "QuoteId": 1,
      "MinPrice": 1588.0,
      "Direct": true,
      "OutboundLeg": {
        "CarrierIds": [
          1859
        ],
        "OriginId": 65698,
        "DestinationId": 65368,
        "DepartureDate": "2020-10-01T00:00:00"
      },
      "QuoteDateTime": "2020-09-12T12:30:00"
    },
    {
      "QuoteId": 2,
      "MinPrice": 514.0,
      "Direct": false,
      "OutboundLeg": {
        "CarrierIds": [
          857
        ],
        "OriginId": 65698,
        "DestinationId": 65368,
        "DepartureDate": "2020-10-01T00:00:00"
      },
      "QuoteDateTime": "2020-09-12T12:30:00"
    }
  ],
  "Places": [
    {
      "PlaceId": 65368,
      "IataCode": "LAX",
      "Name": "Los Angeles International",
      "Type"

ValueError: arrays must all be same length

In [71]:
def flatten_json(quotes):
    for i, _ in enumerate(quotes['Quotes']):
        
        for key, val in quotes['Quotes'][i]['OutboundLeg'].items():
            quotes['Quotes'][i][key] = val
            
        del(quotes['Quotes'][i]['OutboundLeg'])
        
    # json to DataFrames
    # for each then need to enforce data types as well as rename any relevant columns 
    carriers = pd.DataFrame.from_dict(quotes['Carriers'])
    
    places = pd.DataFrame.from_dict(quotes['Places'])
            
    quotes = pd.DataFrame.from_dict(quotes['Quotes'])
    
    # construct routes from unique quotes start/end destinations
    routes = pd.DataFrame.from_dict(quotes['Quotes'])


Unnamed: 0,QuoteId,MinPrice,Direct,OutboundLeg,QuoteDateTime,CarrierIds,OriginId,DestinationId,DepartureDate
0,1,1588.0,True,"{'CarrierIds': [1859], 'OriginId': 65698, 'Des...",2020-09-12T12:30:00,[1859],65698,65368,2020-10-01T00:00:00
1,2,514.0,False,"{'CarrierIds': [857], 'OriginId': 65698, 'Dest...",2020-09-12T12:30:00,[857],65698,65368,2020-10-01T00:00:00


Unnamed: 0,CarrierId,Name
0,857,Finnair
1,1859,Virgin Atlantic


Unnamed: 0,PlaceId,IataCode,Name,Type,SkyscannerCode,CityName,CityId,CountryName
0,65368,LAX,Los Angeles International,Station,LAX,Los Angeles,LAXA,United States
1,65698,LHR,London Heathrow,Station,LHR,London,LOND,United Kingdom


In [None]:
# take json format and put into relational database

# append to SQLLite database

In [11]:
# create SQLLite DB and make schema

#!/usr/bin/python

import sqlite3

conn = sqlite3.connect('test.db')

field = ["Carrier_ID", "Carrier_Name", "Country"]

conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';")

conn.execute('''CREATE TABLE IF NOT EXISTS CARRIER
                (CARRIER_ID INT PRIMARY KEY     NOT NULL,
                 CARRIER_NAME           TEXT    NOT NULL,
                 COUNTRY                TEXT    NOT NULL);''')

conn.execute("INSERT INTO CARRIER (CARRIER_ID,CARRIER_NAME,COUNTRY) \
      VALUES (2, 'Virgin Airways', 'UK')");

conn.commit()

cursor = conn.execute("SELECT carrier_id, carrier_name, country from CARRIER")
for row in cursor:
    for i, val in enumerate(row):
        print(f"{field[i]} = ", val)

conn.close()
# back up function that recreates the schema on deletion


Carrier_ID =  1
Carrier_Name =  British Airways
Country =  UK
Carrier_ID =  2
Carrier_Name =  Virgin Airways
Country =  UK


In [16]:
conn = sqlite3.connect('test.db')

# print out all tables in database
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
for row in cursor:
    print(row)

('CARRIER',)


### Data schema

    1. Carrier level
        - Carrier ID
        - Carrier Name
        - Carrier Parent?
        - Carrier Country
        - Other flags ...
        
    2. Route level (includes layovers)
        - Route ID
        - Start airport
        - End airport
        
    3. Airport level
        - Airport ID
        - Airport Name
        - Airport Country
        - Geo tag? lat and long?
            - possibly important if want to look at catchment analysis later on
           
    4. Quote level (route & carrier & time level)
        - Departure time
        - Landing time / duration??
        - Carrier ID
        - Route id
        - Price
        - Direct or not flag
        - Layover time?
        - Layover cities (how do I store this?)
        - Currency - GBP
        - Quote time - when was this queried
        - Sourcing
    

In [None]:
def main(): 
    # to get new quotes for each combination of date, origin, dest
    for origin, dest, date_outbound, date_inbound in search_params:
        quotes = make_call(origin, dest, date_outbound, date_inbound)
        
        quotes = prepare(quotes)
        connection = make_connection()
        update_database(connection, quotes)


        
def prepare():
    pass
        
def make_connection():
    pass
        
def update_database(connection, json):
    '''
    Update SQLite tables with new json
    '''
    if not json:
        # add to data validation (date check?)
        print("No new data, exiting update")
        
    updates = [update_carriers, update_routes, update_airports, update_quotes]
    for update in updates:
        print(update(connection, json))
        
        
def update_carriers():
    # connect to table
    # insert
    # try catch?
    pass

## Process blog post:

Like so many others out there this quarantine has instilled a certain level of wanderlust in me. And obviously before I commit to any location I wanted to know that I'm not getting scalped on the airfare. To put my (and possibly your) mind at ease I've looked at historical airfare prices across X popular routes to get a view on the price fluctiations you can expect if you're travelling.

### Steps:
0. Exec summary
1. Data collection
    - Sources
    - Schema and collection
2. Data preparation and feature engineering
    - TBC
3. Model generation
4. Model validation
5. Output and visualizations
6. Extensions

Scraping a complete list of Airports with additional characteristics - long and lat for catchment analysis

Source: https://www.world-airport-codes.com/alphabetical/country-name/a.html?page=30

Process:
---------------------
To query the population of relevant routes from Skyscanner, I first needed to collect a comprehensive (or as close to as possible) list of airports. 
1. Find all airport URLS
    - For each letter in alphabet:
        - For each page:
            - For each linked airport
                - Scrape airport hyperlink and associated data
                
2. Collect informaton on seperate airport URL pages
    - For each airport URL:
        - Collect relevant airport specific info
        - Add to data store
        
3. Check data is consistent between first page and specific URLS

4. Summary statistics

----------------------
All possible routes then becomes the 2-combinaton of all n routes in the collected list.

N.B. interesting to look at the number of routes each airport is directly connected, what average is by country, airport size, etc
N.B. interesting to see number of actual routes as % of all possible. Imagine <1% given factorial nature of combinations...

In [None]:
# Collect URLs from one landing page
def scape_urls(html):
    pass

# Collect information from one URL
def scrape_airport(html):
    pass

# Store information in SQLite
def store_airport(html):
    pass


In [202]:
# Find the URLs of each airport on world-airport-codes.com
from string import ascii_lowercase
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# download chromedriver 
# allow use with - https://stackoverflow.com/questions/60362018/macos-catalinav-10-15-3-error-chromedriver-cannot-be-opened-because-the-de
driver = webdriver.Chrome()
driver.implicitly_wait(15)

# set up pages to scrape
BASE_URL = "https://www.world-airport-codes.com/alphabetical/country-name/{letter}.html?page={page_n}"
starting_urls = [[BASE_URL.format(letter=letter, page_n=n) for n in range(1, 500)] for letter in ascii_lowercase]

all_tables = []
for group_of_urls in starting_urls:
    for url in group_of_urls:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html")
        tables = soup.find_all("table")
        if len(tables) == 0:
            break
        all_tables.append(tables)

In [215]:
# collect raw tables in dataframes for later processing
raw_dataframes = []
raw_urls = []

for table_group in all_tables:
    for table in table_group:
        text_data = [[cell.text for cell in row.find_all(["th","td"])]
                                for row in table.find_all("tr")]
        raw_dataframes.append(pd.DataFrame(text_data))

        url_data = [[row.text, row.get("href")] for row in table.find_all("a")]
        raw_urls.append(pd.DataFrame(url_data))

In [216]:
# appending all tables into one
airports = pd.concat(raw_dataframes, axis=0, ignore_index=True)
airport_urls = pd.concat(raw_urls, axis=0, ignore_index=True)

# naming fields
airports.columns = airports.iloc[0,:]
airport_urls.columns = ["Airport", "URL"]

# drop extra header rows
airports.drop(airports[airports["Airport"] == "Airport"].index, inplace=True)

# removing whitespace and column names from raw text
for column in airports.columns:
    airports[column] = airports[column].map(lambda text: text.replace(f"{column}:", "").strip())

# removing "Closed" tag from airport name
airports.loc[airports["Type"] == "Closed", "Airport"] = airports.loc[airports["Type"] == "Closed", "Airport"].str[:-7]

# joining in URLS
airports = pd.merge(airports, airport_urls, how="left", on="Airport")

In [233]:
# write to disk
airports.to_csv("airports.csv", index=False, encoding='utf-8-sig')

### Scraping additional airport characteristics from unique webpages

In [234]:
# scape the individual page associated with airport
driver.get("https://www.world-airport-codes.com/united-states/john-f-kennedy-international-5202.html")
soup = BeautifulSoup(driver.page_source, "html")

In [254]:
def classify_supplement(series, airport):
    mapping = {
        "Frequency": ["Type", "Description", "Frequency (MHz)"],
        "Runway": ['Runway', 'Length (feet)', 'Width (feet)', 'Surface Type'],
        "Destinations": ['Destination', 'IATA', 'Airlines Flying Route']
    }
    
    for k, v in mapping.items():
        if len(series) == len(v) and all(series == v):
            return k
        
    # broken
    print(series, airport)
    return None

    
from collections import defaultdict

raw_supplements = defaultdict(list)
supplements = {}

# need to skip this if there are 0 tables
tbl = soup.find_all("table")

for table in tbl:
    text_data = [[cell.text for cell in row.find_all(["th","td"])]
                            for row in table.find_all("tr")]
    
    raw = pd.DataFrame(text_data)
    supp_type = classify_supplement(raw.iloc[0], airport)
    
    if supp_type == "Destinations":
        raw["Origin"] = "PLACEHOLDER"
        
    raw_supplements[supp_type].append(raw)

    
# need to add in contact details
    
# need to add in lat long  
    
    
for k, v in raw_supplements.items():
    supplements[k] = pd.concat(v, ignore_index=True)
    
supplements["Destinations"]

Unnamed: 0,0,1,2
0,Destination,IATA,Airlines Flying Route
1,Ministro Pistarini International,EZE,"LAN Argentina, American Airlines, Aerolineas A..."
2,Licenciado Benito Juarez International,MEX,"Interjet (ABC Aerolineas), AeroMéxico, Alitali..."
3,Louis Armstrong New Orleans International,MSY,"Pinnacle Airlines, Air France, Alitalia, JetBl..."
4,Queen Alia International,AMM,"American Airlines, Royal Jordanian, US Airways"
...,...,...,...
157,King Abdulaziz International,JED,Saudi Arabian Airlines
158,King Khaled International,RUH,Saudi Arabian Airlines
159,Atatürk International,ISL,"Turkish Airlines, US Airways"
160,Vnukovo International,VKO,Transaero Airlines


In [256]:

supplements["Destinations"][0]

0                                    Destination
1               Ministro Pistarini International
2         Licenciado Benito Juarez International
3      Louis Armstrong New Orleans International
4                       Queen Alia International
                         ...                    
157                 King Abdulaziz International
158                    King Khaled International
159                        Atatürk International
160                        Vnukovo International
161               Murtala Muhammed International
Name: 0, Length: 162, dtype: object

In [240]:
dfs[2]

Unnamed: 0,0,1,2
0,Destination,IATA,Airlines Flying Route
1,Ministro Pistarini International,EZE,"LAN Argentina, American Airlines, Aerolineas A..."
2,Licenciado Benito Juarez International,MEX,"Interjet (ABC Aerolineas), AeroMéxico, Alitali..."
3,Louis Armstrong New Orleans International,MSY,"Pinnacle Airlines, Air France, Alitalia, JetBl..."
4,Queen Alia International,AMM,"American Airlines, Royal Jordanian, US Airways"
...,...,...,...
157,King Abdulaziz International,JED,Saudi Arabian Airlines
158,King Khaled International,RUH,Saudi Arabian Airlines
159,Atatürk International,ISL,"Turkish Airlines, US Airways"
160,Vnukovo International,VKO,Transaero Airlines


In [None]:
"airport-details-header"

#then check h2 text


for element in to_scrape:
    try:
        pass
    except:
        # replace with N/A
        pass
    finally:
        #fill in
        pass

# create supplementary table

# join into main table