In [2]:
import datetime
import pandas as pd
import numpy as np
from sklearn import linear_model
from google.cloud import bigquery
import datalab.storage as gcs #v1.1.5

# Product purpose

To enable research on the price of air travel in regional markets, so that potential future users can develop their own understanding of volatilities, seasonalities and correlations with other parts of the world economy. To enable in-depth discussion with potential users as to the functionality and definitions used for the indices.
Licensees are forbidden from creating new derived data products from these indices and/or using these indices/data as benchmarks.

# Input Data/Data repositories


Kiwi raw data:  gs://ext-kiwi-excl-data
        
Iata raw data:  gs://ext-iata-excel-data

Auxiliary data : BQ dataset generic        

# Applicable regulatory documentation

# Applicable internal documentation

# Benchmark code repository

Latest commit for this version 
814f8f6 

https://bitbucket.org/indexproduction_SK/indexproduction/commits/814f8f6f7ab4b742bdef53a470714341af150703

# Divisors & rationales

# Models used

In order to build a longer time series, the Skytra index before 2018 is a modelled version.

This index is produced by the IATA-only index code v4.1, which accounts for the spot window changes in all regions

The input dataset is IATA_full_process.I2_2013_to_2018

The output dataset is stored in iata_RESEARCH.X7_IATA_index_2013_2018

Final calculation of the IATA index for each flown date and region_pair

Process IATA only itineraries

flight_date filter according to variable spot window

od_km looked up from our skytra.airports dataset

IATA bad price filtering applied

median price of cohort of journey maps

IATA_TOT_RPK weighting by nleg of combo index

In [16]:
# Function that calls SQL code
def iata_index_table():
    """Read the IATA index table from BigQuery into a DataFrame.

    Issues a ``SELECT *`` against
    ``d-dat-digitalaircrafttransport.iata_RESEARCH.X7_IATA_index_2013_2018``
    with a default-configured ``bigquery.Client`` and prints progress
    messages, including the number of rows fetched.

    Returns:
        The query result converted with ``to_dataframe()`` (all rows and
        columns of the table).
    """
    sql = """
                    SELECT *
                    FROM `d-dat-digitalaircrafttransport.iata_RESEARCH.X7_IATA_index_2013_2018`
                    """
    bq_client = bigquery.Client()
    job = bq_client.query(sql)
    print("Reading iata_RESEARCH.X7_IATA_index_2013_2018")
    # Wait for the query job, then materialise the rows as a DataFrame.
    table_df = job.result().to_dataframe()
    print('Compiled the dataframe')
    print('Processed {0} lines'.format(len(table_df.index)))
    return table_df

# Lookup from the two-letter region-pair code to the full region-pair name
# (the full names are the values found in the tables' `region_pair` column).
region_dict = dict([
    ('EU-EU', 'Europe-Europe'),
    ('NA-NA', 'North America-North America'),
    ('AS-AS', 'Asia/Pacific-Asia/Pacific'),
    ('AS-NA', 'Asia/Pacific-North America'),
    ('AS-EU', 'Asia/Pacific-Europe'),
    ('EU-NA', 'Europe-North America'),
])

In [8]:
# Load the main IATA index table from BigQuery and preview its first rows
iata_df = iata_index_table()
iata_df.head()

Reading iata_RESEARCH.X7_IATA_index_2013_2018
Compiled the dataframe
Processed 12972 lines


Unnamed: 0,region_pair,first_flight_date,IATA_RRPK_USD,IATA_TOT_RPK,iata_tickets
0,Asia/Pacific-Asia/Pacific,2013-02-02,0.123531,497569000.0,257774
1,Asia/Pacific-Europe,2013-02-02,0.047475,191757300.0,11257
2,Asia/Pacific-North America,2013-02-02,0.06067,266681100.0,13200
3,Europe-Europe,2013-02-02,0.079758,89788340.0,51026
4,Europe-North America,2013-02-02,0.034561,207878400.0,14414


In [9]:
# Parse the flight dates so the pivot is indexed by datetime, then reshape the
# IATA index to one row per flight date and one column per region pair.
# (pivot_table's default aggregation averages any duplicate date/region rows.)
iata_df['first_flight_date'] = pd.to_datetime(iata_df['first_flight_date'])
iata_index_df = iata_df.pivot_table(
    values='IATA_RRPK_USD',
    index='first_flight_date',
    columns='region_pair',
)

In [10]:
# Keep only flight dates up to and including 31 Dec 2017 (label slicing is
# inclusive of the end bound), then preview the last rows.
iata_index_df = iata_index_df.loc[:datetime.date(2017, 12, 31)]
iata_index_df.tail()

region_pair,Asia/Pacific-Asia/Pacific,Asia/Pacific-Europe,Asia/Pacific-North America,Europe-Europe,Europe-North America,North America-North America
first_flight_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-12-27,0.081956,0.039446,0.044838,0.110136,0.047558,0.124283
2017-12-28,0.083335,0.037382,0.044447,0.096926,0.044301,0.113214
2017-12-29,0.095741,0.036113,0.04316,0.094337,0.042012,0.107193
2017-12-30,0.084774,0.03439,0.0393,0.094961,0.040291,0.107799
2017-12-31,0.073238,0.029761,0.036892,0.079206,0.040518,0.096926


In [11]:
# v4 Skytra index
# Function that calls SQL code
def skytra_index_table():
    """Read the v4 Skytra index table from BigQuery into a DataFrame.

    Queries ``d-dat-digitalaircrafttransport.index.X7_v4_2018_2019`` with a
    default-configured ``bigquery.Client``, restricted to the six region
    pairs listed in the ``WHERE`` clause, and prints progress messages,
    including the number of rows fetched.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    sql = """
                    SELECT *
                    FROM `d-dat-digitalaircrafttransport.index.X7_v4_2018_2019`
                    WHERE region_pair in ('Europe-Europe', 'Asia/Pacific-Asia/Pacific', 'Asia/Pacific-North America', 
                        'Asia/Pacific-Europe', 'Europe-North America', 'North America-North America')
                    """
    bq_client = bigquery.Client()
    job = bq_client.query(sql)
    print("Reading index.X7_v4_2018_2019")
    # Wait for the query job, then materialise the rows as a DataFrame.
    table_df = job.result().to_dataframe()
    print('Compiled the dataframe')
    print('Processed {0} lines'.format(len(table_df.index)))
    return table_df

In [12]:
# Load the main v4 Skytra index table from BigQuery and preview its first rows
skytra_df = skytra_index_table()
skytra_df.head()

Reading index.X7_v4_2018_2019
Compiled the dataframe
Processed 4650 lines


Unnamed: 0,region_pair,first_flight_date,iata_index,skytra_index,iata_rpk,skytra_rpk,iata_tickets,skytra_tickets,skytra_coverage
0,Asia/Pacific-Asia/Pacific,2017-11-20,0.084904,0.180503,1426914000.0,1022286000.0,755675,524994,69.473517
1,Asia/Pacific-Asia/Pacific,2017-11-21,0.080982,0.174716,1228978000.0,884116200.0,635151,443030,69.751917
2,Asia/Pacific-Asia/Pacific,2017-11-22,0.083657,0.1777,1201197000.0,861281300.0,606920,418892,69.019311
3,Asia/Pacific-Asia/Pacific,2017-11-23,0.083663,0.179683,1211926000.0,870181600.0,631782,439731,69.601698
4,Asia/Pacific-Asia/Pacific,2017-11-24,0.0849,0.175863,1228781000.0,875493100.0,681241,475626,69.817583


In [13]:
# Parse the flight dates so the pivot below is indexed by datetime
skytra_df['first_flight_date'] = pd.to_datetime(skytra_df['first_flight_date'])
# Pivot the Skytra index to one column per region pair, keeping only the
# flight dates in 2018 & 2019 (both slice bounds are inclusive)
skytra_index_df = skytra_df.pivot_table(
    values='skytra_index',
    index='first_flight_date',
    columns='region_pair',
).loc[datetime.date(2018, 1, 1):datetime.date(2019, 12, 31)]

In [14]:
# Pivot the IATA index carried in the Skytra table to one column per region
# pair, keeping only the flight dates in 2018 & 2019 (bounds inclusive)
skytra_iata_index_df = skytra_df.pivot_table(
    values='iata_index',
    index='first_flight_date',
    columns='region_pair',
).loc[datetime.date(2018, 1, 1):datetime.date(2019, 12, 31)]

In [17]:
# Setting up and calibrating the linear regression model
# For every region pair, fit  skytra_index = slope * iata_index + intercept
# on the 2018-2019 frames and record the two coefficients.
# dtype=float keeps the parameter table numeric instead of the object dtype
# an empty DataFrame gets by default.
lin_reg_params_df=pd.DataFrame(index=['slope', 'intercept'],
                               columns=sorted(region_dict.values()),
                               dtype=float)

for region in lin_reg_params_df.columns:

    print('Calibrating the linear regression model for {0}'.format(region))

    # scikit-learn pairs x and y purely by position and rejects NaN, so drop
    # missing values and restrict both series to the dates they share.
    x_data=skytra_iata_index_df[[region]].dropna()
    y_data=skytra_index_df[region].dropna()
    shared_dates=x_data.index.intersection(y_data.index)
    if shared_dates.empty:
        raise ValueError(
            'No overlapping non-null observations for {0}'.format(region))
    x_data=x_data.loc[shared_dates]
    y_data=y_data.loc[shared_dates]

    # the lin reg model calibration
    clf = linear_model.LinearRegression()
    clf.fit(x_data, y_data)

    # single feature, so coef_ holds exactly one slope
    lin_reg_params_df.loc['slope', region]=clf.coef_[0]
    lin_reg_params_df.loc['intercept', region]=clf.intercept_

Calibrating the linear regression model for Asia/Pacific-Asia/Pacific
Calibrating the linear regression model for Asia/Pacific-Europe
Calibrating the linear regression model for Asia/Pacific-North America
Calibrating the linear regression model for Europe-Europe
Calibrating the linear regression model for Europe-North America
Calibrating the linear regression model for North America-North America


In [18]:
# Show the fitted slope and intercept for each region pair
lin_reg_params_df.head()

Unnamed: 0,Asia/Pacific-Asia/Pacific,Asia/Pacific-Europe,Asia/Pacific-North America,Europe-Europe,Europe-North America,North America-North America
slope,2.11645,1.79586,1.58533,1.08271,1.55406,1.25687
intercept,-0.0228907,0.0270886,0.0225095,0.0767634,0.058651,0.0169509


# Master content

# Customisation

# Other considerations