In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

data = pd.read_csv("airfares.csv")
data.head()
# avaergae_fair_adjusted is inflation adjusted rate based on 2020 Q1

Unnamed: 0,passenger_rank,airport_code,city_name,state_name,average_fair,average_fair_adjusted,year,quarter
0,1,LAX,Los Angeles,CA,328.17,328.17,2000,1
1,2,ORD,Chicago-O'Hare,IL,332.05,332.05,2000,1
2,3,DEN,Denver,CO,295.96,295.96,2000,1
3,4,ATL,Atlanta,GA,359.58,359.58,2000,1
4,5,BOS,Boston,MA,324.85,324.85,2000,1


In [3]:
data = data.replace(',','', regex=True)
data["average_fair"] = pd.to_numeric(data["average_fair"], downcast="float")
data["average_fair_adjusted"] = pd.to_numeric(data["average_fair_adjusted"], downcast="float")
data.describe()

# passenger rank with max = 427, Id?

Unnamed: 0,passenger_rank,average_fair,average_fair_adjusted,year,quarter
count,34460.0,34460.0,34460.0,34460.0,34460.0
mean,223.528003,426.792572,528.004028,2009.203918,2.447591
std,127.523872,167.09671,215.144684,5.812059,1.115027
min,1.0,39.709999,60.349998,2000.0,1.0
25%,112.0,339.130005,410.25,2004.0,1.0
50%,224.0,410.815002,507.01001,2009.0,2.0
75%,340.0,485.440002,606.830017,2014.0,3.0
max,427.0,2808.0,3358.98999,2019.0,4.0


In [104]:
# Check for Dimension and Hierarchy
print(len(data[['airport_code']].drop_duplicates()))
print(len(data[['airport_code', 'city_name']].drop_duplicates()))
print(len(data[['airport_code', 'city_name', 'state_name']].drop_duplicates()))

print(len(data[['city_name', 'state_name']].drop_duplicates()))
# The relationship between airport code and city name are one  
# The relationship between city name and state name is one to many
# City name may be different, need to clean it up

665
665
665
650


In [21]:
# History length
len(data[['year']].drop_duplicates()) # 20 years 

20

In [22]:
# History length by Quarter by Year

data[['year', 'quarter']].drop_duplicates().groupby('year').agg(['count'])
# 2001, 2004, 2013 all have 1 quarter missing

Unnamed: 0_level_0,quarter
Unnamed: 0_level_1,count
year,Unnamed: 1_level_2
2000,4
2001,3
2002,4
2003,4
2004,3
2005,4
2006,4
2007,4
2008,4
2009,4


In [18]:
tmp = data[['year', 'quarter']].drop_duplicates()
tmp[tmp['year'].isin([2001, 2004, 2013])]
# No Q4 in 2001
# No Q3 in 2004
# No Q4 in 2013

# Fill with average airfare in that year at that airport

Unnamed: 0,year,quarter
1955,2001,1
2501,2001,2
3047,2001,3
7711,2004,1
8220,2004,2
8720,2004,4
23132,2013,1
23568,2013,2
24014,2013,3


In [27]:
# History data per airport
tmp2 = data[['airport_code', 'year', 'quarter']].groupby('airport_code').agg('count')
tmp2
# we can see that not all airports have complete history data. Since we have not infill the 3 missing quarters, 
# airports with complete history data should end up with 77 records

Unnamed: 0_level_0,year,quarter
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ABE,77,77
ABI,77,77
ABL,6,6
ABQ,77,77
ABR,74,74
...,...,...
YAK,74,74
YKM,77,77
YKN,5,5
YNG,51,51


In [44]:
tmp2[tmp2['quarter'] == 77] # 239 rows

Unnamed: 0_level_0,year,quarter
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ABE,77,77
ABI,77,77
ABQ,77,77
ACK,77,77
ACT,77,77
...,...,...
TYS,77,77
VPS,77,77
XNA,77,77
YKM,77,77


In [92]:
# Filter on data with full history
new_data = data[data['airport_code'].isin(list(tmp2[tmp2['quarter'] == 77].index))]
new_data.head()

Unnamed: 0,passenger_rank,airport_code,city_name,state_name,average_fair,average_fair_adjusted,year,quarter
0,1,LAX,Los Angeles,CA,328.170013,328.170013,2000,1
1,2,ORD,Chicago-O'Hare,IL,332.049988,332.049988,2000,1
2,3,DEN,Denver,CO,295.959991,295.959991,2000,1
3,4,ATL,Atlanta,GA,359.579987,359.579987,2000,1
4,5,BOS,Boston,MA,324.850006,324.850006,2000,1


In [93]:
# Fill in gaps for missing quarter
# No Q4 in 2001
# No Q3 in 2004
# No Q4 in 2013

fill = new_data[new_data['year'].isin([2001, 2004, 2013])]
fill = fill[['airport_code', 'year', 'average_fair', 'average_fair_adjusted']].groupby(['airport_code', 'year']).agg('mean')

def fill_quarter(year): 
    if year == 2001:
        return 4
    elif year == 2004:
        return 3
    elif year == 2013:
        return 4

# Reset row names with airport code
fill.reset_index(inplace=True)

fill['quarter'] = [fill_quarter(x) for x in fill['year']]
fill.head()

Unnamed: 0,airport_code,year,average_fair,average_fair_adjusted,quarter
0,ABE,2001,434.390015,642.553345,4
1,ABE,2004,436.503326,612.753357,3
2,ABE,2013,366.416656,418.66333,4
3,ABI,2001,418.81665,619.469971,4
4,ABI,2004,423.126678,594.073364,3


In [100]:
# Left join with other columns
x = new_data[new_data['year'].isin([2001, 2004, 2013])]
y = x[['passenger_rank','airport_code', 'city_name', 'state_name', ]]
fill = fill.merge(y, on='airport_code', how='left')
fill.head()

Unnamed: 0,airport_code,year,average_fair,average_fair_adjusted,quarter,passenger_rank,city_name,state_name
0,ABE,2001,434.390015,642.553345,4,121,Allentown/Bethlehem/Easton,PA
1,ABE,2001,434.390015,642.553345,4,121,Allentown/Bethlehem/Easton,PA
2,ABE,2001,434.390015,642.553345,4,121,Allentown/Bethlehem/Easton,PA
3,ABE,2001,434.390015,642.553345,4,121,Allentown/Bethlehem/Easton,PA
4,ABE,2001,434.390015,642.553345,4,121,Allentown/Bethlehem/Easton,PA


In [103]:
# Row bind with new_data
complete_data = new_data.append(fill)
complete_data[['year', 'quarter']].drop_duplicates().groupby('year').agg(['count'])

Unnamed: 0_level_0,quarter
Unnamed: 0_level_1,count
year,Unnamed: 1_level_2
2000,4
2001,4
2002,4
2003,4
2004,4
2005,4
2006,4
2007,4
2008,4
2009,4


In [48]:
# Run Time series forecasting 


In [None]:
# Calculate accuracy