In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import scipy
import scipy.stats as stats
import seaborn.apionly as sns
import statsmodels.api as sm
import theano.tensor as tt
import pymc3
from sklearn import preprocessing
%matplotlib inline

  from pandas.core import datetools


## Load in 2016 weather data.

Fields in the file include:

- station identifier (GHCN Daily Identification Number)
- date (yyyymmdd, where yyyy = year, mm = month, and dd = day)
- observation type (see ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt for definitions)
- observation value (see ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt for units)
- observation time (if available, as hhmm, where hh = hour and mm = minutes in local time)


In [3]:
# Read the header-less 2016 weather file, supplying the column labels ourselves.
df_weather = pd.read_csv(
    '../data/weather/2016_weather.csv',
    names=[
        'airport_code',
        'date',
        'type',
        'value',
        't1',
        't2',
        't3',
        't4',
    ],
)

In [68]:
# Preview the first five weather records.
df_weather.head(n=5)

Unnamed: 0,airport_code,date,type,value,t1,t2,t3,t4
0,US1NJES0019,20160101,PRCP,0,,,N,
1,US1NJES0019,20160101,SNOW,0,,,N,
2,US1NJGL0001,20160101,PRCP,0,,,N,
3,US1NJGL0001,20160101,SNOW,0,,,N,
4,CA1AB000023,20160101,PRCP,0,,,N,


According to the readme, missing observations are coded with a sentinel value (-9999 in the GHCN-Daily documentation), so rows with a value of -999 or below are dropped.

In [4]:
# Keep only rows whose value lies above the -999 cut-off.
has_observation = df_weather['value'] > -999
df_weather = df_weather[has_observation]

## Load in airport lookup data.

The format was bad, as the file was separated by spaces rather than tabs, so I ran the following bash command to make a TSV:

In [59]:
# Shell one-liner, left commented out (presumably a one-off preprocessing step — confirm).
# awk prints fields $1-$4 separated by tabs, then fields $5-$9 joined by single
# spaces as one final column; input is stations_airport.txt, output is airports.tsv.
#!cat stations_airport.txt | awk {'printf ("%s\t%s\t%s\t%s\t%s %s %s %s %s\n", $1, $2,$3,$4,$5,$6,$7,$8,$9)'} > airports.tsv

In [5]:
# Read the tab-separated, header-less station lookup table.
df_stations = pd.read_csv(
    '../data/weather/airports.tsv',
    sep='\t',
    names=['airport_code', 'lat', 'lon', 'height', 'name'],
)

In [74]:
# Preview the first five station records.
df_stations.head(n=5)

Unnamed: 0,airport_code,lat,lon,height,name
0,AM000037699,40.533,44.383,1892.0,APARAN 37699
1,AQC00914869,-14.3333,-170.7167,3.0,AS TAFUNA AP TUTUILA
2,AQW00061705,-14.3306,-170.7136,3.7,AS PAGO PAGO WSO AP
3,ASN00017006,-29.9267,138.7517,123.0,APOLLINARIS WELL
4,ASN00019001,-33.0524,138.4277,369.0,APPILA


In [75]:
# Look at the weather records again, next to the station table shown above.
df_weather.head(n=5)

Unnamed: 0,airport_code,date,type,value,t1,t2,t3,t4
0,US1NJES0019,20160101,PRCP,0,,,N,
1,US1NJES0019,20160101,SNOW,0,,,N,
2,US1NJGL0001,20160101,PRCP,0,,,N,
3,US1NJGL0001,20160101,SNOW,0,,,N,
4,CA1AB000023,20160101,PRCP,0,,,N,


### Join the weather data to the location data

In [6]:
# Inner join: keep only weather rows whose station code appears in the lookup table.
df_weather_stations = df_stations.merge(
    df_weather,
    how='inner',
    on=['airport_code'],
)

In [7]:
# Airport code / coordinate lookup (comma-separated, header row present).
df_codelatlon = pd.read_csv(
    '../data/airport_data/airport_codes_latlon.csv',
    sep=',',
)

### Load in On Time Performance Data

In [50]:
# Aggregated 2016 on-time performance data (tab-separated).
df_air = pd.read_csv(
    '../data/On_Time_Performance/On_Time_Performance_2016_agg.tsv',
    sep='\t',
)

In [27]:
# Largest longitude among the joined station rows.
df_weather_stations.loc[:, 'lon'].max()

171.40000000000001

## Join data on latitude and longitude 

In [28]:
# Reduce station coordinates to one decimal place and drop the sign so they can
# serve as join keys.
# NOTE(review): abs() discards the hemisphere, so mirrored coordinates collide
# in the join — confirm this is intended.
for coord in ('lat', 'lon'):
    df_weather_stations[coord] = df_weather_stations[coord].apply(
        lambda degrees: abs(round(degrees, 1))
    )

In [29]:
# Derive one-decimal, unsigned 'lat'/'lon' join keys from the airport coordinates.
# NOTE(review): abs() discards the hemisphere, so mirrored coordinates collide
# in the join — confirm this is intended.
for key_col, source_col in (('lat', 'Latitude'), ('lon', 'Longitude')):
    df_codelatlon[key_col] = df_codelatlon[source_col].apply(
        lambda degrees: abs(round(degrees, 1))
    )

In [30]:
# Inner join on the 'lat'/'lon' columns of both frames.
df_joined = df_weather_stations.merge(
    df_codelatlon,
    how='inner',
    on=['lat', 'lon'],
)

In [33]:
# Replace missing entries in every column with 0. The result is rebound rather
# than written with inplace=True, so the cell does not silently mutate a frame
# that other cells may have already displayed or referenced.
df_joined = df_joined.fillna(0)

In [38]:
# Persist the joined table as a tab-separated file without the index column.
df_joined.to_csv(
    '../data/weather/weather2016_row_airport_code.tsv',
    sep='\t',
    index=False,
)

### Convert to record based format

In [40]:
# One indicator column per observation type, appended to the right of the joined data.
type_dummies = pd.get_dummies(df_joined['type'])
df_record = pd.concat([df_joined, type_dummies], axis=1)

In [43]:
# Preview the first two widened records.
df_record.head(n=2)

Unnamed: 0,airport_code,lat,lon,height,name,date,type,value,t1,t2,...,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10,WT11
0,AQW00061705,14.3,170.7,3.7,AS PAGO PAGO WSO AP,20160101,TMAX,286,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AQW00061705,14.3,170.7,3.7,AS PAGO PAGO WSO AP,20160101,TMIN,260,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
def fill_value(row):
    """Copy a record's observation value into the column named after its type.

    Parameters
    ----------
    row : pandas.Series
        One record; it must provide a 'type' entry (used as a label) and a
        'value' entry.

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which the entry labelled ``row['type']`` holds
        ``row['value']``. The row passed in is left untouched.

    Raises
    ------
    KeyError
        If ``row`` has no 'type' or no 'value' entry.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    filled = row.copy()
    filled[filled['type']] = filled['value']
    return filled

# Hand each record (axis=1 -> one row at a time) to fill_value.
# NOTE(review): row-wise apply is slow on large frames; multiplying the
# indicator columns by 'value' looks equivalent — confirm before switching.
df_record = df_record.apply(fill_value, axis=1)

In [47]:
# Persist the widened records as a tab-separated file without the index column.
df_record.to_csv(
    '../data/weather/weather2016_row_airport_code_wdummies.tsv',
    sep='\t',
    index=False,
)

# Aggregation

In [49]:
pd.set_option('display.max_columns', 500)
%matplotlib inline

In [54]:
# Distinct observation types, in order of first appearance.
weather_cols = df_record['type'].unique()

In [57]:
# Sum the per-type columns within each (location, station, coordinates, date) group.
group_keys = ['locationID', 'airport_code', 'lat', 'lon', 'date']
df_agg = df_record.groupby(group_keys)[weather_cols].sum()

In [59]:
# Persist the aggregated table, tab-separated, with the group keys written as index columns.
df_agg.to_csv(
    '../data/weather/all_weather_data_agg.tsv',
    sep='\t',
)