In [1]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison

import copy
import seaborn as sns
import os
from scipy import stats
import datetime

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [2]:
path='/Users/carlyfennell/DataScience/data_bootcamp/data/'

In [3]:
column_types={'mkt_carrier_fl_num': 'uint16', 'op_unique_carrier': 'category', 'tail_num': 'category', 'op_carrier_fl_num': 'uint16', 'origin_airport_id': 'uint16', 'origin': 'category', 'dest_airport_id': 'uint16', 'dest': 'category', 'crs_dep_time': 'uint16', 'dep_time': 'float32'}

In [4]:
flights_final=pd.read_csv(path+'test_flights.csv',dtype=column_types,parse_dates=['fl_date'],infer_datetime_format=True)

In [5]:
flights_final.shape

(660556, 20)

In [7]:
flights_clean=flights_final.dropna()

In [8]:
flights_clean.shape

(659057, 20)

In [9]:
flights_clean=flights_clean.drop(['op_unique_carrier','mkt_carrier','dup', 'crs_elapsed_time','flights'],axis=1)

In [10]:
flights_clean['fl_date']=flights_clean['fl_date'].astype('datetime64[ns]')
flights_clean['month']=flights_clean['fl_date'].dt.month
flights_clean['day']=flights_clean['fl_date'].dt.day

In [11]:
flights_clean=flights_clean.drop(['fl_date','origin_city_name','dest_airport_id','origin_airport_id'],axis=1)

In [12]:
flights_clean=flights_clean.drop(['op_carrier_fl_num'],axis=1)

# Read in Weather/Airport Key

In [14]:
#read weather_key into dataframe
weather_key=pd.read_csv(path+'wkey.csv')
weather_key=weather_key.drop(['Unnamed: 0','period'],axis=1)

In [15]:
new_df=pd.merge(flights_clean,weather_key[['origin','month', 'latitude', 'longitude','wdir', 'temp',
       'maxt', 'visibility', 'wspd', 'heatindex', 'cloudcover', 'mint',
       'precip', 'snowdepth', 'sealevelpressure', 'dew', 'humidity', 'wgust',
       'precipcover', 'windchill', 'closest_airport']],how='outer',on=['origin','month'])

In [16]:
busy_feat=pd.read_csv(path+'busy_feature.csv')

In [17]:
busy_feat=busy_feat.drop('Unnamed: 0',axis=1)

In [18]:
new_df_two=pd.merge(new_df,busy_feat,how='outer',on=['origin'])

In [19]:
weather_key.columns

Index(['origin', 'month', 'latitude', 'longitude', 'wdir', 'temp', 'maxt',
       'visibility', 'wspd', 'heatindex', 'cloudcover', 'mint', 'precip',
       'snowdepth', 'sealevelpressure', 'dew', 'humidity', 'wgust',
       'precipcover', 'windchill', 'closest_airport'],
      dtype='object')

# Build Carrier/Tail_num Key

### import small subsample of big dataset to build the carrier/tail_num features

In [22]:
import random

In [23]:
filename =path+"flight_final.csv"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 500000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
df = pd.read_csv(filename, skiprows=skip)

In [24]:
df.shape

(500000, 49)

In [25]:
flights_FE = copy.deepcopy(df)

In [26]:
flights_FE = flights_FE[(flights_FE['dep_delay']>0) | (flights_FE['arr_delay']>0)]
delay_df = flights_FE.groupby('tail_num').mean()[['arr_delay','dep_delay']]
delay_df.reset_index(inplace=True)
delay_df.columns = ['tail_num','mean_tail_num_arr_delay','mean_tail_num_dep_delay']

In [27]:
flights_final = new_df_two.merge(delay_df, on=['tail_num'], how='left')

In [28]:
carrier_delay = flights_FE.groupby('mkt_unique_carrier').mean()[['arr_delay','dep_delay']]
carrier_delay.reset_index(inplace=True)
carrier_delay.columns = ['mkt_unique_carrier','mean_carrier_arr_delay','mean_carrier__dep_delay']
flights_final = flights_final.merge(carrier_delay, on=['mkt_unique_carrier'], how='left')

In [29]:
total = flights_final.isnull().sum().sort_values(ascending=False)
percent = (flights_final.isnull().sum()/flights_final.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

Unnamed: 0,Total,Percent
snowdepth,495913,0.747751
heatindex,445301,0.671437
windchill,12422,0.01873
mean_tail_num_dep_delay,7905,0.011919
mean_tail_num_arr_delay,7905,0.011919
mean_carrier__dep_delay,4149,0.006256
branded_code_share,4149,0.006256
mkt_carrier_fl_num,4149,0.006256
tail_num,4149,0.006256
dest,4149,0.006256


In [30]:
flights_final['snowdepth']=flights_final['snowdepth'].fillna(0)

In [31]:
flights_final.columns

Index(['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier_fl_num',
       'tail_num', 'origin', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'distance', 'month', 'day', 'latitude', 'longitude',
       'wdir', 'temp', 'maxt', 'visibility', 'wspd', 'heatindex', 'cloudcover',
       'mint', 'precip', 'snowdepth', 'sealevelpressure', 'dew', 'humidity',
       'wgust', 'precipcover', 'windchill', 'closest_airport', 'total_flights',
       'mean_tail_num_arr_delay', 'mean_tail_num_dep_delay',
       'mean_carrier_arr_delay', 'mean_carrier__dep_delay'],
      dtype='object')

In [32]:
flights_final['snow_type'] = np.where(3<=flights_final['snowdepth'],2,flights_final['snowdepth'])
flights_final['snow_type'] = np.where((flights_final['snowdepth']<3)&(1<=flights_final['snowdepth']),1,flights_final['snow_type'])
flights_final['snow_type'] = np.where((flights_final['snow_type']==2)|(flights_final['snow_type']==1),flights_final['snow_type'],0)
flights_final.drop('snowdepth', axis = 1, inplace = True)

In [33]:
flights_final[['windchill', 'heatindex','wgust']].median()

windchill     5.4
heatindex    85.6
wgust        44.7
dtype: float64

In [34]:
flights_final['windchill']=flights_final['windchill'].fillna(flights_final[['windchill', 'heatindex','wgust']].median()[0])
flights_final['heatindex']=flights_final['heatindex'].fillna(flights_final[['windchill', 'heatindex','wgust']].median()[1])
flights_final['wgust']=flights_final['wgust'].fillna(flights_final[['windchill', 'heatindex','wgust']].median()[2])

In [35]:
flights_final.shape

(663206, 36)

In [36]:
test_flight_clean=copy.deepcopy(flights_final)

In [37]:
test_flight_clean=test_flight_clean.dropna()

In [38]:
test_flight_clean.shape

(654725, 36)

In [39]:
test_flight_clean['arr_hour'] = test_flight_clean['crs_arr_time'].astype(int).astype(str).str.rjust(4,'0').str[:2]
test_flight_clean['dep_hour'] = test_flight_clean['crs_dep_time'].astype(int).astype(str).str.rjust(4,'0').str[:2]

In [40]:
test_flight_clean=test_flight_clean.drop(['closest_airport','crs_arr_time','crs_dep_time','dest_city_name','dew','maxt','tail_num','mint','mkt_unique_carrier','mean_carrier__dep_delay','mean_tail_num_dep_delay'],axis=1)

In [41]:
cat_list=['origin', 'dest', 'branded_code_share','mkt_carrier_fl_num','snow_type','month', 'day','arr_hour','dep_hour']

In [42]:
X_cat=copy.deepcopy(test_flight_clean[cat_list])
X_num=copy.deepcopy(test_flight_clean.drop(cat_list,axis=1))

In [43]:
X_cat['branded_code_share'] = pd.factorize(X_cat['branded_code_share'])[0]
#X_cat['mkt_carrier_fl_num'] = pd.factorize(X_cat['mkt_carrier_fl_num'])[0]
X_cat['origin'] = pd.factorize(X_cat['origin'])[0]
X_cat['dest'] = pd.factorize(X_cat['dest'])[0]

In [44]:
X_cat.columns

Index(['origin', 'dest', 'branded_code_share', 'mkt_carrier_fl_num',
       'snow_type', 'month', 'day', 'arr_hour', 'dep_hour'],
      dtype='object')

In [45]:
X_num.columns

Index(['distance', 'latitude', 'longitude', 'wdir', 'temp', 'visibility',
       'wspd', 'heatindex', 'cloudcover', 'precip', 'sealevelpressure',
       'humidity', 'wgust', 'precipcover', 'windchill', 'total_flights',
       'mean_tail_num_arr_delay', 'mean_carrier_arr_delay'],
      dtype='object')

In [46]:
set(X_cat.columns)&set(X_num.columns)

set()

In [47]:
X_cat.shape

(654725, 9)

In [48]:
X_num.shape

(654725, 18)

In [50]:
X_num=X_num.reset_index().drop('index',axis=1)

In [51]:
X_cat=X_cat.reset_index().drop(['index'],axis=1)

In [52]:
from sklearn.preprocessing import StandardScaler

In [53]:
num_scaled = StandardScaler().fit_transform(X_num)

In [54]:
X_num_std=pd.DataFrame(num_scaled)

In [55]:
X_num_std.columns=list(X_num.columns)

In [58]:
X_cleaned=pd.concat([X_num_std,X_cat],axis=1)

In [69]:
X_cleaned.shape

(654725, 27)

In [68]:
X_cleaned.columns

Index(['distance', 'latitude', 'longitude', 'wdir', 'temp', 'visibility',
       'wspd', 'heatindex', 'cloudcover', 'precip', 'sealevelpressure',
       'humidity', 'wgust', 'precipcover', 'windchill', 'total_flights',
       'mean_tail_num_arr_delay', 'mean_carrier_arr_delay', 'origin', 'dest',
       'branded_code_share', 'mkt_carrier_fl_num', 'snow_type', 'month', 'day',
       'arr_hour', 'dep_hour'],
      dtype='object')