In [1]:
import random
import copy
import datetime
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler

In [2]:
# Directory (with trailing slash) that holds every CSV read by this notebook.
path = "/Users/carlyfennell/DataScience/data_bootcamp/data/"

## Reading in Test Flights

`column_types` was found by going through an optimization process to identify the optimal data types for the variables in the flights table. Passing it to `pd.read_csv` is aimed at saving memory and decreasing computation time.

In [3]:
# Compact dtypes handed to pd.read_csv for the flights table:
# small unsigned ints for numeric ids/times, 'category' for repeated strings.
column_types = dict(
    mkt_carrier_fl_num='uint16',
    op_unique_carrier='category',
    tail_num='category',
    op_carrier_fl_num='uint16',
    origin_airport_id='uint16',
    origin='category',
    dest_airport_id='uint16',
    dest='category',
    crs_dep_time='uint16',
    dep_time='float32',
)

In [4]:
# Load the test flights with the compact dtypes above; fl_date is parsed as a date.
flights_final = pd.read_csv(
    path + 'test_flights.csv',
    dtype=column_types,
    parse_dates=['fl_date'],
    infer_datetime_format=True,
)

In [5]:
# (rows, columns) of the test flights exactly as loaded from the CSV.
flights_final.shape

(660556, 20)

In [6]:
# Keep only flights with no missing value in any column.
flights_clean = flights_final.dropna(axis=0, how='any')

In [7]:
# (rows, columns) left after removing rows that contain missing values.
flights_clean.shape

(659057, 20)

In [8]:
# Dropping unnecessary columns
unneeded_cols = [
    'op_unique_carrier', 'mkt_carrier', 'dup', 'crs_elapsed_time', 'flights',
    'origin_city_name', 'dest_airport_id', 'origin_airport_id', 'op_carrier_fl_num',
]
flights_clean = flights_clean.drop(columns=unneeded_cols)

In [9]:
# Derive month and day-of-month features from the flight date
# (the year was judged irrelevant, so it is not extracted).
flight_dates = flights_clean['fl_date'].astype('datetime64[ns]')
flights_clean['fl_date'] = flight_dates
flights_clean['month'] = flight_dates.dt.month
flights_clean['day'] = flight_dates.dt.day

In [10]:
# The raw date is no longer needed once month/day have been extracted.
flights_clean = flights_clean.drop(columns=['fl_date'])

# Read in Weather/Airport Key - Compute Weather features

In [11]:
# Load the weather lookup table, discarding 'Unnamed: 0' (presumably a saved
# index column -- confirm) and 'period'.
weather_key = pd.read_csv(path + 'wkey.csv').drop(columns=['Unnamed: 0', 'period'])

In [12]:
# Attach the weather features to each flight by matching on (origin, month).
# NOTE(review): how='outer' also keeps (origin, month) pairs that appear only
# in the weather key; those rows carry no flight data. Confirm whether
# how='left' was intended.
weather_cols = [
    'origin', 'month', 'latitude', 'longitude', 'wdir', 'temp',
    'maxt', 'visibility', 'wspd', 'heatindex', 'cloudcover', 'mint',
    'precip', 'snowdepth', 'sealevelpressure', 'dew', 'humidity', 'wgust',
    'precipcover', 'windchill', 'closest_airport',
]
new_df = flights_clean.merge(
    weather_key[weather_cols],
    how='outer',
    on=['origin', 'month'],
)

## Read in Airport Traffic Key - Compute aggregate traffic features

In [13]:
# Load the airport-traffic lookup table, discarding the 'Unnamed: 0' column.
busy_feat = pd.read_csv(path + 'busy_feature.csv').drop(columns='Unnamed: 0')

In [14]:
# Attach the traffic features to each row by origin airport.
# NOTE(review): how='outer' also keeps origins that appear only in the
# traffic key; confirm whether how='left' was intended.
flights_clean = new_df.merge(busy_feat, how='outer', on=['origin'])

# Build Carrier/Tail_num Features from keys

In [15]:
# Load the per-tail-number delay lookup table, discarding the 'Unnamed: 0' column.
delay_df = pd.read_csv(path + 'delay_Tail_key.csv').drop(columns='Unnamed: 0')

In [16]:
# Left-join the tail-number delay features onto the flights.
# Note: this rebinds flights_final, which previously held the raw CSV load.
flights_final = pd.merge(flights_clean, delay_df, how='left', on=['tail_num'])

In [17]:
# Load the per-carrier delay lookup table, discarding the 'Unnamed: 0' column.
carrier_delay = pd.read_csv(path + 'delay_carrier_key.csv').drop(columns='Unnamed: 0')

In [18]:
# Left-join the carrier delay features onto the flights.
flights_clean = pd.merge(flights_final, carrier_delay, how='left', on=['mkt_unique_carrier'])

In [19]:
# (rows, columns) once every lookup table has been merged in.
flights_clean.shape

(663206, 36)

## Now that all features are merged - need to check data and clean

#### Clean up weather vars - fill nulls and delete rows

In [20]:
# A missing snow depth is treated as no snow on the ground.
flights_clean = flights_clean.assign(snowdepth=flights_clean['snowdepth'].fillna(value=0))

In [21]:
# Bucket snow depth into a three-level category, because the range is small
# and most values are 0 or close to 0:
#   depth >= 3       -> 2
#   1 <= depth < 3   -> 1
#   anything else    -> 0
# Float choices keep the column's float dtype.
snow_depth = flights_clean['snowdepth']
flights_clean['snow_type'] = np.select(
    [snow_depth >= 3, snow_depth >= 1],  # first matching condition wins
    [2.0, 1.0],
    default=0.0,
)
flights_clean = flights_clean.drop(columns='snowdepth')

In [22]:
# Per-column medians (missing values are skipped by default) of the three
# weather variables that still contain nulls.
flights_clean[['windchill', 'heatindex','wgust']].median()

windchill     5.4
heatindex    85.6
wgust        44.7
dtype: float64

In [23]:
# Fill missing weather values with each column's own median.
# The median is looked up per column by label rather than by integer position
# in a label-indexed Series (positional access there is deprecated in recent
# pandas), and each median is computed once instead of recomputing all three
# for every assignment.
for weather_col in ['windchill', 'heatindex', 'wgust']:
    flights_clean[weather_col] = flights_clean[weather_col].fillna(
        flights_clean[weather_col].median()
    )

In [24]:
# (rows, columns) after the weather clean-up; filling values does not remove rows.
flights_clean.shape

(663206, 36)

In [25]:
# Work on an independent copy so flights_clean itself stays untouched.
test_flight_clean = flights_clean.copy(deep=True)

In [26]:
# Remove every row that still has a missing value in any column.
test_flight_clean = test_flight_clean.dropna(axis=0, how='any')

In [27]:
# (rows, columns) remaining after dropping rows with missing values.
test_flight_clean.shape

(656253, 36)

## Setting and encoding our Categorical Variables

In [28]:
# Turn the scheduled HHMM times into categorical hour labels: the value is
# left-padded with zeros to four characters and the first two are kept
# (e.g. 905 -> '0905' -> '09'). The resulting columns hold strings.
for hour_col, time_col in (('arr_hour', 'crs_arr_time'), ('dep_hour', 'crs_dep_time')):
    hhmm_text = test_flight_clean[time_col].astype(int).astype(str).str.rjust(4, '0')
    test_flight_clean[hour_col] = hhmm_text.str[:2]

In [29]:
# Dropping columns with high correlation to one another
redundant_cols = [
    'closest_airport', 'crs_arr_time', 'crs_dep_time', 'dest_city_name',
    'dew', 'maxt', 'tail_num', 'mint', 'mkt_unique_carrier',
    'mean_carrier__dep_delay', 'mean_tail_num_dep_delay',
]
test_flight_clean = test_flight_clean.drop(columns=redundant_cols)

In [30]:
# Categorical variables: these are kept out of the numeric scaling step.
cat_list = [
    'origin',
    'dest',
    'branded_code_share',
    'mkt_carrier_fl_num',
    'snow_type',
    'month',
    'day',
    'arr_hour',
    'dep_hour',
]

In [31]:
# Split the features into a categorical frame and a numerical frame so that
# only the numerical one is scaled. Both are independent copies.
X_cat = test_flight_clean[cat_list].copy(deep=True)
X_num = test_flight_clean.drop(columns=cat_list).copy(deep=True)

In [32]:
# Integer-encode the string-valued categoricals with pd.factorize.
# mkt_carrier_fl_num is deliberately left as-is (its encoding was disabled).
# NOTE(review): the codes are derived from this frame alone, and origin/dest
# are encoded independently; confirm they line up with whatever encoding the
# model was trained on.
for label_col in ('branded_code_share', 'origin', 'dest'):
    X_cat[label_col] = pd.factorize(X_cat[label_col])[0]

In [33]:
# Sanity check: no column may sit in both the categorical and numerical frames
# (an empty set is the expected result).
set(X_cat.columns).intersection(X_num.columns)

set()

In [34]:
# Give both frames a fresh 0..n-1 index so the column-wise concatenation with
# the scaled array (which gets a default index) lines up row for row.
# drop=True discards the old index directly, instead of materialising it as an
# 'index' column and dropping that by name. That route fails when the index is
# named and would remove a genuine column called 'index'.
X_num = X_num.reset_index(drop=True)
X_cat = X_cat.reset_index(drop=True)

## Scaling Numerical variables

In [35]:
# Standardise the numerical features.
# NOTE(review): the scaler is fitted on this data set itself; confirm that is
# intended rather than reusing a scaler fitted on the training data.
scaler = StandardScaler()
num_scaled = scaler.fit(X_num).transform(X_num)

In [36]:
# Wrap the scaled array back into a DataFrame (default integer column labels).
X_num_std = pd.DataFrame(data=num_scaled)

In [37]:
# Restore the original feature names on the scaled frame.
X_num_std.columns = X_num.columns.tolist()

## Concatenate final dataframe to be used in training/testing

In [38]:
# Place the scaled numerical columns and the categorical columns side by side.
X_cleaned = pd.concat((X_num_std, X_cat), axis='columns')

In [39]:
# (rows, columns) of the final feature matrix.
X_cleaned.shape

(656253, 27)

In [40]:
# Final feature names: scaled numerical columns first, then the categoricals.
X_cleaned.columns

Index(['distance', 'latitude', 'longitude', 'wdir', 'temp', 'visibility',
       'wspd', 'heatindex', 'cloudcover', 'precip', 'sealevelpressure',
       'humidity', 'wgust', 'precipcover', 'windchill', 'total_flights',
       'mean_tail_num_arr_delay', 'mean_carrier_arr_delay', 'origin', 'dest',
       'branded_code_share', 'mkt_carrier_fl_num', 'snow_type', 'month', 'day',
       'arr_hour', 'dep_hour'],
      dtype='object')

In [41]:
# Preview the first rows of the final feature matrix.
X_cleaned.head()

Unnamed: 0,distance,latitude,longitude,wdir,temp,visibility,wspd,heatindex,cloudcover,precip,...,mean_carrier_arr_delay,origin,dest,branded_code_share,mkt_carrier_fl_num,snow_type,month,day,arr_hour,dep_hour
0,-0.696242,-0.453005,-1.186293,-0.895814,1.110744,-0.618641,-0.404148,-1.592541,0.030473,0.985964,...,-1.641298,0,0,0,5888.0,0.0,1,1.0,19,18
1,-0.696242,-0.453005,-1.186293,-0.895814,1.110744,-0.618641,-0.404148,-1.592541,0.030473,0.985964,...,-1.641298,0,0,0,6276.0,0.0,1,1.0,13,11
2,-0.748027,-0.453005,-1.186293,-0.895814,1.110744,-0.618641,-0.404148,-1.592541,0.030473,0.985964,...,-1.641298,0,1,0,4598.0,0.0,1,1.0,21,20
3,-0.748027,-0.453005,-1.186293,-0.895814,1.110744,-0.618641,-0.404148,-1.592541,0.030473,0.985964,...,-1.641298,0,1,0,4761.0,0.0,1,1.0,14,13
4,-0.748027,-0.453005,-1.186293,-0.895814,1.110744,-0.618641,-0.404148,-1.592541,0.030473,0.985964,...,-1.641298,0,1,0,5162.0,0.0,1,1.0,10,9
