In [1]:
import pandas as pd
import os
import datetime as dt
import numpy as np

In [2]:
# Run-wide parameters.
# `airport` is compared against the flights' `Dest` column further down.
# `station` is not referenced by any visible cell -- presumably the weather
# station identifier matching the airport; TODO confirm.
airport, station = 'DEN', 72565003017

# flights

In [3]:
# Directory that is walked for flight CSV files.
root = 'data/kden/flights/2018/'

# Accumulators filled by the following cells: discovered file paths and the
# DataFrames read from them.
file_list, csv_data = [], []

In [4]:
# Collect every CSV file below `root`, recursively.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot register the same path twice (which would load each file twice
# and duplicate rows downstream).
file_list = [
    os.path.join(path, name)
    for path, _subdirs, files in os.walk(root)
    for name in files
    if name.endswith('.csv')
]

In [5]:
# Columns kept when reading the flight CSVs (every other column is skipped).
flight_cols = [
    # flight identity
    'FlightDate',
    'Marketing_Airline_Network',
    'Flight_Number_Marketing_Airline',
    'Tail_Number',
    'Origin',
    'Dest',
    # departure
    'CRSDepTime',
    'DepTime',
    'DepDelay',
    'TaxiOut',
    'WheelsOff',
    # en route
    'AirTime',
    # arrival
    'WheelsOn',
    'TaxiIn',
    'CRSArrTime',
    'ArrTime',
    'ArrDelay',
    # status flags
    'Cancelled',
    'CancellationCode',
    'Diverted',
    # delay breakdown
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]

In [6]:
# Intended dtype per flight column.
# NOTE(review): no visible cell passes this mapping to read_csv. Several of the
# columns typed `int` here (e.g. DepTime) show missing values in the
# `data.info()` output further down, so a plain `int` would not be able to
# hold them if this mapping were ever applied -- confirm before using.
flight_dtypes = dict(
    Marketing_Airline_Network=object,
    Flight_Number_Marketing_Airline=int,
    Tail_Number=object,
    Origin=object,
    Dest=object,
    CRSDepTime=int,
    DepTime=int,
    DepDelay=float,
    TaxiOut=int,
    WheelsOff=int,
    WheelsOn=int,
    TaxiIn=int,
    CRSArrTime=int,
    ArrTime=int,
    ArrDelay=float,
    Cancelled=int,
    CancellationCode=object,
    Diverted=int,
    CarrierDelay=float,
    WeatherDelay=float,
    NASDelay=float,
    SecurityDelay=float,
    LateAircraftDelay=float,
)

In [7]:
# Read every discovered flight CSV (in sorted path order), keeping only the
# columns named in `flight_cols` and parsing FlightDate as a datetime.
# The list is built in one expression instead of appended to, so re-running
# this cell does not load each file a second time and duplicate rows.
# A callable `usecols` is used so a file lacking one of the columns is still
# readable; undecodable bytes are dropped (encoding_errors='ignore').
csv_data = [
    pd.read_csv(
        file,
        usecols=lambda c: c in flight_cols,
        parse_dates=['FlightDate'],
        encoding_errors='ignore',
        index_col=None,
        header=0,
        low_memory=False,
    )
    for file in sorted(file_list)
]

In [8]:
# Stack the per-file frames into one table; ignore_index=True replaces each
# file's own row index with a single fresh 0..n-1 range.
flights = pd.concat(csv_data, ignore_index=True)

In [9]:
# Keep only flights whose destination is the configured airport.
# .copy() makes the result an independent frame: the following cells assign
# new values into its columns, which on a bare boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
flights = flights[flights['Dest'] == airport].copy()

In [10]:
# Render the scheduled arrival time as text and left-pad it with zeros to four
# characters (e.g. '530' -> '0530') so the next cell can parse it as %H%M.
flights['CRSArrTime'] = flights['CRSArrTime'].astype(str).str.zfill(4)

In [11]:
# Parse the four-character HHMM strings and keep only the time-of-day part
# (datetime.time objects); the calendar date is attached in the next cell.
arrival_clock = pd.to_datetime(flights['CRSArrTime'], format='%H%M')
flights['CRSArrTime'] = arrival_clock.dt.time

In [12]:
# Combine the flight date with the scheduled arrival time-of-day into one
# (still timezone-naive) timestamp.
# Only the two columns involved are converted to text; the previous form cast
# the entire DataFrame to str twice just to read these two columns.
# NOTE(review): the date used is FlightDate as-is. If FlightDate is the
# departure date, a flight scheduled to land after midnight would be stamped
# one day early -- confirm against the data source.
flights['CRSArrTime'] = pd.to_datetime(
    flights['FlightDate'].dt.strftime('%Y-%m-%d') + flights['CRSArrTime'].astype(str),
    format='%Y-%m-%d%H:%M:%S',
)

In [13]:
# Attach the US/Mountain time zone to the naive scheduled arrival timestamps.
# `ambiguous` takes one boolean per row and only matters for wall-clock times
# that occur twice (the repeated hour when DST ends): False resolves them as
# standard (non-DST) time, True would resolve them as DST. Every row is False.
standard_time_mask = np.zeros(len(flights), dtype=bool)
flights['CRSArrTime'] = flights['CRSArrTime'].dt.tz_localize(
    'US/Mountain', ambiguous=standard_time_mask
)

In [14]:
# Add a UTC rendering of the scheduled arrival time; this column is the key
# used to align flights with the weather observations.
flights = flights.assign(CRSArrTimeUTC=flights['CRSArrTime'].dt.tz_convert('UTC'))

In [15]:
# print(flights[['CRSArrTime', 'CRSArrTimeUTC']])

In [16]:
# Order flights chronologically by scheduled UTC arrival; the merge_asof call
# later in the notebook uses this column as its left key and needs it sorted.
flights = flights.sort_values('CRSArrTimeUTC', ascending=True)

# weather

In [17]:
# Directory that is walked for weather CSV files.
root = 'data/kden/weather/2018/'

# Fresh accumulators for the weather pass (these names were previously used
# for the flight files).
file_list, csv_data = [], []

In [18]:
# Columns kept when reading the weather CSVs (every other column is skipped).
weather_cols = ['DATE', 'REPORT_TYPE', 'CALL_SIGN', 'VIS', 'REM']

In [19]:
# Collect every CSV file below `root`, recursively.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot register the same path twice (which would load each file twice
# and duplicate rows downstream).
file_list = [
    os.path.join(path, name)
    for path, _subdirs, files in os.walk(root)
    for name in files
    if name.endswith('.csv')
]

In [20]:
# Read every discovered weather CSV (in sorted path order), keeping only the
# columns named in `weather_cols` and parsing DATE as a datetime.
# The list is built in one expression instead of appended to, so re-running
# this cell does not load each file a second time and duplicate rows.
# A callable `usecols` is used so a file lacking one of the columns is still
# readable; undecodable bytes are dropped (encoding_errors='ignore').
csv_data = [
    pd.read_csv(
        file,
        usecols=lambda c: c in weather_cols,
        parse_dates=['DATE'],
        encoding_errors='ignore',
        index_col=None,
        header=0,
        low_memory=False,
    )
    for file in sorted(file_list)
]

In [21]:
# Stack the per-file frames into one table; ignore_index=True replaces each
# file's own row index with a single fresh 0..n-1 range.
weather = pd.concat(csv_data, ignore_index=True)

In [22]:
# Keep only observations whose report type is FM-15 or FM-16
# (presumably the routine and special aviation reports -- TODO confirm).
weather = weather[weather['REPORT_TYPE'].isin(['FM-15', 'FM-16'])]

In [23]:
# VIS is a comma-separated composite field. Split it into its positional parts,
# name the first four, and keep only the distance (VIS_METERS) and the third
# part (VIS_V); the raw VIS string and the two parts named VIS_Q / VIS_QV are
# dropped.
vis_parts = weather['VIS'].str.split(',', expand=True)
vis_parts = vis_parts.rename(columns={0: 'VIS_METERS', 1: 'VIS_Q', 2: 'VIS_V', 3: 'VIS_QV'})
weather = pd.concat([weather, vis_parts], axis=1).drop(columns=['VIS', 'VIS_Q', 'VIS_QV'])

# The split yields strings; convert the distance to a numeric dtype.
# NOTE(review): if the source encodes "missing" as a sentinel such as 999999,
# it is kept as a real distance here -- confirm and mask if needed.
weather['VIS_METERS'] = pd.to_numeric(weather['VIS_METERS'])

In [24]:
# Make the observation timestamps timezone-aware in UTC so they share a dtype
# with flights' CRSArrTimeUTC for the as-of merge.
# `utc` is a boolean flag: the previous string 'True' only worked because any
# non-empty string is truthy.
# NOTE(review): naive timestamps are labelled as UTC, not shifted -- this
# assumes the source already reports UTC; confirm.
weather['DATE'] = pd.to_datetime(weather['DATE'], utc=True)

In [25]:
# Narrow the weather table to the columns carried into the merge:
# observation time, visibility distance, and the REM column.
weather = weather.loc[:, ['DATE', 'VIS_METERS', 'REM']]

# merge flights & weather

In [26]:
# Attach to each flight the weather observation whose timestamp is closest
# (earlier or later) to the scheduled UTC arrival.
# merge_asof requires both inputs to be sorted on their key. `flights` is
# sorted explicitly in an earlier cell; `weather` previously relied on the
# input files happening to be in chronological order, so it is sorted here.
# A stable sort (mergesort) keeps the original relative order of observations
# sharing the same timestamp, so already-sorted input gives the same result.
data = pd.merge_asof(
    left=flights,
    right=weather.sort_values('DATE', kind='mergesort'),
    left_on='CRSArrTimeUTC',
    right_on='DATE',
    direction='nearest',
)

In [27]:
# Sanity check of the merged table: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274216 entries, 0 to 274215
Data columns (total 29 columns):
 #   Column                           Non-Null Count   Dtype                      
---  ------                           --------------   -----                      
 0   FlightDate                       274216 non-null  datetime64[ns]             
 1   Marketing_Airline_Network        274216 non-null  object                     
 2   Flight_Number_Marketing_Airline  274216 non-null  int64                      
 3   Tail_Number                      273016 non-null  object                     
 4   Origin                           274216 non-null  object                     
 5   Dest                             274216 non-null  object                     
 6   CRSDepTime                       274216 non-null  int64                      
 7   DepTime                          271876 non-null  float64                    
 8   DepDelay                         271866 non-null  floa

In [28]:
# Persist the merged flights + weather table (row index included as the first
# CSV column). to_csv does not create missing parent directories, so make sure
# the output folder exists first; exist_ok=True makes this safe to re-run.
os.makedirs('outputs', exist_ok=True)
data.to_csv('outputs/test.csv')