# UNITED AIRLINES DATA EXPLORER

In [1]:
import pandas as pd

**LOAD NOAA STATION METARS. THE CSV FILE IS ALREADY PRE-FILTERED TO THE STATION IDS THAT APPEAR IN UNITED'S REPORT**

In [2]:
# Load the NOAA METAR observations. The first CSV column is forced to str so
# that leading zeros in the identifier survive parsing.
metar = pd.read_csv('data/united/united-metar-data.csv', dtype={0: str})

# `utc` expects a real boolean. The string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled UTC as well).
metar['DATE'] = pd.to_datetime(metar['DATE'], utc=True)

# Left-pad station ids to 11 characters so they compare equal to the ids in
# the airport/station cross-reference, which is padded the same way.
metar['STATION'] = metar['STATION'].str.zfill(11)

# The pd.merge_asof calls further down use DATE as the right-hand key, which
# must be sorted.
metar = metar.sort_values(by='DATE')

# VIS is a comma-separated composite field; only its first element is kept,
# as a number, under the name VIS_METERS. Taking the first element directly
# avoids materialising (and then positionally dropping) the other pieces, and
# does not depend on every row splitting into exactly four parts.
# NOTE(review): NOAA ISD appears to use 999999 as the "missing" visibility
# sentinel -- confirm before using VIS_METERS numerically.
metar['VIS_METERS'] = pd.to_numeric(metar['VIS'].str.split(',', n=1).str[0])

# Drop the raw composite weather fields that are not used downstream.
metar = metar.drop(columns=['WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP'])

In [3]:
# Print the index range, column dtypes, and memory usage of the cleaned METAR frame.
metar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4388321 entries, 530059 to 3809503
Data columns (total 5 columns):
 #   Column       Dtype              
---  ------       -----              
 0   STATION      object             
 1   DATE         datetime64[ns, UTC]
 2   REPORT_TYPE  object             
 3   REM          object             
 4   VIS_METERS   int64              
dtypes: datetime64[ns, UTC](1), int64(1), object(3)
memory usage: 200.9+ MB


**LOAD CROSS REFERENCE OF AIRPORT CODES & STATIONID THAT OCCUR IN UNITED DATA**

In [4]:
# The cross-reference file has no header row: column 0 is the airport code and
# column 1 the station id. Both are read as text.
stations = pd.read_csv(
    'data/united/station-list.csv',
    header=None,
    names=['AIRPORT', 'STATION'],
    dtype={0: str, 1: str},
)

# Left-pad station ids to 11 characters, matching the METAR frame's STATION column.
stations = stations.assign(STATION=stations['STATION'].str.zfill(11))

stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   AIRPORT  184 non-null    object
 1   STATION  184 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB


**LOAD UNITED FLIGHT DATA**

In [5]:
# Read the United flight records; column dtypes are left to pandas' inference
# here and the timestamp columns are parsed in a later cell.
flights = pd.read_csv('data/united/united-flights.csv')

In [6]:
# Remove every space character from the airport-code columns, then give the
# unnamed leading CSV column an explicit name.
flights = (
    flights
    .assign(**{
        airport_col: flights[airport_col].str.replace(' ', '')
        for airport_col in ('DEST', 'ORIGIN')
    })
    .rename(columns={'Unnamed: 0': 'INDEX'})
)

In [7]:
# Parse every timestamp column as timezone-aware UTC.
# - `utc` is passed as a real boolean; the previous string 'True' only worked
#   because any non-empty string is truthy.
# - errors='coerce' turns unparseable values into NaT instead of raising, so
#   these columns may contain nulls afterwards.
timestamp_columns = [
    'DESTOBSERVATIONTIME',
    'ORIGINOBSERVATIONTIME',
    'DEPARTUREDATE_ZULU',
    'ARRIVALDATE_ZULU',
    'OUT_ZULU',
    'IN_ZULU',
]

for timestamp_column in timestamp_columns:
    flights[timestamp_column] = pd.to_datetime(
        flights[timestamp_column], utc=True, errors='coerce'
    )

In [8]:
# Print column dtypes and non-null counts after timestamp parsing; nulls in the
# datetime columns include values that errors='coerce' turned into NaT.
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5769 entries, 0 to 5768
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   INDEX                  5769 non-null   int64              
 1   FLIGHTDATE             5769 non-null   int64              
 2   FLIGHTNO               5769 non-null   int64              
 3   ORIGIN                 5769 non-null   object             
 4   DEST                   5769 non-null   object             
 5   DEPARTUREDATE_ZULU     5769 non-null   datetime64[ns, UTC]
 6   ARRIVALDATE_ZULU       5769 non-null   datetime64[ns, UTC]
 7   ORIGINOBSERVATIONTIME  5593 non-null   datetime64[ns, UTC]
 8   DESTOBSERVATIONTIME    5769 non-null   datetime64[ns, UTC]
 9   OUT_ZULU               5764 non-null   datetime64[ns, UTC]
 10  IN_ZULU                5745 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](6), int64(3), object(2)
memory us

**CROSS REFERENCE AIRPORT CODE AND MERGE STATIONID INTO DATAFRAME**

In [9]:
# Left-join the station cross-reference on the destination airport code. This
# adds the AIRPORT and STATION columns, which the next cell renames and drops.
flights = flights.merge(stations, how='left', left_on='DEST', right_on='AIRPORT')

In [10]:
# Finish the destination lookup from the previous cell (STATION -> DEST_STATION,
# drop the join key), then repeat the same left join for the origin airport.
flights = (
    flights
    .rename(columns={'STATION': 'DEST_STATION'})
    .drop(columns=['AIRPORT'])
    .merge(stations, how='left', left_on='ORIGIN', right_on='AIRPORT')
    .drop(columns=['AIRPORT'])
    .rename(columns={'STATION': 'ORIGIN_STATION'})
)

In [11]:
# Print the schema again to check the DEST_STATION / ORIGIN_STATION columns
# added by the two station merges and that the row count is unchanged.
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5769 entries, 0 to 5768
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   INDEX                  5769 non-null   int64              
 1   FLIGHTDATE             5769 non-null   int64              
 2   FLIGHTNO               5769 non-null   int64              
 3   ORIGIN                 5769 non-null   object             
 4   DEST                   5769 non-null   object             
 5   DEPARTUREDATE_ZULU     5769 non-null   datetime64[ns, UTC]
 6   ARRIVALDATE_ZULU       5769 non-null   datetime64[ns, UTC]
 7   ORIGINOBSERVATIONTIME  5593 non-null   datetime64[ns, UTC]
 8   DESTOBSERVATIONTIME    5769 non-null   datetime64[ns, UTC]
 9   OUT_ZULU               5764 non-null   datetime64[ns, UTC]
 10  IN_ZULU                5745 non-null   datetime64[ns, UTC]
 11  DEST_STATION           5769 non-null   object           

**CREATE A NEW DATAFRAME FOR EACH TIMESTAMP COLUMN, KEEPING THE 'INDEX' COLUMN IN EACH.**

In [12]:
# Origin-side slice keyed on DEPARTUREDATE_ZULU: rows without a departure time
# are removed, and the frame is sorted by that time because pd.merge_asof
# needs its left key sorted.
ddz = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'ORIGIN', 'ORIGIN_STATION', 'DEPARTUREDATE_ZULU']]
    .dropna(subset=['DEPARTUREDATE_ZULU'])
    .sort_values(by='DEPARTUREDATE_ZULU')
)

In [13]:
# Destination-side slice keyed on ARRIVALDATE_ZULU: rows without an arrival
# time are removed, and the frame is sorted by that time because pd.merge_asof
# needs its left key sorted.
adz = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'DEST', 'DEST_STATION', 'ARRIVALDATE_ZULU']]
    # `subset` is given as a list, matching the ddz cell; a bare string is
    # only accepted by newer pandas releases.
    .dropna(subset=['ARRIVALDATE_ZULU'])
    .sort_values(by='ARRIVALDATE_ZULU')
)

In [14]:
# Origin-side slice keyed on ORIGINOBSERVATIONTIME: rows without an observation
# time are removed, and the frame is sorted by that time because pd.merge_asof
# needs its left key sorted.
oot = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'ORIGIN', 'ORIGIN_STATION', 'ORIGINOBSERVATIONTIME']]
    # `subset` is given as a list, matching the ddz cell; a bare string is
    # only accepted by newer pandas releases.
    .dropna(subset=['ORIGINOBSERVATIONTIME'])
    .sort_values(by='ORIGINOBSERVATIONTIME')
)

In [15]:
# Destination-side slice keyed on DESTOBSERVATIONTIME: rows without an
# observation time are removed, and the frame is sorted by that time because
# pd.merge_asof needs its left key sorted.
dot = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'DEST', 'DEST_STATION', 'DESTOBSERVATIONTIME']]
    # `subset` is given as a list, matching the ddz cell; a bare string is
    # only accepted by newer pandas releases.
    .dropna(subset=['DESTOBSERVATIONTIME'])
    .sort_values(by='DESTOBSERVATIONTIME')
)

In [16]:
# Origin-side slice keyed on OUT_ZULU: rows without an OUT time are removed,
# and the frame is sorted by that time because pd.merge_asof needs its left
# key sorted.
otz = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'ORIGIN', 'ORIGIN_STATION', 'OUT_ZULU']]
    # `subset` is given as a list, matching the ddz cell; a bare string is
    # only accepted by newer pandas releases.
    .dropna(subset=['OUT_ZULU'])
    .sort_values(by='OUT_ZULU')
)

In [17]:
# Destination-side slice keyed on IN_ZULU: rows without an IN time are removed,
# and the frame is sorted by that time because pd.merge_asof needs its left
# key sorted.
inz = (
    flights[['INDEX', 'FLIGHTDATE', 'FLIGHTNO', 'DEST', 'DEST_STATION', 'IN_ZULU']]
    # `subset` is given as a list, matching the ddz cell; a bare string is
    # only accepted by newer pandas releases.
    .dropna(subset=['IN_ZULU'])
    .sort_values(by='IN_ZULU')
)

**MERGE METAR DATA BY 'DESTOBSERVATIONTIME'**

In [18]:
# For each flight, attach the METAR row from the destination station whose DATE
# is nearest to DESTOBSERVATIONTIME, and expose its remarks as DEST_UA_OBS.
# No `tolerance` is set, so the nearest row is used however far away in time.
dot1 = pd.merge_asof(
    dot,
    metar,
    left_on='DESTOBSERVATIONTIME',
    right_on='DATE',
    left_by='DEST_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'DEST_UA_OBS'})

**MERGE METAR DATA BY 'ORIGINOBSERVATIONTIME'**

In [19]:
# For each flight, attach the METAR row from the origin station whose DATE is
# nearest to ORIGINOBSERVATIONTIME, and expose its remarks as ORIGIN_UA_OBS.
# No `tolerance` is set, so the nearest row is used however far away in time.
oot1 = pd.merge_asof(
    oot,
    metar,
    left_on='ORIGINOBSERVATIONTIME',
    right_on='DATE',
    left_by='ORIGIN_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'ORIGIN_UA_OBS'})

**MERGE METAR DATA BY 'DEPARTUREDATE_ZULU'**

In [20]:
# For each flight, attach the METAR row from the origin station whose DATE is
# nearest to DEPARTUREDATE_ZULU, and expose its remarks as SCH_DEPARTURE.
# No `tolerance` is set, so the nearest row is used however far away in time.
ddz1 = pd.merge_asof(
    ddz,
    metar,
    left_on='DEPARTUREDATE_ZULU',
    right_on='DATE',
    left_by='ORIGIN_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'SCH_DEPARTURE'})

**MERGE METAR DATA BY 'ARRIVALDATE_ZULU'**

In [21]:
# For each flight, attach the METAR row from the destination station whose DATE
# is nearest to ARRIVALDATE_ZULU, and expose its remarks as SCH_ARRIVAL.
# No `tolerance` is set, so the nearest row is used however far away in time.
adz1 = pd.merge_asof(
    adz,
    metar,
    left_on='ARRIVALDATE_ZULU',
    right_on='DATE',
    left_by='DEST_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'SCH_ARRIVAL'})

**MERGE METAR DATA BY 'OUT_ZULU'**

In [22]:
# For each flight, attach the METAR row from the origin station whose DATE is
# nearest to OUT_ZULU, and expose its remarks as ACT_DEPARTURE.
# No `tolerance` is set, so the nearest row is used however far away in time.
otz1 = pd.merge_asof(
    otz,
    metar,
    left_on='OUT_ZULU',
    right_on='DATE',
    left_by='ORIGIN_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'ACT_DEPARTURE'})

**MERGE METAR DATA BY 'IN_ZULU'**

In [23]:
# For each flight, attach the METAR row from the destination station whose DATE
# is nearest to IN_ZULU, and expose its remarks as ACT_ARRIVAL.
# No `tolerance` is set, so the nearest row is used however far away in time.
inz1 = pd.merge_asof(
    inz,
    metar,
    left_on='IN_ZULU',
    right_on='DATE',
    left_by='DEST_STATION',
    right_by='STATION',
    direction='nearest',
).rename(columns={'REM': 'ACT_ARRIVAL'})