In [1]:
import pandas as pd

In [2]:
# Load the raw METAR observations. Column 0 is read as text rather than being
# type-inferred (per the info() output further down, that column is STATION).
metar = pd.read_csv(
    'data/united/united-metar-data.csv',
    dtype={0: str},
)

In [3]:
# Parse the observation timestamps into timezone-aware UTC datetimes.
# `utc` is a boolean flag, so pass a real bool: a string such as 'True' is
# merely truthy (and 'False' would be truthy as well).
metar['DATE'] = pd.to_datetime(metar['DATE'], utc=True)

In [4]:
# Left-pad station identifiers with zeros to a fixed width of 11 characters so
# they compare equal to the identically padded ids in the station list.
metar['STATION'] = metar.STATION.str.zfill(width=11)

In [5]:
# Order observations chronologically; pd.merge_asof (used further down to
# attach weather to flights) requires both frames to be sorted on the join key.
metar = metar.sort_values('DATE')

In [6]:
# Extract the leading field of the comma-separated VIS string as a number.
# The new column is assigned by name, so re-running this cell overwrites
# VIS_METERS instead of appending a duplicate column, and no positional
# bookkeeping of the remaining fields (temporary integer-labelled columns that
# must be dropped) is needed.
# NOTE(review): the unit is implied by the column name only; in the sample
# rows the value tracks the METAR visibility group (e.g. 0050 -> 50) - confirm.
metar = metar.assign(
    VIS_METERS=pd.to_numeric(metar['VIS'].str.split(',', expand=True)[0]),
)

In [7]:
# Extract fields 0 (direction angle) and 3 (speed) of the comma-separated WND
# string as numbers. Columns are assigned by name, so re-running this cell
# overwrites them instead of appending duplicates, and no temporary
# integer-labelled columns are left to drop.
# NOTE(review): in the sample rows WND_ANGLE is 999 where the METAR wind group
# is 00000KT, so 999 looks like a "no direction" sentinel rather than a real
# angle; the unit of WND_SPEED is not established here (51 appears next to a
# 10 KT report) - confirm both before modelling on these columns.
wnd_fields = metar['WND'].str.split(',', expand=True)
metar = metar.assign(
    WND_ANGLE=pd.to_numeric(wnd_fields[0]),
    WND_SPEED=pd.to_numeric(wnd_fields[3]),
)
del wnd_fields  # do not keep the full split table alive in the kernel

In [8]:
# Extract the leading field of the comma-separated CIG string as a number.
# The column is assigned by name, so re-running this cell overwrites
# CIG_HEIGHT instead of appending a duplicate, and no temporary
# integer-labelled columns are left to drop.
# NOTE(review): 99999 appears in the sample rows for reports without a ceiling
# layer (e.g. CAVOK / FEW only), so it is presumably a sentinel, not a height -
# confirm before treating the column as a measurement.
metar = metar.assign(
    CIG_HEIGHT=pd.to_numeric(metar['CIG'].str.split(',', expand=True)[0]),
)

In [9]:
# Extract the leading field of the comma-separated TMP string as a number.
# The column is assigned by name, so re-running this cell overwrites
# TMP_DEG_C instead of appending a duplicate, and no temporary
# integer-labelled column is left to drop.
# NOTE(review): despite the _DEG_C name, the sample rows show 210 next to a
# METAR temperature group of 21/13, which suggests the stored value is in
# tenths of a degree Celsius - confirm, and rescale downstream if so.
metar = metar.assign(
    TMP_DEG_C=pd.to_numeric(metar['TMP'].str.split(',', expand=True)[0]),
)

In [10]:
# Extract the leading field of the comma-separated DEW string as a number.
# The column is assigned by name, so re-running this cell overwrites
# DEW_DEG_C instead of appending a duplicate, and no temporary
# integer-labelled column is left to drop.
# NOTE(review): despite the _DEG_C name, the sample rows show 130 next to a
# METAR dew-point group of 21/13, which suggests the stored value is in
# tenths of a degree Celsius - confirm, and rescale downstream if so.
metar = metar.assign(
    DEW_DEG_C=pd.to_numeric(metar['DEW'].str.split(',', expand=True)[0]),
)

In [11]:
# Preview the first five observations together with the derived numeric columns.
metar.head(n=5)

Unnamed: 0,STATION,DATE,REPORT_TYPE,WND,CIG,VIS,TMP,DEW,SLP,REM,VIS_METERS,WND_ANGLE,WND_SPEED,CIG_HEIGHT,TMP_DEG_C,DEW_DEG_C
1488931,78762099999,2018-01-01 00:00:00+00:00,FM-15,"120,1,N,0051,1","99999,9,9,Y",9900199,2101,1301,999999,MET051METAR MROC 010000Z 12010KT CAVOK 21/13 A...,9900,120,51,99999,210,130
1473919,78583099999,2018-01-01 00:00:00+00:00,FM-15,"999,9,C,0000,1","99999,9,9,N",9999199,2401,2301,999999,MET063METAR MZBZ 010000Z 00000KT 9999 FEW018 2...,9999,999,0,99999,240,230
381935,46686099999,2018-01-01 00:00:00+00:00,FM-15,"060,1,N,0067,1","01829,1,C,N",9999199,1701,1201,999999,MET074METAR RCTP 010000Z 06013KT 9999 FEW020 S...,9999,60,67,1829,170,120
1479807,78720099999,2018-01-01 00:00:00+00:00,FM-15,"040,1,N,0021,1","99999,9,9,N",9999199,2101,1401,999999,MET063METAR MHTG 010000Z 04004KT 9999 FEW030 2...,9999,40,21,99999,210,140
365008,42181099999,2018-01-01 00:00:00+00:00,FM-15,"260,1,N,0015,1","99999,9,9,N",50199,801,701,999999,MET077METAR VIDP 010000Z 26003KT 0050 R28/0950...,50,260,15,99999,80,70


In [12]:
# Print the column dtypes and memory footprint of the METAR table.
metar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3254284 entries, 1488931 to 2852720
Data columns (total 16 columns):
 #   Column       Dtype              
---  ------       -----              
 0   STATION      object             
 1   DATE         datetime64[ns, UTC]
 2   REPORT_TYPE  object             
 3   WND          object             
 4   CIG          object             
 5   VIS          object             
 6   TMP          object             
 7   DEW          object             
 8   SLP          object             
 9   REM          object             
 10  VIS_METERS   int64              
 11  WND_ANGLE    int64              
 12  WND_SPEED    int64              
 13  CIG_HEIGHT   int64              
 14  TMP_DEG_C    int64              
 15  DEW_DEG_C    int64              
dtypes: datetime64[ns, UTC](1), int64(6), object(9)
memory usage: 422.1+ MB


In [13]:
# Read the headerless destination -> station lookup table; both columns are
# kept as text and given explicit names.
stations = pd.read_csv(
    'data/united/station-list.csv',
    header=None,
    names=['DEST', 'STATION'],
    dtype={0: str, 1: str},
)

In [14]:
# Left-pad station identifiers with zeros to 11 characters, the same width
# applied to the METAR table's STATION column.
stations['STATION'] = stations.STATION.str.zfill(width=11)

In [15]:
# Preview the first five destination -> station mappings.
stations.head(n=5)

Unnamed: 0,DEST,STATION
0,SAN,72290023188
1,OMA,72550014942
2,PTY,78792099999
3,MAD,8221099999
4,DEL,42181099999


In [16]:
# Print row count, non-null counts and dtypes of the station lookup table.
stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   DEST     132 non-null    object
 1   STATION  132 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB


In [17]:
# Load the raw flight records using pandas' default parsing.
flights = pd.read_csv(
    'data/united/united-flights.csv',
)

In [18]:
# Discard whatever column comes first in the file (presumably a saved row
# index - confirm against the CSV).
# NOTE(review): the column is chosen by position, so running this cell a
# second time removes a further column; re-run the load cell first.
flights = flights.drop(columns=flights.columns[0])

In [19]:
# Remove every space character from the destination codes (plain literal
# replacement) so they match the DEST keys of the station lookup.
flights['DEST'] = flights.DEST.str.replace(' ', '', regex=False)

In [20]:
# Parse the destination observation time into a timezone-aware UTC DATE
# column, the same dtype as the METAR table's DATE so the two can be joined.
# `utc` is a boolean flag, so pass a real bool: a string such as 'True' is
# merely truthy (and 'False' would be truthy as well).
flights['DATE'] = pd.to_datetime(flights['DESTOBSERVATIONTIME'], utc=True)

In [21]:
# The raw text timestamp has been parsed into DATE, so drop the original column.
flights = flights.drop('DESTOBSERVATIONTIME', axis='columns')

In [22]:
# Attach the station identifier for each flight's destination. A left join
# keeps every flight row, including any whose DEST has no entry in the lookup.
flights = flights.merge(stations, on='DEST', how='left')

In [23]:
# Print row count, non-null counts and dtypes after the station join
# (a non-null STATION count equal to the row count means every DEST matched).
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5769 entries, 0 to 5768
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   FLIGHTDATE  5769 non-null   int64              
 1   DEST        5769 non-null   object             
 2   DATE        5769 non-null   datetime64[ns, UTC]
 3   STATION     5769 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 225.4+ KB


In [24]:
# Order flights chronologically; pd.merge_asof in the join cell below
# requires both frames to be sorted on the join key.
flights = flights.sort_values('DATE')

In [25]:
# Preview the first five flights in chronological order.
flights.head(n=5)

Unnamed: 0,FLIGHTDATE,DEST,DATE,STATION
3,20180101,SAN,2018-01-01 15:51:00+00:00,72290023188
0,20180101,SAN,2018-01-01 17:20:00+00:00,72290023188
1,20180101,SAN,2018-01-01 17:20:00+00:00,72290023188
2,20180101,OMA,2018-01-02 06:26:00+00:00,72550014942
6,20180102,PTY,2018-01-02 19:00:00+00:00,78792099999


In [26]:
# For every flight, attach the METAR row of the same STATION whose DATE is
# closest (earlier or later) to the flight's DATE.
# NOTE(review): no `tolerance` is given, so the closest report is used however
# far away in time it is, and the result keeps only the flight's DATE, not the
# report's own timestamp - consider whether a time limit is needed.
data = pd.merge_asof(
    flights,
    metar,
    on='DATE',
    by='STATION',
    direction='nearest',
)

In [27]:
# Print row count, non-null counts and dtypes of the joined flights + weather table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5769 entries, 0 to 5768
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   FLIGHTDATE   5769 non-null   int64              
 1   DEST         5769 non-null   object             
 2   DATE         5769 non-null   datetime64[ns, UTC]
 3   STATION      5769 non-null   object             
 4   REPORT_TYPE  5769 non-null   object             
 5   WND          5769 non-null   object             
 6   CIG          5769 non-null   object             
 7   VIS          5769 non-null   object             
 8   TMP          5769 non-null   object             
 9   DEW          5769 non-null   object             
 10  SLP          5769 non-null   object             
 11  REM          5769 non-null   object             
 12  VIS_METERS   5769 non-null   int64              
 13  WND_ANGLE    5769 non-null   int64              
 14  WND_SPEED    5769 non-nu

In [28]:
# Preview the first five flights with their matched weather observation.
data.head(n=5)

Unnamed: 0,FLIGHTDATE,DEST,DATE,STATION,REPORT_TYPE,WND,CIG,VIS,TMP,DEW,SLP,REM,VIS_METERS,WND_ANGLE,WND_SPEED,CIG_HEIGHT,TMP_DEG_C,DEW_DEG_C
0,20180101,SAN,2018-01-01 15:51:00+00:00,72290023188,FM-15,"100,5,N,0015,5","00061,5,W,N","000000,A,N,A",1065,1065,102215,MET11801/01/18 07:51:02 METAR KSAN 011551Z 100...,0,100,15,61,106,106
1,20180101,SAN,2018-01-01 17:20:00+00:00,72290023188,FM-16,"999,9,C,0000,5","00061,5,W,N","000000,A,N,A",1175,1175,999999,MET11101/01/18 09:20:02 SPECI KSAN 011720Z 000...,0,999,0,61,117,117
2,20180101,SAN,2018-01-01 17:20:00+00:00,72290023188,FM-16,"999,9,C,0000,5","00061,5,W,N","000000,A,N,A",1175,1175,999999,MET11101/01/18 09:20:02 SPECI KSAN 011720Z 000...,0,999,0,61,117,117
3,20180101,OMA,2018-01-02 06:26:00+00:00,72550014942,FM-16,"999,9,C,0000,5","00030,5,W,N","000201,A,N,A",-2445,-2895,999999,MET11101/02/18 00:26:02 SPECI KOMA 020626Z 000...,201,999,0,30,-244,-289
4,20180102,PTY,2018-01-02 19:00:00+00:00,78792099999,FM-15,"090,1,N,0015,1","00366,1,9,N",000500199,2501,2301,999999,MET100METAR MPTO 021900Z 09003KT 0500 +TSRA SC...,500,90,15,366,250,230


In [29]:
# Persist the joined flights + weather table as CSV, omitting the row index.
data.to_csv(
    'data/united/united-flights-metar.csv',
    index=False,
)