# METAR DATA

## Introduction
Metar stands for "Meteorological Terminal Air Report" and is therefore a documentation of meteorological data concerning specific airports.
Data was gathered from the Iowa ASOS network from the following link: __[https://mesonet.agron.iastate.edu/request/download.phtml](https://mesonet.agron.iastate.edu/request/download.phtml)__

For the codes within metar data we used __[https://www.theairlinepilots.com/forumarchive/aviation-weather/metar.pdf](https://www.theairlinepilots.com/forumarchive/aviation-weather/metar.pdf)__ to interpret their meanings

There have been found 193.033 entries with 31 columns.
## Dropped
For this project I dropped columns which are referring to the airport and are therefore equal in all rows.
- station (Abbreviation of airport - redundant)
- lon (longitude - airport longitude is known)
- lat (latitude - airport latitude is known)
- metar (unsure of use)

Additionally, valid was split up into time and date

## Missing and Trace values
Missing values are marked with M and trace values with T.
I replace missing and trace values (there were so few that this was deemed okay) values with pd.NaN to better use the functions provided by pandas/numpy

In [1]:
import pandas as pd
import re
import numpy as np
import os

from main.utils.data_manage_utils import read_table_from_subfolder

# Variables
ROOT_PATH = os.path.abspath("../../")
INPUT_FOLDER = os.path.join(ROOT_PATH, "data/input")
OUTPUT_FOLDER = os.path.join(ROOT_PATH, "data/preparation/prepped_files")

In [2]:
read_table_from_subfolder(os.path.join(INPUT_FOLDER,"data_raw/METAR_US/*_KATL_*.txt"))

  df = pd.read_table(filename, delimiter=delim)


Unnamed: 0,station,valid,lon,lat,tmpf,dwpf,relh,drct,sknt,p01i,...,skyl4,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar
0,ATL,2016-01-01 00:52,-84.4418,33.6301,51.98,42.08,68.86,320.00,10.00,0.00,...,25000.00,M,M,M,M,M,M,M,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...
1,ATL,2016-01-01 01:52,-84.4418,33.6301,50.00,41.00,71.07,330.00,7.00,0.00,...,M,M,M,M,M,M,M,M,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...
2,ATL,2016-01-01 02:52,-84.4418,33.6301,48.92,41.00,73.99,310.00,12.00,0.00,...,M,M,M,M,M,M,M,M,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...
3,ATL,2016-01-01 03:52,-84.4418,33.6301,46.94,39.92,76.43,320.00,13.00,0.00,...,M,M,M,M,M,M,M,M,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...
4,ATL,2016-01-01 04:52,-84.4418,33.6301,46.04,39.02,76.35,320.00,10.00,0.00,...,M,M,M,M,M,M,M,M,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193028,ATL,2017-12-30 23:40,-84.4418,33.6301,M,M,M,350.00,8.00,M,...,M,M,M,M,M,M,M,M,M,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...
193029,ATL,2017-12-30 23:45,-84.4418,33.6301,M,M,M,360.00,10.00,M,...,M,M,M,M,M,M,M,M,M,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...
193030,ATL,2017-12-30 23:50,-84.4418,33.6301,M,M,M,330.00,10.00,M,...,M,M,M,M,M,M,M,M,M,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...
193031,ATL,2017-12-30 23:52,-84.4418,33.6301,44.10,30.00,57.37,340.00,9.00,0.00,...,M,M,M,M,M,M,M,M,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...


In [3]:
atl_weather_df = read_table_from_subfolder(os.path.join(INPUT_FOLDER, "data_raw/METAR_US/*_KATL_*.txt"))
date = atl_weather_df["valid"].apply(lambda x: x.split()[0])
time = atl_weather_df["valid"].apply(lambda x: x.split()[1])
atl_weather_df = atl_weather_df.drop(labels=["station","lon", "lat","valid"], axis="columns")
atl_weather_df["date"] = date
atl_weather_df["time"] = time
atl_weather_df = atl_weather_df.mask(atl_weather_df == "M")
#atl_weather_df["valid"] = atl_weather_df["valid"].apply(lambda x: re.match('\d\d\d\d-\d\d-\d\d',x).group(0))
atl_weather_df

  df = pd.read_table(filename, delimiter=delim)


Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,date,time
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...,2016-01-01,00:52
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...,2016-01-01,01:52
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...,2016-01-01,02:52
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...,2016-01-01,03:52
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...,2016-01-01,04:52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193028,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...,2017-12-30,23:40
193029,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...,2017-12-30,23:45
193030,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...,2017-12-30,23:50
193031,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...,2017-12-30,23:52


In [9]:
from main.utils import data_manage_utils
import importlib
importlib.reload(data_manage_utils)
data_manage_utils.col_stats_to_table(atl_weather_df).style.to_latex()

'\\begin{tabular}{lllllll}\n & Name & Type & # Null & # Unique & Doubled & range \\\\\n0 & tmpf & object & 172637 & 182 & 20214 & char(5.0-5.0) \\\\\n1 & dwpf & object & 172639 & 169 & 20225 & char(4.0-5.0) \\\\\n2 & relh & object & 172669 & 3088 & 17276 & char(4.0-6.0) \\\\\n3 & drct & object & 5400 & 38 & 187595 & char(4.0-6.0) \\\\\n4 & sknt & object & 4879 & 37 & 188117 & char(4.0-5.0) \\\\\n5 & p01i & object & 166716 & 116 & 26201 & char(1.0-4.0) \\\\\n6 & alti & object & 31 & 216 & 192786 & char(5.0-5.0) \\\\\n7 & mslp & object & 175544 & 427 & 17062 & char(6.0-7.0) \\\\\n8 & vsby & float64 & 0 & 21 & 193012 & [0.12;10.0] \\\\\n9 & gust & object & 187790 & 39 & 5204 & char(5.0-5.0) \\\\\n10 & skyc1 & object & 7644 & 7 & 185382 & char(3.0-3.0) \\\\\n11 & skyc2 & object & 146844 & 5 & 46184 & char(3.0-3.0) \\\\\n12 & skyc3 & object & 172045 & 5 & 20983 & char(3.0-3.0) \\\\\n13 & skyc4 & object & 191156 & 4 & 1873 & char(3.0-3.0) \\\\\n14 & skyl1 & object & 96692 & 84 & 96257 & char

In [4]:
print("Counting 'T' values per column")
for col in atl_weather_df.columns:
    val_cnt = atl_weather_df[col].value_counts()
    if 'T' in val_cnt.index:
        print(col + ":\t"+ str(val_cnt.loc["T"]))
    else:
        print(col + ":\t0")

Counting 'T' values per column
tmpf:	0
dwpf:	0
relh:	0
drct:	0
sknt:	0
p01i:	1854
alti:	0
mslp:	0
vsby:	0
gust:	0
skyc1:	0
skyc2:	0
skyc3:	0
skyc4:	0
skyl1:	0
skyl2:	0
skyl3:	0
skyl4:	0
wxcodes:	0
ice_accretion_1hr:	11
ice_accretion_3hr:	2
ice_accretion_6hr:	2
peak_wind_gust:	0
peak_wind_drct:	0
peak_wind_time:	0
feel:	0
metar:	0
date:	0
time:	0


In [5]:
atl_weather_df = atl_weather_df.mask(atl_weather_df == "T")
atl_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193033 entries, 0 to 193032
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tmpf               20396 non-null   object 
 1   dwpf               20394 non-null   object 
 2   relh               20364 non-null   object 
 3   drct               187633 non-null  object 
 4   sknt               188154 non-null  object 
 5   p01i               24463 non-null   object 
 6   alti               193002 non-null  object 
 7   mslp               17489 non-null   object 
 8   vsby               193033 non-null  float64
 9   gust               5243 non-null    object 
 10  skyc1              185389 non-null  object 
 11  skyc2              46189 non-null   object 
 12  skyc3              20988 non-null   object 
 13  skyc4              1877 non-null    object 
 14  skyl1              96341 non-null   object 
 15  skyl2              46189 non-null   object 
 16  sk

In [6]:
#atl_weather_df = num_df.join(str_df)
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,date,time
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...,2016-01-01,00:52
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...,2016-01-01,01:52
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...,2016-01-01,02:52
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...,2016-01-01,03:52
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...,2016-01-01,04:52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193028,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...,2017-12-30,23:40
193029,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...,2017-12-30,23:45
193030,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...,2017-12-30,23:50
193031,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...,2017-12-30,23:52


In [7]:
def print_inv_nan_perc(df):
    print("Counting non-null percentage per column")
    for col in df.columns:
        perc = round(1 - (df[col].isna().sum() / df.__len__()),2)
        print(col + ":\t" + str(perc))
print_inv_nan_perc(atl_weather_df)

Counting non-null percentage per column
tmpf:	0.11
dwpf:	0.11
relh:	0.11
drct:	0.97
sknt:	0.97
p01i:	0.13
alti:	1.0
mslp:	0.09
vsby:	1.0
gust:	0.03
skyc1:	0.96
skyc2:	0.24
skyc3:	0.11
skyc4:	0.01
skyl1:	0.5
skyl2:	0.24
skyl3:	0.11
skyl4:	0.01
wxcodes:	0.09
ice_accretion_1hr:	0.0
ice_accretion_3hr:	0.0
ice_accretion_6hr:	0.0
peak_wind_gust:	0.0
peak_wind_drct:	0.0
peak_wind_time:	0.0
feel:	0.11
metar:	0.99
date:	1.0
time:	1.0


In [8]:
#Dropping all columns with missing metar data
atl_weather_df = atl_weather_df.drop(atl_weather_df[atl_weather_df["metar"].isnull()].index)
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,date,time
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...,2016-01-01,00:52
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...,2016-01-01,01:52
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...,2016-01-01,02:52
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...,2016-01-01,03:52
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...,2016-01-01,04:52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193028,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...,2017-12-30,23:40
193029,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...,2017-12-30,23:45
193030,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...,2017-12-30,23:50
193031,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...,2017-12-30,23:52


In [9]:
metar_dict = {
    'TS' : 'Thunderstorm',
    'RA' : 'Rain',
    'DZ' : 'Drizzle',
    'SN' : 'Snow',
    'SG' : 'Snow Grains',
    'IC' : 'Ice Pellets',
    'GR' : 'Hail',
    'GS' : 'Small Hail',
    'FG' : 'FG',
    'VA' : 'Volcanic Ashe',
    'BR' : 'Mist',
    'HZ' : 'Haze',
    'DU' : 'Dust',
    'FU' : 'Smoke',
    'SA' : 'Sand',
    'PY' : 'Spray'
}

In [10]:
def find_event(metar: str):
    ret = []
    for event in [ele for ele in list(metar_dict.keys()) if(ele in metar)]:
        pos = str.find(metar, event)
        if metar[pos - 1] == "-" or metar[pos - 1] == "+":
            ret.append(metar[pos-1] + event)
        else:
            ret.append(event)
    if len(ret) > 0:
        return ret
    else:
        return np.nan

atl_weather_df["events"] = atl_weather_df["metar"].apply(lambda x: find_event(x))
print("Length of events: " + str(len(atl_weather_df[~atl_weather_df["events"].isna()])) + ". From " +str(len(atl_weather_df)) + " total entries.")
atl_weather_df["events"][~atl_weather_df["events"].isna()].value_counts()

Length of events: 27140. From 191331 total entries.


[SN]                     6209
[-RA]                    4123
[BR]                     3683
[-RA, BR]                2154
[-DZ, BR]                1296
                         ... 
[-TS, RA, BR]               1
[+TS, RA, SN, IC, FG]       1
[RA, FU]                    1
[SN, FG]                    1
[DZ, FG, BR]                1
Name: events, Length: 122, dtype: int64

In [11]:
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,date,time,events
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...,2016-01-01,00:52,
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...,2016-01-01,01:52,
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...,2016-01-01,02:52,
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...,2016-01-01,03:52,
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...,2016-01-01,04:52,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193028,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...,2017-12-30,23:40,
193029,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...,2017-12-30,23:45,
193030,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...,2017-12-30,23:50,
193031,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...,2017-12-30,23:52,


In [12]:
print_inv_nan_perc(atl_weather_df)

Counting non-null percentage per column
tmpf:	0.11
dwpf:	0.11
relh:	0.11
drct:	0.97
sknt:	0.97
p01i:	0.13
alti:	1.0
mslp:	0.09
vsby:	1.0
gust:	0.03
skyc1:	0.96
skyc2:	0.24
skyc3:	0.11
skyc4:	0.01
skyl1:	0.5
skyl2:	0.24
skyl3:	0.11
skyl4:	0.01
wxcodes:	0.09
ice_accretion_1hr:	0.0
ice_accretion_3hr:	0.0
ice_accretion_6hr:	0.0
peak_wind_gust:	0.0
peak_wind_drct:	0.0
peak_wind_time:	0.0
feel:	0.11
metar:	1.0
date:	1.0
time:	1.0
events:	0.14


In [13]:
atl_weather_df = atl_weather_df.sort_values(by=["date","time"]).reset_index(drop=True)
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,date,time,events
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,51.98,KATL 010052Z 32010KT 10SM FEW100 SCT150 BKN200...,2016-01-01,00:52,
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,46.72,KATL 010152Z 33007KT 10SM SCT150 BKN200 BKN250...,2016-01-01,01:52,
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,43.54,KATL 010252Z 31012KT 10SM FEW150 BKN200 BKN250...,2016-01-01,02:52,
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,40.71,KATL 010352Z 32013KT 10SM FEW150 SCT200 BKN250...,2016-01-01,03:52,
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,40.57,KATL 010452Z 32010KT 10SM FEW150 BKN200 BKN250...,2016-01-01,04:52,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191326,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,KATL 302340Z AUTO 35008KT 10SM 07/M01 A3008 RM...,2017-12-30,23:40,
191327,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,KATL 302345Z AUTO 36010KT 10SM 07/M01 A3008 RM...,2017-12-30,23:45,
191328,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,KATL 302350Z AUTO 33010KT 10SM 07/M01 A3009 RM...,2017-12-30,23:50,
191329,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,38.56,KATL 302352Z 34009KT 10SM FEW200 BKN250 07/M01...,2017-12-30,23:52,


In [14]:
atl_weather_df["time"] = atl_weather_df["time"].apply(lambda x: x.split(":")[0])
atl_weather_df = atl_weather_df.drop(["metar"], axis="columns")
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,date,time,events
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,,,,,,,51.98,2016-01-01,00,
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,,,,,,,46.72,2016-01-01,01,
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,,,,,,,43.54,2016-01-01,02,
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,,,,,,,40.71,2016-01-01,03,
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,,,,,,,40.57,2016-01-01,04,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191326,,,,350.00,8.00,,30.08,,10.0,,...,,,,,,,,2017-12-30,23,
191327,,,,360.00,10.00,,30.08,,10.0,,...,,,,,,,,2017-12-30,23,
191328,,,,330.00,10.00,,30.09,,10.0,,...,,,,,,,,2017-12-30,23,
191329,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,,,,,,,38.56,2017-12-30,23,


In [15]:
def list_to_string(lst):
    ret = ""
    if type(lst) == list:
        for cnt,x in enumerate(lst):
            ret = ret + str(x)
            if cnt != len(lst) - 1:
                ret = ret + " "
        return ret
    else:
        return lst
atl_weather_df["events"] = atl_weather_df["events"].apply(lambda x: list_to_string(x))

In [16]:
one_hot = pd.get_dummies(atl_weather_df["events"].str.split(expand=True)).groupby(lambda x: x.split('_')[-1], axis = 1).sum()

In [17]:
idx = atl_weather_df[~pd.isna(atl_weather_df["events"])]["events"].index
atl_weather_df[~pd.isna(atl_weather_df["events"])]["events"]

181       -RA
182       -RA
183       -RA
184       -RA
185       -RA
         ... 
191206     FU
191207     FU
191208     FU
191209     FU
191210     FU
Name: events, Length: 27140, dtype: object

In [18]:
one_hot.loc[idx]

Unnamed: 0,+RA,+TS,-BR,-DZ,-RA,-SG,-SN,-TS,BR,DZ,FG,FU,GR,GS,HZ,IC,RA,SN,TS
181,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
182,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
184,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191206,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
191207,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
191208,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
191209,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [19]:
for col in one_hot.columns:
    if col[0] == "+":
        one_hot[col] = one_hot[col].apply(lambda x: x*3)
    elif col[0] == "-":
        pass
    else:
        one_hot[col] = one_hot[col].apply(lambda x: x*2)
one_hot.loc[idx]

Unnamed: 0,+RA,+TS,-BR,-DZ,-RA,-SG,-SN,-TS,BR,DZ,FG,FU,GR,GS,HZ,IC,RA,SN,TS
181,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
182,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
183,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
184,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191206,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
191207,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
191208,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
191209,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0


In [20]:
one_hot = one_hot.groupby(lambda x: re.sub("[^a-zA-Z]+","",x), axis=1).sum()
one_hot = one_hot.add_prefix('EVENT_')
one_hot.loc[idx]

Unnamed: 0,EVENT_BR,EVENT_DZ,EVENT_FG,EVENT_FU,EVENT_GR,EVENT_GS,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SG,EVENT_SN,EVENT_TS
181,0,0,0,0,0,0,0,0,1,0,0,0
182,0,0,0,0,0,0,0,0,1,0,0,0
183,0,0,0,0,0,0,0,0,1,0,0,0
184,0,0,0,0,0,0,0,0,1,0,0,0
185,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
191206,0,0,0,2,0,0,0,0,0,0,0,0
191207,0,0,0,2,0,0,0,0,0,0,0,0
191208,0,0,0,2,0,0,0,0,0,0,0,0
191209,0,0,0,2,0,0,0,0,0,0,0,0


In [21]:
atl_weather_df = atl_weather_df.join(one_hot)
atl_weather_df

Unnamed: 0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,...,EVENT_FG,EVENT_FU,EVENT_GR,EVENT_GS,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SG,EVENT_SN,EVENT_TS
0,51.98,42.08,68.86,320.00,10.00,0.00,30.19,1022.50,10.0,,...,0,0,0,0,0,0,0,0,0,0
1,50.00,41.00,71.07,330.00,7.00,0.00,30.18,1022.20,10.0,,...,0,0,0,0,0,0,0,0,0,0
2,48.92,41.00,73.99,310.00,12.00,0.00,30.18,1022.00,10.0,,...,0,0,0,0,0,0,0,0,0,0
3,46.94,39.92,76.43,320.00,13.00,0.00,30.17,1021.90,10.0,,...,0,0,0,0,0,0,0,0,0,0
4,46.04,39.02,76.35,320.00,10.00,0.00,30.18,1022.00,10.0,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191326,,,,350.00,8.00,,30.08,,10.0,,...,0,0,0,0,0,0,0,0,0,0
191327,,,,360.00,10.00,,30.08,,10.0,,...,0,0,0,0,0,0,0,0,0,0
191328,,,,330.00,10.00,,30.09,,10.0,,...,0,0,0,0,0,0,0,0,0,0
191329,44.10,30.00,57.37,340.00,9.00,0.00,30.09,1019.50,10.0,,...,0,0,0,0,0,0,0,0,0,0


In [22]:
atl_weather_df = atl_weather_df.drop(["events"], axis="columns")

In [23]:
str_cols=["skyc1","skyc2","skyc3","skyc4","wxcodes","peak_wind_time","time","date"]
num_cols =  []
for col in atl_weather_df.columns:
    if col not in str_cols and col not in list(one_hot.columns) :
        num_cols.append(col)
        atl_weather_df[col] = atl_weather_df[col].astype('float64')
atl_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191331 entries, 0 to 191330
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tmpf               20396 non-null   float64
 1   dwpf               20394 non-null   float64
 2   relh               20364 non-null   float64
 3   drct               185963 non-null  float64
 4   sknt               186484 non-null  float64
 5   p01i               24455 non-null   float64
 6   alti               191300 non-null  float64
 7   mslp               17489 non-null   float64
 8   vsby               191331 non-null  float64
 9   gust               5231 non-null    float64
 10  skyc1              183723 non-null  object 
 11  skyc2              45906 non-null   object 
 12  skyc3              20899 non-null   object 
 13  skyc4              1877 non-null    object 
 14  skyl1              95651 non-null   float64
 15  skyl2              45906 non-null   float64
 16  sk

In [24]:
str_df = atl_weather_df[str_cols].groupby(["date","time"]).agg(lambda x: np.nan if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0])
str_df

Unnamed: 0_level_0,Unnamed: 1_level_0,skyc1,skyc2,skyc3,skyc4,wxcodes,peak_wind_time
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-01,00,FEW,SCT,BKN,BKN,,
2016-01-01,01,SCT,BKN,BKN,,,
2016-01-01,02,FEW,BKN,BKN,,,
2016-01-01,03,FEW,SCT,BKN,,,
2016-01-01,04,FEW,BKN,BKN,,,
...,...,...,...,...,...,...,...
2017-12-30,19,FEW,FEW,,,,
2017-12-30,20,FEW,,,,,
2017-12-30,21,BKN,,,,,
2017-12-30,22,CLR,BKN,,,,


In [25]:
num_df = atl_weather_df[num_cols + ["date","time"]].groupby(["date","time"]).median()
num_df

Unnamed: 0_level_0,Unnamed: 1_level_0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,gust,skyl1,skyl2,skyl3,skyl4,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,feel
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01,00,51.98,42.08,68.86,320.0,10.0,0.0,30.19,1022.5,10.0,,10000.0,15000.0,20000.0,25000.0,,,,,,51.98
2016-01-01,01,50.00,41.00,71.07,330.0,7.0,0.0,30.18,1022.2,10.0,,15000.0,20000.0,25000.0,,,,,,,46.72
2016-01-01,02,48.92,41.00,73.99,310.0,12.0,0.0,30.18,1022.0,10.0,,15000.0,20000.0,25000.0,,,,,,,43.54
2016-01-01,03,46.94,39.92,76.43,320.0,13.0,0.0,30.17,1021.9,10.0,,15000.0,20000.0,25000.0,,,,,,,40.71
2016-01-01,04,46.04,39.02,76.35,320.0,10.0,0.0,30.18,1022.0,10.0,,15000.0,20000.0,25000.0,,,,,,,40.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-30,19,50.00,32.00,49.81,290.0,12.0,0.0,30.07,1018.6,10.0,20.5,3999.0,25000.0,,,,,,,,44.63
2017-12-30,20,51.10,32.00,47.81,300.0,11.0,0.0,30.06,1018.6,10.0,,3999.0,,,,,,,,,51.10
2017-12-30,21,48.00,30.90,51.34,300.0,11.0,0.0,30.07,1018.9,10.0,16.0,25000.0,,,,,,,,,43.03
2017-12-30,22,46.00,30.90,55.36,290.0,9.0,0.0,30.07,1018.9,10.0,,20000.0,25000.0,,,,,,,,40.92


In [26]:
oh_df = atl_weather_df[["date","time"] + list(one_hot.columns)].groupby(["date","time"]).max()
oh_df

Unnamed: 0_level_0,Unnamed: 1_level_0,EVENT_BR,EVENT_DZ,EVENT_FG,EVENT_FU,EVENT_GR,EVENT_GS,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SG,EVENT_SN,EVENT_TS
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-01,00,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,01,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,02,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,03,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,04,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-30,19,0,0,0,0,0,0,0,0,0,0,0,0
2017-12-30,20,0,0,0,0,0,0,0,0,0,0,0,0
2017-12-30,21,0,0,0,0,0,0,0,0,0,0,0,0
2017-12-30,22,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
atl_weather_df = num_df.join(str_df)
atl_weather_df = atl_weather_df.join(oh_df)
print_inv_nan_perc(atl_weather_df)
atl_weather_df = atl_weather_df.drop(["peak_wind_time","peak_wind_gust","peak_wind_drct","peak_wind_time","ice_accretion_1hr", "ice_accretion_3hr","ice_accretion_6hr", "skyl1", "skyl2", "skyl3", "skyl4", "skyc1", "skyc2", "skyc3", "skyc4", "gust"], axis="columns")
atl_weather_df

Counting non-null percentage per column
tmpf:	1.0
dwpf:	1.0
relh:	1.0
drct:	1.0
sknt:	1.0
p01i:	0.95
alti:	1.0
mslp:	1.0
vsby:	1.0
gust:	0.16
skyl1:	0.87
skyl2:	0.61
skyl3:	0.36
skyl4:	0.09
ice_accretion_1hr:	0.0
ice_accretion_3hr:	0.0
ice_accretion_6hr:	0.0
peak_wind_gust:	0.03
peak_wind_drct:	0.03
feel:	1.0
skyc1:	1.0
skyc2:	0.61
skyc3:	0.36
skyc4:	0.09
wxcodes:	0.13
peak_wind_time:	0.03
EVENT_BR:	1.0
EVENT_DZ:	1.0
EVENT_FG:	1.0
EVENT_FU:	1.0
EVENT_GR:	1.0
EVENT_GS:	1.0
EVENT_HZ:	1.0
EVENT_IC:	1.0
EVENT_RA:	1.0
EVENT_SG:	1.0
EVENT_SN:	1.0
EVENT_TS:	1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,feel,...,EVENT_FG,EVENT_FU,EVENT_GR,EVENT_GS,EVENT_HZ,EVENT_IC,EVENT_RA,EVENT_SG,EVENT_SN,EVENT_TS
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016-01-01,00,51.98,42.08,68.86,320.0,10.0,0.0,30.19,1022.5,10.0,51.98,...,0,0,0,0,0,0,0,0,0,0
2016-01-01,01,50.00,41.00,71.07,330.0,7.0,0.0,30.18,1022.2,10.0,46.72,...,0,0,0,0,0,0,0,0,0,0
2016-01-01,02,48.92,41.00,73.99,310.0,12.0,0.0,30.18,1022.0,10.0,43.54,...,0,0,0,0,0,0,0,0,0,0
2016-01-01,03,46.94,39.92,76.43,320.0,13.0,0.0,30.17,1021.9,10.0,40.71,...,0,0,0,0,0,0,0,0,0,0
2016-01-01,04,46.04,39.02,76.35,320.0,10.0,0.0,30.18,1022.0,10.0,40.57,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-30,19,50.00,32.00,49.81,290.0,12.0,0.0,30.07,1018.6,10.0,44.63,...,0,0,0,0,0,0,0,0,0,0
2017-12-30,20,51.10,32.00,47.81,300.0,11.0,0.0,30.06,1018.6,10.0,51.10,...,0,0,0,0,0,0,0,0,0,0
2017-12-30,21,48.00,30.90,51.34,300.0,11.0,0.0,30.07,1018.9,10.0,43.03,...,0,0,0,0,0,0,0,0,0,0
2017-12-30,22,46.00,30.90,55.36,290.0,9.0,0.0,30.07,1018.9,10.0,40.92,...,0,0,0,0,0,0,0,0,0,0


In [28]:
atl_weather_df.to_pickle(os.path.join(OUTPUT_FOLDER, "05_metar.pkl"))