## Parsing the ASOS data files

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from metpy.io import parse_metar_to_dataframe
import glob

In [2]:
def parse_metar_file(file, wx_subset=True):
    """
    Parse a monthly NCDC ASOS text file into a single METAR dataframe.

    Every row of the file is sliced at character 52; everything after that
    offset is handed to ``parse_metar_to_dataframe``. The year and month needed
    to decode the METAR timestamps are read from the last ten characters of
    the file name, which must look like ``YYYYMM.dat``.

    Input:
    --------
    file = Text file downloaded from NCDC (name must end in ``YYYYMM.dat``)
    wx_subset = Flag to determine whether or not to drop non-current weather obs
                (if True, only returns obs where ``current_wx1`` is not NaN;
                if False, every successfully parsed ob is returned)

    Output:
    --------
    df = Pandas dataframe indexed by ``date_time`` (the column is kept as well)
         and sorted by that index

    Raises:
    --------
    ValueError = if the file name does not end in ``YYYYMM.dat`` or if not a
                 single row of the file could be parsed

    Written by Max Grover (GitHub: @mgrover1)
    """
    # Number of leading characters on each row that precede the METAR text
    header_width = 52

    # Read in the file using Pandas
    df = pd.read_csv(file, header=None)

    # Pull the timestamp from the filename
    timestamp = datetime.strptime(file[-10:], '%Y%m.dat')

    # Iterate over the first column to parse the METARs
    df_list = []
    for raw_row in df.iloc[:, 0]:
        # Slice outside the try block so the error message below can never
        # raise itself; str() guards against rows pandas inferred as non-text.
        metar_text = str(raw_row)[header_width:]
        try:
            df_list.append(
                parse_metar_to_dataframe(metar_text, year=timestamp.year, month=timestamp.month)
            )
        except Exception:
            # Best effort: report the malformed METAR and keep going. Catching
            # Exception (not a bare except) lets Ctrl-C still interrupt the run.
            print('Error with METAR: ', metar_text)

    if not df_list:
        # pd.concat would raise an opaque "No objects to concatenate" here
        raise ValueError('No METARs could be parsed from file: {}'.format(file))

    merged_df = pd.concat(df_list)

    # Drop obs that do not include current weather, but only when requested
    if wx_subset:
        merged_df = merged_df.dropna(subset=['current_wx1'])

    # Change the index to datetime (keeping the date_time column as well)
    merged_df = merged_df.set_index('date_time', drop=False)

    # Return the merged dataset sorted by datetime
    return merged_df.sort_index()

In [3]:
# Gather every .dat file in the data directory, in alphabetical order
files = glob.glob('./data/*.dat')
files.sort()

In [4]:
# Accumulator that will hold one parsed dataframe per input file
merged_datasets = list()

In [16]:
%%time
# Loop through and parse the different datasets
for file in files:
    print(file[-10:-4])
    try:
        merged_datasets.append(parse_metar_file(file))
    except:
        print("Error with :", file[-10:-4])

200001


  if np.greater(np.nanmax(np.abs(value)), max_radians):


200002
200003
200004
200005
200006
200007
200008
200009
200010
Error with METAR:   KDBQ EW001 07/06 A3021  810 96 0 030/08 RMK AO2
200011
Error with METAR:   KDBQ 15N KDBQ 150815Z AUTO 25010KT 8SM CLR M07/M09 A29991010 88 -1400 250/10 RMK AO2 TSNO
200012
200101
200102
200103
200104
200105
Error with METAR:   KDBQ SLP123 T00890072 TSNO
200106
200107
200108
Error with METAR:   KDBQ 091510900 240/13 RMK AO2
200109
200110
200111
Error with METAR:   KDBQ 220649 1200 82 200 170/06 RMK AO2 TSNO $
200112
Error with METAR:   KDBNO
Error with METAR:   KDSM BR OVC006 01/M01 A2990 1100 92 -300 290/11 RMK AO2
200201
Error with METAR:   KDBQ 121300Z
200202
200203
200204
200205
200206
200207
200208
200209
200210
200211
200212
200301
200302
200303
200304
200305
200306
200307
200308
200309
200310
200311
200312
200401
200402
200403
200404
200405
200406
200407
200408
Error with METAR:   RA BR SCT003 OVC021 21/21 A2983 1160 100 2200 040/09 RMK AO2 P0005 TSNO
200409
200410
200411
200412
200501
200502
Error

In [17]:
# Stack the per-file dataframes into a single combined dataframe
clean_df = pd.concat(objs=merged_datasets)

In [18]:
# Preview the first five rows of the combined dataframe
clean_df.head(n=5)

Unnamed: 0_level_0,station_id,latitude,longitude,elevation,date_time,wind_direction,wind_speed,current_wx1,current_wx2,current_wx3,...,cloud_coverage,air_temperature,dew_point_temperature,altimeter,present_weather,past_weather,past_weather2,air_pressure_at_sea_level,eastward_wind,northward_wind
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01 00:00:00,KDBQ,42.4,-90.7,326,2000-01-01 00:00:00,260.0,9.0,BR,,,...,8,-6.0,-8.0,29.92,10,0,0,,8.86327,1.562834
2000-01-01 00:05:00,KDBQ,42.4,-90.7,326,2000-01-01 00:05:00,260.0,12.0,BR,,,...,8,-6.0,-8.0,29.92,10,0,0,,11.817693,2.083778
2000-01-01 00:10:00,KDBQ,42.4,-90.7,326,2000-01-01 00:10:00,270.0,10.0,BR,,,...,8,-6.0,-8.0,29.92,10,0,0,,10.0,1.83697e-15
2000-01-01 00:15:00,KDBQ,42.4,-90.7,326,2000-01-01 00:15:00,270.0,13.0,BR,,,...,8,-6.0,-8.0,29.92,10,0,0,,13.0,2.388061e-15
2000-01-01 00:20:00,KDBQ,42.4,-90.7,326,2000-01-01 00:20:00,260.0,12.0,BR,,,...,8,-6.0,-8.0,29.92,10,0,0,,11.817693,2.083778


In [19]:
# Write the combined dataframe (index included) to a CSV file
clean_df.to_csv(path_or_buf='./data_clean.csv')