In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os

%matplotlib inline

In [3]:
# create list of files to concat
all_files = glob.glob('data/2021.csv')

# create empty list
li = []

cols = ['STATION', 'SOURCE', 'DATE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP', 'REM']

dtypes = {'STATION': str, 'SOURCE': str, 'LATITUDE': str, 'LONGITUDE': str, 'ELEVATION': str, 'NAME': object, 'REPORT_TYPE': object, 'CALL_SIGN': object, 'WND': object, 'CIG': object, 'VIS': object, 'TMP': object, 'DEW': object, 'SLP': object, 'REM': object}



for filename in all_files:
    df = pd.read_csv(filename, 
                     parse_dates=['DATE'], 
                     index_col=None, 
                     header=0,
                     usecols=lambda c: c in cols,
                     dtype=dtypes
                    )
    li.append(df)
df = pd.concat(li, axis=0, ignore_index=True)
print('all rows =',df.shape)

df = df[df['CALL_SIGN']!=99999]
df = df[df['CALL_SIGN']!='99999']
print('only call signs =',df.shape)

df_15 = df[df['REPORT_TYPE']=='FM-15']
df_16 = df[df['REPORT_TYPE']=='FM-16']
print('FM_15 reports =',df_15.shape)
print('FM_16 reports =',df_16.shape)

wx_data = pd.concat([df_15, df_16], axis=0, ignore_index=True)
print('combined rows =',wx_data.shape)

all rows = (129058255, 16)
only call signs = (34597991, 16)
FM_15 reports = (31581343, 16)
FM_16 reports = (2388645, 16)
combined rows = (33969988, 16)


In [4]:
unique_stations = wx_data.CALL_SIGN.unique()
unique_stations

array(['KNXP ', 'PATO ', 'PASK ', ..., 'KDYA ', 'KFWB ', 'TXKF '],
      dtype=object)

## check that the data lengths match

## expand cols with additional data and rename

In [9]:
wx_data = pd.concat([wx_data, wx_data.VIS.str.split(',', expand=True)], axis=1)
wx_data = wx_data.rename(
    columns={
        0: 'VIS_METERS', 
        1: 'VIS_Q', 
        2: 'VIS_V', 
        3: 'VIS_QV'})
wx_data.VIS_METERS = pd.to_numeric(wx_data.VIS_METERS)

In [10]:
wx_data = pd.concat([wx_data, wx_data.WND.str.split(',', expand=True)], axis=1)
wx_data = wx_data.rename(
    columns={
        0: 'WND_ANGLE', 
        1: 'WND_QC', 
        2: 'WND_TYPE', 
        3: 'WND_SPEED',
        4: 'WND_SPEED_QC'})
wx_data.WND_ANGLE = pd.to_numeric(wx_data.WND_ANGLE)
wx_data.WND_SPEED = pd.to_numeric(wx_data.WND_SPEED)

In [11]:
wx_data = pd.concat([wx_data, wx_data.CIG.str.split(',', expand=True)], axis=1)
wx_data = wx_data.rename(
    columns={
        0: 'CIG_HEIGHT', 
        1: 'CIG_QC', 
        2: 'CIG_DC', # Determination Code 
        3: 'CIG_CAVOK'})
wx_data.CIG_HEIGHT = pd.to_numeric(wx_data.CIG_HEIGHT)

In [11]:
wx_data = pd.concat([wx_data, wx_data.TMP.str.split(',', expand=True)], axis=1)
wx_data = wx_data.rename(
    columns={
        0: 'TMP_DEG_C', 
        1: 'TMP_QC'})
wx_data.TMP_DEG_C = pd.to_numeric(wx_data.TMP_DEG_C)

In [12]:
wx_data = wx_data.rename(
    columns={
        0: 'TMP_DEG_C', 
        1: 'TMP_QC'})
wx_data.TMP_DEG_C = pd.to_numeric(wx_data.TMP_DEG_C)

In [13]:
wx_data = pd.concat([wx_data, wx_data.DEW.str.split(',', expand=True)], axis=1)
wx_data = wx_data.rename(
    columns={
        0: 'DEW_DEG_C', 
        1: 'DEW_QC'})
wx_data.DEW_DEG_C = pd.to_numeric(wx_data.DEW_DEG_C)

In [14]:
wx_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10180 entries, 0 to 10179
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   STATION       10180 non-null  int64         
 1   DATE          10180 non-null  datetime64[ns]
 2   SOURCE        10180 non-null  int64         
 3   LATITUDE      10180 non-null  float64       
 4   LONGITUDE     10180 non-null  float64       
 5   ELEVATION     10180 non-null  float64       
 6   NAME          10180 non-null  object        
 7   REPORT_TYPE   10180 non-null  object        
 8   CALL_SIGN     10180 non-null  object        
 9   WND           10180 non-null  object        
 10  CIG           10180 non-null  object        
 11  VIS           10180 non-null  object        
 12  TMP           10180 non-null  object        
 13  DEW           10180 non-null  object        
 14  SLP           10180 non-null  object        
 15  REM           10180 non-null  object

In [15]:
wx_data.head()

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,WND,...,WND_SPEED,WND_SPEED_QC,CIG_HEIGHT,CIG_QC,CIG_DC,CIG_CAVOK,TMP_DEG_C,TMP_QC,DEW_DEG_C,DEW_QC
0,72565003017,2021-01-01 00:53:00,7,39.84657,-104.65623,1647.2,"DENVER INTERNATIONAL AIRPORT, CO US",FM-15,KDEN,"340,5,N,0026,5",...,26,5,3353,5,M,N,0,5,-67,5
1,72565003017,2021-01-01 01:53:00,7,39.84657,-104.65623,1647.2,"DENVER INTERNATIONAL AIRPORT, CO US",FM-15,KDEN,"340,5,N,0026,5",...,26,5,3658,5,M,N,-6,5,-61,5
2,72565003017,2021-01-01 02:53:00,7,39.84657,-104.65623,1647.2,"DENVER INTERNATIONAL AIRPORT, CO US",FM-15,KDEN,"999,9,C,0000,5",...,0,5,3658,5,M,N,0,5,-78,5
3,72565003017,2021-01-01 03:53:00,7,39.84657,-104.65623,1647.2,"DENVER INTERNATIONAL AIRPORT, CO US",FM-15,KDEN,"999,9,C,0000,5",...,0,5,3658,5,M,N,0,5,-67,5
4,72565003017,2021-01-01 04:53:00,7,39.84657,-104.65623,1647.2,"DENVER INTERNATIONAL AIRPORT, CO US",FM-15,KDEN,"010,5,N,0015,5",...,15,5,6706,5,M,N,0,5,-89,5


In [16]:
daily_average = wx_data.resample('d', on='DATE').mean().dropna(how='all')

In [17]:
daily_average.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 365 entries, 2021-01-01 to 2021-12-31
Freq: D
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STATION     365 non-null    float64
 1   SOURCE      365 non-null    float64
 2   LATITUDE    365 non-null    float64
 3   LONGITUDE   365 non-null    float64
 4   ELEVATION   365 non-null    float64
 5   VIS_METERS  365 non-null    float64
 6   WND_ANGLE   365 non-null    float64
 7   WND_SPEED   365 non-null    float64
 8   CIG_HEIGHT  365 non-null    float64
 9   TMP_DEG_C   365 non-null    float64
 10  DEW_DEG_C   365 non-null    float64
dtypes: float64(11)
memory usage: 34.2 KB


In [18]:
daily_average.head()

Unnamed: 0_level_0,STATION,SOURCE,LATITUDE,LONGITUDE,ELEVATION,VIS_METERS,WND_ANGLE,WND_SPEED,CIG_HEIGHT,TMP_DEG_C,DEW_DEG_C
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01,72565000000.0,7.0,39.84657,-104.65623,1647.2,16093.0,369.833333,21.291667,11186.083333,-6.916667,-74.541667
2021-01-02,72565000000.0,7.0,39.84657,-104.65623,1647.2,16093.0,227.5,43.958333,21311.916667,0.25,-88.791667
2021-01-03,72565000000.0,7.0,39.84657,-104.65623,1647.2,16093.0,347.791667,42.416667,19311.25,30.291667,-98.541667
2021-01-04,72565000000.0,7.0,39.84657,-104.65623,1647.2,16093.0,221.666667,45.75,21235.75,50.916667,-100.0
2021-01-05,72565000000.0,7.0,39.84657,-104.65623,1647.2,16093.0,217.2,51.44,17469.84,46.48,-76.28
