In [36]:
import pandas as pd
import numpy as np

### Use the Iowa State University dataset that Geoff suggested

In [4]:
# Load the ASOS observations export. low_memory=False makes pandas infer each
# column's dtype from the whole file in one pass instead of chunk by chunk;
# the stray warning tail left in this cell's output looks like the
# mixed-dtype (DtypeWarning) message that chunked inference produces — confirm.
weather = pd.read_csv('../../project_datasets/asos.txt', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
# metadata


# station:
#     three or four character site identifier
# valid:
#     timestamp of the observation
# tmpf:
#     Air Temperature in Fahrenheit, typically @ 2 meters
# dwpf:
#     Dew Point Temperature in Fahrenheit, typically @ 2 meters
# relh:
#     Relative Humidity in %
# drct:
#     Wind Direction in degrees from north
# sknt:
#     Wind Speed in knots 
# p01i:
#     One hour precipitation for the period from the observation time to the time of the previous hourly precipitation reset. This varies slightly by site. Values are in inches. This value may or may not contain frozen precipitation melted by some device on the sensor or estimated by some other means. Unfortunately, we do not know of an authoritative database denoting which station has which sensor.
# alti:
#     Pressure altimeter in inches
# mslp:
#     Sea Level Pressure in millibar
# vsby:
#     Visibility in miles
# gust:
#     Wind Gust in knots
# skyc1:
#     Sky Level 1 Coverage
# skyc2:
#     Sky Level 2 Coverage
# skyc3:
#     Sky Level 3 Coverage
# skyc4:
#     Sky Level 4 Coverage
# skyl1:
#     Sky Level 1 Altitude in feet
# skyl2:
#     Sky Level 2 Altitude in feet
# skyl3:
#     Sky Level 3 Altitude in feet
# skyl4:
#     Sky Level 4 Altitude in feet
# presentwx:
#     Present Weather Codes (space separated)
# metar:
#     unprocessed reported observation in METAR format 

In [5]:
# Number of observation rows per station identifier.
#
# Station legend:
#   PAE: Everett/Paine Field
#   SEA: Seattle-Tacoma Int'l airport
#   RNT: Renton Municipal
#   BFI: Seattle Boeing Field (King County Airport)
weather['station'].value_counts()

PAE    229849
RNT    225614
SEA    224407
BFI    220462
Name: station, dtype: int64

In [18]:
# List the raw column names. Per the output below, every name after 'tmpf'
# carries a leading space (e.g. ' p01i'), so later cells must include that
# space when selecting those columns.
weather.columns.values

array(['station', 'valid', 'lon', 'lat', 'tmpf', ' dwpf', ' relh',
       ' drct', ' sknt', ' p01i', ' alti', ' mslp', ' vsby', ' gust',
       ' skyc1', ' skyc2', ' skyc3', ' skyc4', ' skyl1', ' skyl2',
       ' skyl3', ' skyl4', ' wxcodes', ' metar'], dtype=object)

In [22]:
# Narrow the raw frame to the station id, the observation timestamp and the
# three measurements used below (temperature, one-hour precipitation, wind
# speed). The leading spaces are part of the raw column names.
columns_of_interest = ['station', 'valid', 'tmpf', ' p01i', ' sknt']
weather_final = weather[columns_of_interest]

In [24]:
# Drop observations with a missing reading, give the columns readable names,
# and convert the three measurements to floats.
#
# NOTE: this cell filters on the raw column names and then renames them, so
# re-run the cell that builds `weather_final` before re-running this one.
print("Original dataframe shape:", weather_final.shape)
print("# of rows with missing temperature: ", int((weather_final['tmpf'] == 'M').sum()))

# 'M' is the value this notebook treats as "missing"; keep a row only when
# none of the three measurements equals it.
has_all_readings = (weather_final[['tmpf', ' p01i', ' sknt']] != 'M').all(axis=1)

# Built as one chain of non-mutating steps (filter -> rename -> astype)
# rather than assigning columns on a filtered slice, which is the pattern
# that triggers pandas' SettingWithCopyWarning. The rename uses an explicit
# old -> new mapping so it does not depend on column order.
weather_final = (
    weather_final[has_all_readings]
    .rename(columns={
        'station': 'Station',
        'valid': 'TimeStamp',
        'tmpf': 'Temp(F)',
        ' p01i': 'Precipitation(in)',
        ' sknt': 'WindSpeed(kts)',
    })
    .astype({
        'Temp(F)': 'float64',
        'Precipitation(in)': 'float64',
        'WindSpeed(kts)': 'float64',
    })
)
weather_final.head()

Original dataframe shape: (185537, 5)
# of rows with missing temperature:  0


Unnamed: 0,Station,TimeStamp,Temp(F),Precipitation(in),WindSpeed(kts)
0,PAE,2013-12-31 23:22,37.4,0.0,0.0
1,PAE,2013-12-31 23:27,39.2,0.0,0.0
2,SEA,2013-12-31 23:40,42.8,0.0,0.0
3,RNT,2013-12-31 23:46,39.2,0.0,0.0
4,BFI,2013-12-31 23:53,42.98,0.0,3.0


In [26]:
# Parse the timestamp strings once, store the parsed values back in
# 'TimeStamp', then add one integer column per calendar component so the
# observations can be grouped by hour and by day further down.
parsed_stamps = pd.to_datetime(weather_final['TimeStamp'])
weather_final['TimeStamp'] = parsed_stamps
for column_name, component in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
):
    weather_final[column_name] = getattr(parsed_stamps.dt, component)

In [28]:
# Preview the first five rows, now including the derived calendar columns.
weather_final.head(n=5)

Unnamed: 0,Station,TimeStamp,Temp(F),Precipitation(in),WindSpeed(kts),Year,Month,Day,Hour
0,PAE,2013-12-31 23:22:00,37.4,0.0,0.0,2013,12,31,23
1,PAE,2013-12-31 23:27:00,39.2,0.0,0.0,2013,12,31,23
2,SEA,2013-12-31 23:40:00,42.8,0.0,0.0,2013,12,31,23
3,RNT,2013-12-31 23:46:00,39.2,0.0,0.0,2013,12,31,23
4,BFI,2013-12-31 23:53:00,42.98,0.0,3.0,2013,12,31,23


In [33]:
# Observation rows per year, divided by 4.
# NOTE(review): 4 is presumably the number of stations in the data set, making
# this an average per-station count — confirm.
weather_final['Year'].value_counts().div(4)

2014    10739.75
2017    10637.25
2015    10629.25
2016    10488.25
2018     3887.50
2013        2.25
Name: Year, dtype: float64

In [37]:
# Collapse the individual observations (all stations together, since Station
# is not a grouping key) into one row per calendar hour, averaging each
# measurement.
#
# Aggregations are given by name ('mean') rather than as numpy callables:
# pandas already maps np.mean to its own grouped mean, and newer releases
# warn when the callable is passed.
#
# NOTE(review): per the column metadata, p01i is precipitation accumulated
# since the previous hourly reset, so the mean of several readings within an
# hour is not the hourly total — confirm this is the intended measure.
weather_final_hour = (
    weather_final
    .groupby(['Year', 'Month', 'Day', 'Hour'])
    .agg({
        'Temp(F)': 'mean',
        'Precipitation(in)': 'mean',
        'WindSpeed(kts)': 'mean',
    })
    .reset_index()
)

In [44]:
# Number of hourly rows available for each year.
weather_final_hour['Year'].value_counts()

2016    8776
2014    8743
2017    8735
2015    8732
2018    3206
2013       1
Name: Year, dtype: int64

In [46]:
# Roll the hourly rows up to one row per calendar day:
#   - temperature: mean, max and min of the hourly means
#   - precipitation: sum of the hourly values
#   - CountOfObs: number of hourly rows present for that day
#   - wind speed: mean of the hourly means
#
# Aggregations are given by name rather than as numpy callables, which newer
# pandas releases warn about when passed to agg.
weather_final_day = (
    weather_final_hour
    .groupby(['Year', 'Month', 'Day'])
    .agg({
        'Temp(F)': ['mean', 'max', 'min'],
        'Precipitation(in)': 'sum',
        'Hour': 'size',
        'WindSpeed(kts)': 'mean',
    })
    .reset_index()
)

# The list of temperature aggregations yields two-level column labels; replace
# them with flat names. The order matches the group keys followed by the
# aggregations in the order they are declared above.
weather_final_day.columns = [
    'Year', 'Month', 'Day',
    'Temp(F) Mean', 'Temp(F) High', 'Temp(F) Low',
    'Precipitation(in)', 'CountOfObs', 'WindSpeed(kts)',
]

In [48]:
# Number of daily rows available for each year.
weather_final_day['Year'].value_counts()

2016    366
2017    365
2015    365
2014    365
2018    134
2013      1
Name: Year, dtype: int64

In [49]:
# Report the average number of hourly rows per day, then tabulate how many
# days have each distinct row count.
obs_per_day = weather_final_day['CountOfObs']
print("mean # of observations:", obs_per_day.mean())
weather_final_day.groupby('CountOfObs').size().to_frame()

mean # of observations: 23.93045112781955


Unnamed: 0_level_0,0
CountOfObs,Unnamed: 1_level_1
1,1
16,1
18,1
19,3
20,1
21,5
22,8
23,24
24,1552


We see from the above that most days have 23 or more observations, so we shouldn't be highly concerned with missing data impacting our results.

To reduce the risk of any day with a low number of observations impacting our temperature average (and therefore possibly affecting our analysis results), we'll include only days with 22 or more observations.

In [50]:
# Keep only the days backed by at least 22 hourly rows, the cut-off chosen in
# the note above.
has_enough_obs = weather_final_day['CountOfObs'].ge(22)
weather_final_day = weather_final_day.loc[has_enough_obs]

In [51]:
# Preview the first five daily rows that survived the observation-count filter.
weather_final_day.head(n=5)

Unnamed: 0,Year,Month,Day,Temp(F) Mean,Temp(F) High,Temp(F) Low,Precipitation(in),CountOfObs,WindSpeed(kts)
1,2014,1,1,41.220565,43.73,37.48,0.002,24,1.158135
2,2014,1,2,45.942307,50.27,42.584,0.319591,24,5.513905
3,2014,1,3,41.507676,45.554,35.924,0.078167,24,3.689567
4,2014,1,4,37.729214,44.285,32.495,0.0,24,3.076885
5,2014,1,5,36.407791,44.015,30.41,0.0,24,3.830745
