In [163]:
from os.path import join, basename, splitext
from glob import glob
from dask import dataframe as dd
from matplotlib import rcParams
import pandas as pd
import dask
from collections import Counter
import pickle
import numpy as np
from datetime import datetime


from deep_aqi import ROOT


# Use the fully-qualified option names. The abbreviated patterns
# 'max_columns' / 'max_rows' also match 'styler.render.max_columns' /
# 'styler.render.max_rows' on pandas >= 1.4 and raise an OptionError there.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 25)

In [164]:
# Stage directories of the project data tree: <ROOT>/data/<stage>.
PROCESSED_DATA, INTERIM_DATA, RAW_DATA = (
    join(ROOT, 'data', stage) for stage in ('processed', 'interim', 'raw')
)

In [200]:
# Open the combined weather table from the interim data directory as a dask DataFrame.
weather_path = join(INTERIM_DATA, 'combined-WEATHER.parquet')
df = dd.read_parquet(path=weather_path)

In [199]:
# Peek at the first five rows to check the columns and their formats.
df.head(n=5)



Unnamed: 0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum
0,Nebraska_Douglas_19.0,2014-04-03 23:00:00,149.0,150.660964,2.222222,96780.0,95.0
1,Tennessee_Sevier_101.0,2015-02-21 11:00:00,183.0,94.090202,2.222222,87100.0,84.0


In [10]:
# Summary statistics of every numeric column over the whole data set.
(
    df
    .describe()
    .compute()
)

Unnamed: 0,WindDir,WindSpeed,Temperature,Pressure,RelHum
count,4889150.0,4889150.0,4889150.0,4889150.0,4889150.0
mean,194.0485,9.013095,12.41722,95880.95,64.07332
std,100.3657,7.40707,11.37627,6921.313,22.48
min,0.0,0.0,-50.0,0.0,0.0
25%,118.4,3.693624,4.111111,93080.0,48.0
50%,198.0,6.998445,13.33333,99060.0,67.0
75%,282.0,12.05288,21.11111,100400.0,82.4
max,360.0,150.661,57.22222,159695.0,100.0


### Wind Speed [m/s]

In [26]:
# Summary statistics restricted to rows with a wind speed above 70.
(
    df[df['WindSpeed'] > 70]
    .describe()
    .compute()
)

Unnamed: 0,WindDir,WindSpeed,Temperature,Pressure,RelHum
count,1118.0,1118.0,1118.0,1118.0,1118.0
mean,68.559571,83.637745,6.271368,96977.271914,65.03059
std,75.083276,5.890211,6.123105,6072.597902,20.744798
min,0.0,70.178849,-12.888889,77110.0,12.0
25%,33.0,84.37014,2.611111,98880.0,53.5
50%,33.0,84.37014,6.388889,99815.0,73.5
75%,36.0,84.564541,11.111111,100200.0,85.0
max,360.0,150.660964,25.833333,101700.0,99.6


In [25]:
# Number of rows with a wind speed above 70, per site.
(
    df.loc[df['WindSpeed'] > 70, 'SiteCode']
    .value_counts()
    .compute()
)

Alabama_Jefferson_23.0        888
Tennessee_Sevier_101.0        129
Wyoming_Sublette_99.0          35
Wyoming_Sweetwater_200.0       11
Wyoming_Sweetwater_300.0       10
Maryland_Garrett_2.0           10
Nevada_Clark_540.0              9
Nebraska_Douglas_19.0           7
North Dakota_Burke_4.0          6
Ohio_Preble_1001.0              5
New Mexico_Bernalillo_23.0      3
New York_Monroe_1007.0          2
Colorado_Rio Blanco_6.0         2
Wisconsin_Dodge_1.0             1
Name: SiteCode, dtype: int64

In [16]:
# Yearly distribution of the high-wind rows (> 70) at Alabama_Jefferson_23.0.
(
    df.loc[
        (df['SiteCode'] == 'Alabama_Jefferson_23.0') & (df['WindSpeed'] > 70),
        'LocalDate',
    ]
    .dt.year
    .value_counts()
    .compute()
)

2015    879
2014      8
2017      1
Name: LocalDate, dtype: int64

In [17]:
# Monthly distribution of the 2015 high-wind rows (> 70) at Alabama_Jefferson_23.0.
(
    df.loc[
        (df['SiteCode'] == 'Alabama_Jefferson_23.0')
        & (df['WindSpeed'] > 70)
        & (df['LocalDate'].dt.year == 2015),
        'LocalDate',
    ]
    .dt.month
    .value_counts()
    .compute()
)

1    647
2    232
Name: LocalDate, dtype: int64

Reason: especially severe weather conditions in central Alabama in 2015:
https://www.weather.gov/bmx/climo_2015review#part5

In [28]:
# Number of rows with a wind speed above the 92.5 cut-off, per site.
(
    df.loc[df['WindSpeed'] > 92.5, 'SiteCode']
    .value_counts()
    .compute()
)

Tennessee_Sevier_101.0    13
Nebraska_Douglas_19.0      5
Maryland_Garrett_2.0       5
Ohio_Preble_1001.0         2
Wisconsin_Dodge_1.0        1
New York_Monroe_1007.0     1
Nevada_Clark_540.0         1
Name: SiteCode, dtype: int64

Exclude all wind speed values exceeding 92.5 m/s (the lower bound of an F4 tornado on the Fujita scale); this removes 28 rows.

### Pressure [Pa]

In [31]:
# Summary statistics restricted to rows with a pressure below 80000.
(
    df[df['Pressure'] < 80000]
    .describe()
    .compute()
)

Unnamed: 0,WindDir,WindSpeed,Temperature,Pressure,RelHum
count,280525.0,280525.0,280525.0,280525.0,280525.0
mean,207.002035,14.861334,4.233203,78070.165155,59.773673
std,97.415866,11.015743,10.951103,1867.500179,24.460226
min,0.0,0.0,-36.388889,0.0,0.0
25%,129.0,6.804044,-3.333333,77680.0,40.0
50%,229.0,12.052877,3.611111,78795.0,63.0
75%,283.0,21.189736,12.222222,79370.0,81.0
max,360.0,86.508554,33.888889,79990.0,100.0


Very low pressure in Wyoming, not caused by an error: http://www.wrds.uwyo.edu/sco/climateatlas/pressure.html
The values may be a little too low, but I believe they are not completely wrong.

In [54]:
# List the individual rows with a pressure below 72500.
(
    df[df['Pressure'] < 72500]
    .compute()
)

Unnamed: 0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum
59090,North Dakota_Cass_1004.0,2014-10-31 10:00:00,131.0,7.776050,-2.222222,24130.0,76.0
289123,Wyoming_Fremont_99.0,2010-12-29 06:00:00,235.6,42.962675,-6.666667,72470.0,91.0
289125,Wyoming_Fremont_99.0,2010-12-29 16:00:00,225.2,22.939347,-5.000000,72230.0,94.0
289126,Wyoming_Fremont_99.0,2010-12-30 01:00:00,5.6,23.716952,-9.444444,72400.0,91.0
404217,North Carolina_Mecklenburg_41.0,2011-09-03 02:00:00,313.0,0.972006,21.666667,0.0,100.0
404218,North Carolina_Mecklenburg_41.0,2011-09-03 03:00:00,308.0,0.583204,21.111111,0.0,100.0
524331,District Of Columbia_District of Columbia_43.0,2015-06-10 04:00:00,331.0,8.553655,12.722222,52875.0,65.3
289211,Wyoming_Fremont_99.0,2010-12-29 22:00:00,296.4,4.471229,-5.000000,72330.0,96.0
401324,New Mexico_Bernalillo_23.0,2011-09-26 17:00:00,271.2,11.080871,9.722222,72400.0,6.0
524557,District Of Columbia_District of Columbia_43.0,2015-06-20 00:00:00,291.0,7.970451,27.416667,50415.0,75.0


Drop pressure values lower than 72500 Pa (725 hPa); they are most likely measurement errors.

### Temperature [C]

In [166]:
# List the individual rows with a temperature below -35.
(
    df[df['Temperature'] < -35]
    .compute()
)

Unnamed: 0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum
50500,Nevada_Clark_2002.0,2014-11-17 13:00:00,59.0,2.916019,-37.222222,95800.0,42.0
166895,Wyoming_Sublette_99.0,2017-01-06 05:00:00,48.0,1.166407,-35.555556,77940.0,63.0
166896,Wyoming_Sublette_99.0,2017-01-06 06:00:00,263.0,0.777605,-35.277778,77970.0,63.0
551523,Michigan_Schoolcraft_1.0,2015-02-20 04:00:00,50.0,3.304821,-36.111111,99900.0,72.0
166914,Wyoming_Sublette_99.0,2017-01-06 07:00:00,211.0,2.721617,-36.388889,78030.0,62.0
168015,Wyoming_Sweetwater_200.0,2017-01-06 02:00:00,123.0,1.166407,-35.555556,79020.0,69.0
551701,Michigan_Schoolcraft_1.0,2015-02-20 06:00:00,58.0,3.304821,-36.111111,99700.0,72.0
50347,Nevada_Clark_2002.0,2014-11-13 13:00:00,48.0,3.304821,-50.0,94500.0,8.0
552743,Michigan_Schoolcraft_1.0,2015-02-20 05:00:00,54.0,3.304821,-36.111111,99700.0,71.0
166786,Wyoming_Sublette_99.0,2017-01-06 02:00:00,270.0,2.332815,-35.277778,77880.0,64.0


In [169]:
# All readings of Nevada_Clark_2002.0 between 2014-11-13 00:00 and
# 2014-11-18 00:00 (both bounds inclusive), shown in chronological order.
temp = df.loc[
    (df['SiteCode'] == 'Nevada_Clark_2002.0')
    & (df['LocalDate'] >= datetime(2014, 11, 13))
    & (df['LocalDate'] <= datetime(2014, 11, 18)),
    :,
].compute()
temp.sort_values('LocalDate')

Unnamed: 0,SiteCode,LocalDate,WindDir,WindSpeed,Temperature,Pressure,RelHum
50483,Nevada_Clark_2002.0,2014-11-13 00:00:00,241.0,0.583204,13.333333,94700.0,32.0
50428,Nevada_Clark_2002.0,2014-11-13 01:00:00,62.0,2.332815,13.888889,94700.0,34.0
50452,Nevada_Clark_2002.0,2014-11-13 02:00:00,49.0,4.66563,13.888889,94700.0,26.0
50484,Nevada_Clark_2002.0,2014-11-13 03:00:00,35.0,4.276827,15.0,94700.0,27.0
50453,Nevada_Clark_2002.0,2014-11-13 04:00:00,41.0,4.276827,14.444444,94700.0,27.0
50454,Nevada_Clark_2002.0,2014-11-13 05:00:00,47.0,4.66563,13.333333,94800.0,25.0
50455,Nevada_Clark_2002.0,2014-11-13 06:00:00,42.0,4.276827,13.333333,94800.0,28.0
50498,Nevada_Clark_2002.0,2014-11-13 07:00:00,52.0,5.248834,14.444444,94800.0,26.0
50499,Nevada_Clark_2002.0,2014-11-13 08:00:00,85.0,5.637636,16.666667,94900.0,24.0
50347,Nevada_Clark_2002.0,2014-11-13 13:00:00,48.0,3.304821,-50.0,94500.0,8.0


The sign of the temperature is switching, which is most likely an error.

Check how common such errors are.

In [170]:
# Keep only the columns needed for the temperature-jump analysis.
temps = df[['SiteCode', 'LocalDate', 'Temperature']]

In [171]:
# Distinct site codes as a plain Python list, followed by their count.
sites = (
    temps['SiteCode']
    .unique()
    .compute()
    .tolist()
)
len(sites)

99

In [172]:
# Materialise the three needed columns ONCE. Calling .compute() inside the
# loop re-executed the whole dask graph for every single site.
temps_local = temps.compute()

tables_to_investigate = []
for site in sites:
    # Readings of one site in chronological order.
    site_temps = temps_local.loc[temps_local.SiteCode == site, :].sort_values(by='LocalDate')
    # Previous reading of the same site (NaN for the very first row).
    site_temps['ShiftedTemperature'] = site_temps['Temperature'].shift(periods=1)
    # NOTE: consecutive rows are compared regardless of the time gap between them.
    site_temps['absolute_difference'] = (site_temps.Temperature - site_temps.ShiftedTemperature).abs()
    # Keep only the jumps larger than 5 between two consecutive readings.
    tables_to_investigate.append(site_temps.loc[site_temps.absolute_difference > 5])

# One table with all suspicious jumps, grouped site by site in the order of `sites`.
result = pd.concat(tables_to_investigate)

In [173]:
# Number of flagged temperature jumps per site.
result['SiteCode'].value_counts()

California_Santa Barbara_2011.0    684
New York_Herkimer_5.0              521
Nevada_Washoe_1026.0               485
Wyoming_Teton_8.0                  450
Colorado_Rio Blanco_5.0            388
Louisiana_East Baton Rouge_13.0    364
Wyoming_Natrona_2601.0             326
Wyoming_Sweetwater_200.0           310
Wyoming_Laramie_100.0              290
Wyoming_Sublette_99.0              239
Iowa_Linn_40.0                     232
Colorado_Rio Blanco_6.0            202
                                  ... 
California_Madera_2010.0            32
Michigan_Wayne_19.0                 31
Nevada_Clark_2002.0                 28
Michigan_Wayne_1.0                  27
Maryland_Dorchester_4.0             24
Nevada_Clark_75.0                   23
New Jersey_Essex_3.0                23
Michigan_Kent_20.0                  19
Oregon_Multnomah_80.0               14
Massachusetts_Suffolk_42.0          10
Washington_King_80.0                 3
Hawaii_Hawaii_7.0                    3
Name: SiteCode, Length: 9

In [174]:
# Hour-of-day distribution of the flagged jumps at California_Santa Barbara_2011.0.
(
    result.loc[result['SiteCode'] == 'California_Santa Barbara_2011.0', 'LocalDate']
    .dt.hour
    .value_counts()
)

8     255
9     197
7     122
10     28
17     22
6      14
18     14
23      4
21      4
11      3
1       3
12      3
22      2
15      2
2       2
20      2
0       2
13      1
14      1
16      1
19      1
5       1
Name: LocalDate, dtype: int64

In [175]:
# Temperature before and after each flagged jump at California_Santa Barbara_2011.0.
before_temp, after_temp = (
    result.loc[result['SiteCode'] == 'California_Santa Barbara_2011.0', column]
    for column in ('ShiftedTemperature', 'Temperature')
)

# Fraction of the flagged jumps that are temperature increases.
(before_temp < after_temp).mean()

0.9283625730994152

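The drastic temperature shifts at 'California_Santa Barbara_2011.0' seem to be caused by sunrise: about 93% of them are increases, and most of them occur between 7 and 9 a.m.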

https://www.timeanddate.com/sun/usa/santa-barbara

In [176]:
# Flagged temperature jumps at Nevada_Clark_2002.0.
result.loc[result['SiteCode'] == 'Nevada_Clark_2002.0']

Unnamed: 0,SiteCode,LocalDate,Temperature,ShiftedTemperature,absolute_difference
207544,Nevada_Clark_2002.0,2012-01-06 09:00:00,10.555556,3.888889,6.666667
207694,Nevada_Clark_2002.0,2012-01-07 09:00:00,14.444444,8.333333,6.111111
207832,Nevada_Clark_2002.0,2012-03-07 04:00:00,3.888889,22.222222,18.333333
207777,Nevada_Clark_2002.0,2012-03-25 21:00:00,13.333333,18.888889,5.555556
208216,Nevada_Clark_2002.0,2012-08-10 09:00:00,36.666667,26.666667,10.000000
208058,Nevada_Clark_2002.0,2012-10-02 13:00:00,35.000000,28.888889,6.111111
480210,Nevada_Clark_2002.0,2013-04-08 07:00:00,10.000000,15.555556,5.555556
480662,Nevada_Clark_2002.0,2013-07-19 19:00:00,26.111111,41.111111,15.000000
480665,Nevada_Clark_2002.0,2013-07-20 22:00:00,25.555556,39.444444,13.888889
480622,Nevada_Clark_2002.0,2013-08-18 15:00:00,30.000000,37.777778,7.777778


In [180]:
# Number of extreme jumps (absolute difference above 20) per site.
(
    result.loc[result['absolute_difference'] > 20, 'SiteCode']
    .value_counts()
)

Nevada_Clark_540.0                10
Nevada_Clark_2002.0                5
New Mexico_Bernalillo_23.0         3
Iowa_Scott_15.0                    2
Nevada_Washoe_1026.0               2
Michigan_Wayne_1.0                 2
Louisiana_East Baton Rouge_9.0     2
Wyoming_Sweetwater_200.0           2
Alabama_Jefferson_23.0             2
Texas_Harris_416.0                 1
Idaho_Nez Perce_12.0               1
Missouri_Jefferson_19.0            1
Ohio_Hamilton_40.0                 1
Oregon_Klamath_4.0                 1
Michigan_Wayne_19.0                1
California_Kern_6001.0             1
Name: SiteCode, dtype: int64

Conclusion: there is definitely something wrong with the temperature measurements at Nevada Clark 2002 and 540.

Solution: drop the faulty months of data from these 2 sites.

In [190]:
# Extreme jumps (absolute difference above 20) at the two suspicious Nevada Clark sites.
cond = (result.absolute_difference > 20) & (result.SiteCode.isin(['Nevada_Clark_540.0', 'Nevada_Clark_2002.0']))

# Build the table with .assign instead of assigning a column onto a .loc
# slice, which can trigger SettingWithCopyWarning; timestamps are reduced
# to calendar dates.
temp = (
    result.loc[cond, ['SiteCode', 'LocalDate']]
    .assign(LocalDate=lambda jumps: jumps.LocalDate.dt.date)
)

# One row per site and day. De-duplicating on the date alone would hide a
# site whenever both sites had a faulty reading on the same day.
temp.drop_duplicates(['SiteCode', 'LocalDate'])

Unnamed: 0,SiteCode,LocalDate
50347,Nevada_Clark_2002.0,2014-11-13
50464,Nevada_Clark_2002.0,2014-11-17
398501,Nevada_Clark_540.0,2011-04-14
398430,Nevada_Clark_540.0,2011-07-15
398585,Nevada_Clark_540.0,2011-09-01
399012,Nevada_Clark_540.0,2011-09-27
399021,Nevada_Clark_540.0,2011-09-29


Months to drop:
* Nevada_Clark_2002.0: November 2014
* Nevada_Clark_540.0: April, July and September 2011

In [None]:
# Row masks for the faulty months of the two Nevada Clark sites.
# Built on `df` because `weather` is only assigned (as an alias of `df`) in a
# later cell; referencing `weather` here raised a NameError on a fresh
# top-to-bottom run.
clark2002 = (df.SiteCode == 'Nevada_Clark_2002.0') & (df.LocalDate.dt.year == 2014) & (df.LocalDate.dt.month == 11)
clark540 = (df.SiteCode == 'Nevada_Clark_540.0') & (df.LocalDate.dt.year == 2011) & (df.LocalDate.dt.month.isin([4, 7, 9]))

In [204]:
# `weather` starts out as another name for the full weather frame.
weather = df

In [205]:
# Number of rows in `weather` at this point of the notebook.
print(len(weather))

4889068


In [207]:
# Rows of Nevada_Clark_2002.0 recorded in November 2014.
clark2002 = (
    (weather['SiteCode'] == 'Nevada_Clark_2002.0')
    & (weather['LocalDate'].dt.year == 2014)
    & (weather['LocalDate'].dt.month == 11)
)
# Rows of Nevada_Clark_540.0 recorded in April, July or September 2011.
clark540 = (
    (weather['SiteCode'] == 'Nevada_Clark_540.0')
    & (weather['LocalDate'].dt.year == 2011)
    & (weather['LocalDate'].dt.month.isin([4, 7, 9]))
)
# Keep every row that belongs to neither of the two faulty periods.
weather = weather.loc[~(clark2002 | clark540), :]
print(len(weather))

4886268


In [201]:
# Row count of `df`; the filtering above rebinds `weather` only, `df` is not reassigned.
len(df)

4889068