In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from datetime import datetime,date,timedelta
import pickle

In [2]:
station = 'KCAWATSO86'
staname = 'watsonvil'


def window_start(today, years=2, months=5, days=5):
    """Return the date `years` years, `months` months and `days` days before `today`.

    Building the date with ``date(year-2, month-5, day-5)`` raises ValueError
    whenever the month or day component drops below 1 (i.e. Jan-May, or the
    1st-5th of any month). This helper borrows across month/year boundaries
    instead, and yields exactly the same date as the naive construction in
    every case where that construction is valid.

    Parameters
    ----------
    today : datetime.date
        Reference date (normally the current date).
    years, months, days : int
        Size of the look-back window.

    Returns
    -------
    datetime.date
    """
    # Count whole months since year 0 so that borrowing across a year
    # boundary (e.g. March minus 5 months) falls out of integer division.
    month_index = today.year * 12 + (today.month - 1) - (years * 12 + months)
    first_of_month = date(month_index // 12, month_index % 12 + 1, 1)
    # Re-apply the day-of-month as an offset from the 1st; a negative offset
    # rolls back into the previous month rather than raising.
    return first_of_month + timedelta(days=today.day - 1 - days)


now = datetime.now().date()
sdate = window_start(now)
# Number of calendar days in the window [sdate, now).
datediff = (now-sdate).days

In [3]:
# Load the raw observations for this station; the file name encodes the
# window start date and yesterday's date.
last_day = now - timedelta(days=1)
raw_pickle = f'{staname}_raw_{sdate}_{last_day}.pkl'
# Caution: unpickling executes code embedded in the file, so only load
# pickles produced by a trusted source.
allwind = pd.read_pickle(raw_pickle)

In [4]:
# Show the first three and last three rows of the raw data side by side.
pd.concat([allwind.iloc[:3], allwind.iloc[-3:]])

Unnamed: 0,Date,Time,Season,Month,Daypart,Temp,Wind,Azimuth,Speed,Gust
0,2020-05-25,4:33 PM,Spring,May,Afternoon,91.8,WSW,247.5,0.6,1.1
1,2020-05-25,4:39 PM,Spring,May,Afternoon,92.3,WSW,247.5,2.8,3.1
2,2020-05-25,4:44 PM,Spring,May,Afternoon,92.5,WSW,247.5,2.6,3.0
285,2022-10-25,11:49 PM,Fall,Oct,Overnight,45.5,SSE,157.5,0.0,0.0
286,2022-10-25,11:54 PM,Fall,Oct,Overnight,46.2,SSE,157.5,0.0,0.0
287,2022-10-25,11:59 PM,Fall,Oct,Overnight,46.3,SSE,157.5,0.0,0.0


In [6]:
# Build a per-day coverage table for every date in [sdate, sdate + datediff):
#   Records  - number of rows recorded that day
#   PctTotal - Records as a percentage of the expected 5-minute readings
#   NoDir    - rows whose 'Wind' value is the empty string
#   PctNoDir - NoDir as a percentage of Records (0.0 when the day has no rows)
# fullcov counts the days whose PctTotal is exactly 100.0.

# Expected readings per day: 12 five-minute intervals per hour * 24 hours.
INTERVALS_PER_DAY = 12 * 24
COVERAGE_COLUMNS = ['Date', 'Records', 'PctTotal', 'NoDir', 'PctNoDir']

# Count rows per date once, instead of re-filtering the whole frame twice for
# every day in the window.
# NOTE(review): relies on allwind['Date'] holding datetime.date objects, as the
# equality comparisons against date(...) elsewhere in this notebook do.
records_by_day = allwind['Date'].value_counts().to_dict()
nodir_by_day = allwind.loc[allwind['Wind'] == '', 'Date'].value_counts().to_dict()

coverage_rows = []
fullcov = 0
for i in range(datediff):
    d = sdate + timedelta(days=i)
    itvls = records_by_day.get(d, 0)
    nullwcov = nodir_by_day.get(d, 0)
    cov = round(100 * itvls / INTERVALS_PER_DAY, 1)
    if cov == 100.0:
        fullcov += 1
    # Guard against division by zero on days with no records at all.
    pctnullw = round(100 * nullwcov / itvls, 1) if itvls != 0 else 0.0
    coverage_rows.append({'Date': d, 'Records': itvls, 'PctTotal': cov,
                          'NoDir': nullwcov, 'PctNoDir': pctnullw})

# Construct the frame in one step: concatenating inside the loop is quadratic
# and leaves every row with index label 0. The result here has a unique
# 0..n-1 index and properly typed numeric columns.
dfcov = pd.DataFrame(coverage_rows, columns=COVERAGE_COLUMNS)

In [6]:
print('duplicate entries when daylight savings rolls back:')
# Rank days by coverage (highest first), breaking ties by the smallest share
# of blank wind directions, and show the top three.
coverage_ranked = dfcov.sort_values(by=['PctTotal', 'PctNoDir'], ascending=[False, True])
coverage_ranked.iloc[:3]

duplicate entries when daylight savings rolls back:


Unnamed: 0,Date,Records,PctTotal,NoDir,PctNoDir
0,2021-11-07,300,104.2,4,1.3
0,2021-06-01,288,100.0,0,0.0
0,2021-06-03,288,100.0,0,0.0


In [7]:
from collections import Counter

# For each date of interest, report the timestamps that occur more than once.
# Each entry pairs the date with the exact heading printed above its result.
dst_checks = [
    (date(2020, 11, 1), "2020-11-01 extras:"),
    (date(2021, 11, 7), "\n2021-11-07 extras:"),
]
for check_day, heading in dst_checks:
    time_counts = Counter(allwind.loc[allwind['Date'] == check_day, 'Time'].tolist())
    print(heading)
    print({stamp: n for stamp, n in time_counts.items() if n > 1})

2020-11-01 extras:
{}

2021-11-07 extras:
{'1:04 AM': 2, '1:09 AM': 2, '1:14 AM': 2, '1:19 AM': 2, '1:24 AM': 2, '1:29 AM': 2, '1:34 AM': 2, '1:39 AM': 2, '1:44 AM': 2, '1:49 AM': 2, '1:54 AM': 2, '1:59 AM': 2}


In [8]:
print('Entries from 1AM-2AM are recorded twice due to daylight savings')
# Rows on 2020-11-01 whose time string starts with '1:0' and whose
# second-to-last character is 'A' (the 'A' of a trailing "AM").
allwind.loc[
    (allwind['Date'] == date(2020, 11, 1))
    & (allwind['Time'].str.slice(0, 3) == '1:0')
    & (allwind['Time'].str.get(-2) == 'A')
]

Entries from 1AM-2AM are recorded twice due to daylight savings


Unnamed: 0,Date,Time,Temp,Wind,Azimuth,Speed,Gust,Month


In [9]:
# Select rows with prefix/suffix string tests (startswith / endswith) rather
# than positional string indexing.
is_dst_day = allwind['Date'] == date(2021, 11, 7)
starts_at_1_0 = allwind['Time'].str.startswith('1:0')
ends_with_am = allwind['Time'].str.endswith('AM')
allwind[is_dst_day & starts_at_1_0 & ends_with_am]

Unnamed: 0,Date,Time,Temp,Wind,Azimuth,Speed,Gust,Month
12,2021-11-07,1:04 AM,56.2,WNW,292.5,2.6,5.2,Nov
13,2021-11-07,1:09 AM,56.1,WNW,292.5,2.0,3.5,Nov
24,2021-11-07,1:04 AM,54.8,WNW,292.5,2.1,3.0,Nov
25,2021-11-07,1:09 AM,54.9,WNW,292.5,2.6,4.8,Nov


In [10]:
# Share of days in the window that have exactly 100.0% coverage.
pct_full = round(100*(fullcov/datediff),1)
print(f"# days with full coverage of all 5-min intervals: {fullcov} ({pct_full}%)")

# days with full coverage of all 5-min intervals: 328 (44.9%)


In [11]:
print("large gap in coverage:")
# Last two rows of 2022-09-21 followed by the first two rows of 2022-09-22.
day_before_gap = allwind[allwind['Date'] == date(2022, 9, 21)]
day_after_gap = allwind[allwind['Date'] == date(2022, 9, 22)]
pd.concat([day_before_gap.tail(2), day_after_gap.head(2)])

large gap in coverage:


Unnamed: 0,Date,Time,Temp,Wind,Azimuth,Speed,Gust,Month
158,2022-09-21,3:19 PM,68.6,SW,225.0,3.8,8.9,Sep
159,2022-09-21,7:39 PM,66.7,WSW,247.5,0.1,0.7,Sep
0,2022-09-22,8:24 AM,55.2,WNW,292.5,0.3,1.5,Sep
1,2022-09-22,8:29 AM,55.7,WNW,292.5,0.4,1.3,Sep


In [12]:
# Persist the per-day coverage table as a pickle.
# NOTE(review): the 'ppoint' prefix is hard-coded, whereas the input file name
# is built from staname -- confirm this is the intended output name.
coverage_pickle = f'ppoint_raw_coverage_{sdate}_{now-timedelta(days=1)}.pkl'
dfcov.to_pickle(coverage_pickle)

In [13]:
# Persist the per-day coverage table as CSV.
# NOTE(review): the 'ppoint' prefix is hard-coded, whereas the input file name
# is built from staname -- confirm this is the intended output name.
coverage_csv = f'ppoint_raw_coverage_{sdate}_{now-timedelta(days=1)}.csv'
dfcov.to_csv(coverage_csv)