In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import xarray as xr
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib import ticker
from datetime import datetime, timedelta, date
from matplotlib.path import Path
import matplotlib.patheffects as pe
import glob,os
import time
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import scipy
from scipy import stats

In [57]:
#specify date interval for the range of proto2 files you want to convert over to 5min
#(start date is included, end date is excluded)
RAW_DIR='/data/aq/inhouse/raw/aqmon002/'
OUT_DIR='/data/aq/inhouse/corrected/bad_rows_removed_5min/'
dates=np.arange(datetime(2021,6,10),datetime(2022,6,1),timedelta(days=1)).astype(datetime)


def proto2_to_5min(proto2,day):
    """Clean one day of raw proto2 rows and average them into 5 minute bins.

    Parameters
    ----------
    proto2 : pandas.DataFrame
        Raw rows for one day. Must have a 'datetimestamp' column holding
        strings in '%Y%m%dT%H%M%S' form; every column has to be convertible
        with pd.to_numeric once the bad rows are removed.
    day : datetime
        The day being processed.

    Returns
    -------
    pandas.DataFrame
        5 minute means indexed by a DatetimeIndex named 'date'. If the
        resampled data does not have 288 rows it is reindexed onto the full
        00:00-23:55 grid, with missing intervals filled with NaN.
    """
    midnight=pd.Timestamp(day).normalize()
    #name the full-day grid so the 'date' index header is kept when we reindex onto it
    full_day=pd.date_range(midnight,periods=288,freq='5min',name='date')

    #some weird stuff going on with the data, need to filter it out and drop the bad rows
    #anything that doesn't parse as a timestamp becomes NaT (instead of raising) and
    #anything that isn't on this day is dropped
    times=pd.to_datetime(proto2['datetimestamp'],format='%Y%m%dT%H%M%S',errors='coerce')
    good=(times.dt.floor('D')==midnight).to_numpy()

    #positional (numpy) masks are used so a non-unique incoming index can't break alignment
    clean=proto2.loc[good].copy()
    clean['datetimestamp']=times.to_numpy()[good]

    #switch to a datetime index and sort it so it's in chronological order
    clean.index=pd.DatetimeIndex(clean['datetimestamp'],name='date')
    clean=clean.sort_index()

    #resample the data to 5 min since we don't really use the 5 second interval
    #saves on filespace and computation time
    proto2_5min=clean.apply(pd.to_numeric).resample('5min').mean()

    #sometimes there are still some missing times that lack data
    #fill any missing 5 minute intervals with NaN
    if len(proto2_5min)!=len(full_day):
        print('HEY THIS DATE ISNT FULL 5 MINUTE INTERVAL')
        proto2_5min=proto2_5min.reindex(full_day)
        print('FIXED LENGTH (SHOULD BE 288): '+str(len(proto2_5min)))

    return proto2_5min


#loop through each day in the date interval
for day in dates:
    print(day)

    daystamp=day.strftime('%Y%m%d')
    month_dir=RAW_DIR+day.strftime('%Y')+'/'+day.strftime('%m')+'/'

    #raw proto2 files are saved hourly at 5 second intervals so loop through each hour of the day
    #sometimes a file isn't written for a specific hour, so only keep paths that actually exist
    hourly_paths=[month_dir+daystamp+'T'+str(hour).zfill(2)+'.csv' for hour in range(24)]
    proto2_filepaths=[fname for fname in hourly_paths if os.path.isfile(fname)]

    #pd.concat raises on an empty list, so skip days without any raw files
    #rather than aborting the whole run
    if not proto2_filepaths:
        print('NO RAW FILES FOUND FOR THIS DATE, SKIPPING')
        continue

    #read in all the files in the list and concatenate it into a single dataframe
    #with a plain numerical index
    proto2=pd.concat(
        (pd.read_csv(f,skiprows=[0,2],on_bad_lines='skip') for f in proto2_filepaths),
        ignore_index=True,
    )

    proto2_5min=proto2_to_5min(proto2,day)

    #write the output to a file
    proto2_5min.to_csv(OUT_DIR+daystamp+'.csv')

2021-06-10 00:00:00
2021-06-11 00:00:00
2021-06-12 00:00:00
2021-06-13 00:00:00
2021-06-14 00:00:00
HEY THIS DATE ISNT FULL 5 MINUTE INTERVAL
FIXED LENGTH (SHOULD BE 288): 288
2021-06-15 00:00:00
2021-06-16 00:00:00
2021-06-17 00:00:00
2021-06-18 00:00:00
2021-06-19 00:00:00
2021-06-20 00:00:00
2021-06-21 00:00:00
2021-06-22 00:00:00
2021-06-23 00:00:00
2021-06-24 00:00:00
2021-06-25 00:00:00
2021-06-26 00:00:00
2021-06-27 00:00:00
2021-06-28 00:00:00
2021-06-29 00:00:00
2021-06-30 00:00:00
2021-07-01 00:00:00
2021-07-02 00:00:00
2021-07-03 00:00:00
2021-07-04 00:00:00
2021-07-05 00:00:00
2021-07-06 00:00:00
2021-07-07 00:00:00
2021-07-08 00:00:00
2021-07-09 00:00:00
HEY THIS DATE ISNT FULL 5 MINUTE INTERVAL
FIXED LENGTH (SHOULD BE 288): 288
2021-07-10 00:00:00
2021-07-11 00:00:00
2021-07-12 00:00:00
2021-07-13 00:00:00
2021-07-14 00:00:00
2021-07-15 00:00:00
2021-07-16 00:00:00
2021-07-17 00:00:00
2021-07-18 00:00:00
2021-07-19 00:00:00
2021-07-20 00:00:00
2021-07-21 00:00:00
2021-07-