In [None]:
import numpy as np
import pandas as pd
import re

__author__ = "Gifrey John M. Sulay"
__copyright__ = "Xavier University - Engineering Resource Center"


In [None]:
#open data
tag=pd.read_csv('Edited_Data/tag.csv')
rg1=pd.read_csv('Edited_Data/rg1.csv')
rg0=pd.read_csv('Edited_Data/rg0.csv')

In [None]:
#filter data
tag_f=tag[['Timestamp','Sensor Value']].rename(columns={'Sensor Value':'Water_Level'})
rg0_f=rg0[['Timestamp','Sensor Value']].rename(columns={'Sensor Value':'RG0_Level'})
rg1_f=rg1[['Timestamp','Sensor Value']].rename(columns={'Sensor Value':'RG1_Level'})

In [4]:
#create function to convert timestamp to day year/month/day hour
def timestamp_conv(df):
    headers=list(df.columns.values)
    array=df.to_numpy()
    timestamps=array[:,0]
    values=array[:,1]
    
    timestamp_edited =[]
    for i in timestamps:
        timestamp=i.split()
        date=timestamp[0]
        time=timestamp[1]
        hms=time.split(':')
        
        if len(timestamp)==3:
            morn_aft=timestamp[2]
            if int(hms[0])==12 and morn_aft=='AM':
                hour=0
            elif int(hms[0])==12 and morn_aft=='PM':
                hour=12
            elif morn_aft=='PM':
                hour=int(hms[0])+12
            elif morn_aft=='AM':
                hour=int(hms[0])

        if len(timestamp)==2:
            hour=int(hms[0])
        #reorganized timestamp to y/m/d
        date_split=date.split('/')
        month=date_split[0]
        day=date_split[1]
        year=date_split[2]
        reorganized_date=f"{int(year)}/{int(month)}/{int(day)}"

        new_timestamp=f"{reorganized_date} {int(hour)}:00:00"
        timestamp_edited.append(new_timestamp)
    
    timestamp_edited_arr=np.array(timestamp_edited)
    
    timestamp_edited_arr=timestamp_edited_arr[:,np.newaxis]
    values=values[:, np.newaxis]
    
    l1=list(timestamp_edited_arr)
    l2=list(values)
    
    data=np.hstack([timestamp_edited_arr, values])
    
    new_dataframe=pd.DataFrame(data, columns=headers)
    return new_dataframe



In [5]:
#convert timestamps
tag_n=timestamp_conv(tag_f)
rg0_n=timestamp_conv(rg0_f)
rg1_n=timestamp_conv(rg1_f)

In [6]:
#generate timestamp array for timestamp index for dataframe
#returns clean_timestamp
start="2020-03-25"
end="2021-05-19"
import datetime
x = pd.date_range(start=start, end=end, freq='H').tolist()
clean_timestamp=[]
for i in x:
    month=i.strftime("%m")
    day=i.strftime("%d")
    year=i.strftime("%Y")
    hour=i.strftime("%H")
    
    string=f"{int(year)}/{int(month)}/{int(day)} {int(hour)}:00:00"
    clean_timestamp.append(string)

In [7]:
#create df of duplicated values
tag_n_duplicates=pd.concat(g for _, g in tag_n.groupby("Timestamp") if len(g) > 1)
rg0_n_duplicates=pd.concat(g for _, g in rg0_n.groupby("Timestamp") if len(g) > 1)
rg1_n_duplicates=pd.concat(g for _, g in rg0_n.groupby("Timestamp") if len(g) > 1)

#drop redundant data points
tag_n.drop_duplicates(subset='Timestamp', keep=False, inplace=True)
rg0_n.drop_duplicates(subset='Timestamp', keep=False, inplace=True)
rg1_n.drop_duplicates(subset='Timestamp', keep=False, inplace=True)

In [8]:
#create dataframe for complete and correct timestamp
ts=pd.DataFrame({'Timestamp':clean_timestamp})

#create base dataframe(timestamp and water level)
ts_tag_n=pd.merge(ts,tag_n,how='left',on='Timestamp')
base=ts_tag_n

In [9]:
#merge base and rain gauge 0
base_rg0=pd.merge(base,rg0_n,how='left',on='Timestamp').set_index('Timestamp')
base_rg0

Unnamed: 0_level_0,Water_Level,RG0_Level
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/3/25 0:00:00,1.53544,319.6
2020/3/25 1:00:00,1.55932,319.6
2020/3/25 2:00:00,1.46382,319.6
2020/3/25 3:00:00,1.48769,319.6
2020/3/25 4:00:00,1.48769,319.6
...,...,...
2021/5/18 20:00:00,,
2021/5/18 21:00:00,,
2021/5/18 22:00:00,,
2021/5/18 23:00:00,,


In [10]:
#merge base and rain gauge 1
base_rg1=pd.merge(base,rg1_n,how='left',on='Timestamp').set_index('Timestamp')
base_rg1

Unnamed: 0_level_0,Water_Level,RG1_Level
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/3/25 0:00:00,1.53544,0
2020/3/25 1:00:00,1.55932,0
2020/3/25 2:00:00,1.46382,0
2020/3/25 3:00:00,1.48769,0
2020/3/25 4:00:00,1.48769,0
...,...,...
2021/5/18 20:00:00,,
2021/5/18 21:00:00,,
2021/5/18 22:00:00,,
2021/5/18 23:00:00,,


In [11]:
#save to excel sheet
with pd.ExcelWriter('Cleaned_Data.xlsx') as writer:
    base_rg0.to_excel(writer, sheet_name='Water Level - Rain Gauge 0')
    base_rg1.to_excel(writer, sheet_name='Water Level - Rain Gauge 1')

In [12]:
#create spreadsheet of missing values
water_level_missing=base_rg0[base_rg0['Water_Level'].isnull()].index.tolist()
rg0_missing=base_rg0[base_rg0['RG0_Level'].isnull()].index.tolist()
rg1_missing=base_rg1[base_rg1['RG1_Level'].isnull()].index.tolist()

water_level_missing_df=pd.DataFrame({'Missing Water Level':water_level_missing})
rg0_missing_df=pd.DataFrame({'Missing RG0':rg0_missing})
rg1_missing_df=pd.DataFrame({'Missing RG1':rg1_missing})

with pd.ExcelWriter('Missing Values.xlsx') as writer:
    water_level_missing_df.to_excel(writer, sheet_name='Water Level')
    rg0_missing_df.to_excel(writer, sheet_name='Rain Gauge 0')
    rg1_missing_df.to_excel(writer, sheet_name='Rain Gauge 1')
    
with pd.ExcelWriter('Duplicated Values.xlsx') as writer:
    tag_n_duplicates.set_index('Timestamp').to_excel(writer, sheet_name='Water Level')
    rg0_n_duplicates.set_index('Timestamp').to_excel(writer, sheet_name='Rain Gauge 0')
    rg1_n_duplicates.set_index('Timestamp').to_excel(writer, sheet_name='Rain Gauge 1')

In [20]:
#interpolate water level
new_water_level=base['Water_Level'].astype('float64').interpolate()

#replace nan values to 0 in RG0
rg0_copy=base_rg0['RG0_Level'].fillna(0).to_numpy()

#create difference on Rain Gauge 0
rg0_diff=[0]
count=1
for i in rg0_copy[1:]:
    diff=i-rg0_copy[count-1]
    rg0_diff.append(diff)
    count+=1


10081

In [24]:
#create dataframe with corrected data using base dataframe
corrected_df=base
corrected_df['Corrected_Water_Level']= new_water_level
corrected_df=corrected_df.drop(columns=['Water_Level'])
corrected_df['Corrected_RG0_Level','RG0_']=rg0_copy
corrected_df['RG0_Diff']=rg0_diff
corrected_df=corrected_df.set_index('Timestamp')
corrected_df

Unnamed: 0_level_0,Corrected_Water_Level,"(Corrected_RG0_Level, RG0_)",RG0_Diff
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020/3/25 0:00:00,1.535440,319.6,0.0
2020/3/25 1:00:00,1.559315,319.6,0.0
2020/3/25 2:00:00,1.463815,319.6,0.0
2020/3/25 3:00:00,1.487690,319.6,0.0
2020/3/25 4:00:00,1.487690,319.6,0.0
...,...,...,...
2021/5/18 20:00:00,2.251690,0.0,0.0
2021/5/18 21:00:00,2.251690,0.0,0.0
2021/5/18 22:00:00,2.251690,0.0,0.0
2021/5/18 23:00:00,2.251690,0.0,0.0


In [27]:
with pd.ExcelWriter('Corrected_Data.xlsx') as writer:
    corrected_df.to_excel(writer,sheet_name='Corrected_Water_and_RG0_Level')