In [1]:
import datetime
import re

import numpy as np
import pandas as pd

__author__ = "Gifrey John M. Sulay"
__copyright__ = "Xavier University - Engineering Resource Center"


In [2]:
#load the three edited sensor exports (water-level tag, rain gauge 1, rain gauge 0)
tag, rg1, rg0 = map(
    pd.read_csv,
    ('Edited_Data/tag.csv', 'Edited_Data/rg1.csv', 'Edited_Data/rg0.csv'),
)

In [3]:
#keep only the timestamp and reading columns of each source frame
sensor_columns = ['Timestamp', 'Sensor Value']
tag_f, rg0_f, rg1_f = (frame[sensor_columns] for frame in (tag, rg0, rg1))

In [4]:
#create function to convert timestamp to day year/month/day hour
def timestamp_conv(df):
    """Convert ``M/D/Y H:M[:S] [AM|PM]`` timestamps into hourly ``Y/M/D H`` keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read positionally: column 0 holds the timestamp strings and
        column 1 holds the readings. Column names are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``'Timestamp'`` (for example
        ``'2020/3/25 13'``; numbers are not zero padded and minutes/seconds
        are discarded) and ``'Values'`` (column 1 of ``df``, unchanged).

    Raises
    ------
    ValueError
        If a timestamp is not made of two or three whitespace-separated
        parts, or if its third part is not ``AM``/``PM`` (any letter case).
    """
    timestamps = df.iloc[:, 0].tolist()
    values = df.iloc[:, 1].tolist()

    timestamp_edited = []
    for raw in timestamps:
        parts = raw.split()
        # Reject malformed rows explicitly; otherwise the hour parsed for the
        # previous row would silently be reused for this one.
        if len(parts) not in (2, 3):
            raise ValueError(f"Unrecognised timestamp format: {raw!r}")

        date = parts[0]
        hour = int(parts[1].split(':')[0])

        if len(parts) == 3:
            # 12-hour clock: 12 AM is hour 0 and 12 PM is hour 12.
            meridiem = parts[2].upper()
            if meridiem not in ('AM', 'PM'):
                raise ValueError(f"Unrecognised AM/PM marker in timestamp: {raw!r}")
            hour = hour % 12
            if meridiem == 'PM':
                hour += 12

        # Reorder month/day/year into year/month/day, dropping zero padding.
        date_split = date.split('/')
        month = date_split[0]
        day = date_split[1]
        year = date_split[2]

        timestamp_edited.append(f"{int(year)}/{int(month)}/{int(day)} {hour}")

    return pd.DataFrame({'Timestamp': timestamp_edited, 'Values': values})



In [5]:
#normalise the timestamps of every filtered frame to hourly keys
tag_n, rg0_n, rg1_n = map(timestamp_conv, (tag_f, rg0_f, rg1_f))

In [6]:
#generate the complete hourly key list used as the timestamp index of the dataframe
#returns clean_timestamp
start="2020-03-25"
end="2021-05-19"
# Step with an explicit one-hour timedelta rather than the 'H' frequency
# alias, which recent pandas releases deprecate.
hourly_range = pd.date_range(start=start, end=end, freq=datetime.timedelta(hours=1))
# Keys are "year/month/day hour" with no zero padding, both endpoints included.
clean_timestamp = [
    f"{stamp.year}/{stamp.month}/{stamp.day} {stamp.hour}" for stamp in hourly_range
]
len(clean_timestamp)

10081

In [7]:
#create df of duplicated values
# Every row whose Timestamp occurs more than once, grouped by Timestamp (stable
# sort keeps the original row order inside each group). A boolean mask is used
# so a frame with no duplicates yields an empty frame instead of making
# pd.concat raise on an empty sequence.
tag_n_duplicates=tag_n[tag_n.duplicated(subset='Timestamp', keep=False)].sort_values('Timestamp', kind='stable')
rg0_n_duplicates=rg0_n[rg0_n.duplicated(subset='Timestamp', keep=False)].sort_values('Timestamp', kind='stable')
# Rain gauge 1 duplicates come from rg1_n (not from rg0_n).
rg1_n_duplicates=rg1_n[rg1_n.duplicated(subset='Timestamp', keep=False)].sort_values('Timestamp', kind='stable')

#drop redundant data points
# keep=False removes every member of a duplicated group, not just the repeats.
tag_n=tag_n.drop_duplicates(subset='Timestamp', keep=False)
rg0_n=rg0_n.drop_duplicates(subset='Timestamp', keep=False)
rg1_n=rg1_n.drop_duplicates(subset='Timestamp', keep=False)

In [8]:
#frame holding the complete, gap-free list of hourly timestamps
ts = pd.DataFrame({'Timestamp': clean_timestamp})

#base frame: left-join the water level readings onto the full timestamp list
ts_tag_n = ts.merge(tag_n, how='left', on='Timestamp')
base = ts_tag_n.rename(columns={'Values': 'Water_Level'})
base

Unnamed: 0,Timestamp,Water_Level
0,2020/3/25 0,1.535440
1,2020/3/25 1,1.559315
2,2020/3/25 2,1.463815
3,2020/3/25 3,1.487690
4,2020/3/25 4,1.487690
...,...,...
10076,2021/5/18 20,
10077,2021/5/18 21,
10078,2021/5/18 22,
10079,2021/5/18 23,


In [9]:
#left-join rain gauge 0 onto the base frame and index the result by timestamp
# NOTE(review): 'Timestamp' is already the index when rename() runs, so the
# 'Timestamp' -> 'Timestamp y/m/d h' entry has no effect; confirm the intent.
base_rg0 = (
    base.merge(rg0_n, how='left', on='Timestamp')
    .set_index('Timestamp')
    .rename(columns={'Values': 'RG0', 'Timestamp': 'Timestamp y/m/d h'})
)
base_rg0

Unnamed: 0_level_0,Water_Level,RG0
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/3/25 0,1.535440,319.6
2020/3/25 1,1.559315,319.6
2020/3/25 2,1.463815,319.6
2020/3/25 3,1.487690,319.6
2020/3/25 4,1.487690,319.6
...,...,...
2021/5/18 20,,
2021/5/18 21,,
2021/5/18 22,,
2021/5/18 23,,


In [10]:
#left-join rain gauge 1 onto the base frame and index the result by timestamp
# NOTE(review): 'Timestamp' is already the index when rename() runs, so the
# 'Timestamp' -> 'Timestamp y/m/d h' entry has no effect; confirm the intent.
base_rg1 = (
    base.merge(rg1_n, how='left', on='Timestamp')
    .set_index('Timestamp')
    .rename(columns={'Values': 'RG1', 'Timestamp': 'Timestamp y/m/d h'})
)
base_rg1

Unnamed: 0_level_0,Water_Level,RG1
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020/3/25 0,1.535440,0.0
2020/3/25 1,1.559315,0.0
2020/3/25 2,1.463815,0.0
2020/3/25 3,1.487690,0.0
2020/3/25 4,1.487690,0.0
...,...,...
2021/5/18 20,,
2021/5/18 21,,
2021/5/18 22,,
2021/5/18 23,,


In [11]:
#write both merged frames to one workbook, one sheet per rain gauge
cleaned_sheets = (
    ('Water Level - Rain Gauge 0', base_rg0),
    ('Water Level - Rain Gauge 1', base_rg1),
)
with pd.ExcelWriter('Cleaned_Data.xlsx') as writer:
    for sheet_name, frame in cleaned_sheets:
        frame.to_excel(writer, sheet_name=sheet_name)

In [13]:
#list the timestamps at which each series has no reading
water_level_missing = base_rg0.loc[base_rg0['Water_Level'].isna()].index.tolist()
rg0_missing = base_rg0.loc[base_rg0['RG0'].isna()].index.tolist()
rg1_missing = base_rg1.loc[base_rg1['RG1'].isna()].index.tolist()

water_level_missing_df = pd.DataFrame({'Missing Water Level': water_level_missing})
rg0_missing_df = pd.DataFrame({'Missing RG0': rg0_missing})
rg1_missing_df = pd.DataFrame({'Missing RG1': rg1_missing})

#workbook of missing timestamps, one sheet per series
missing_sheets = (
    ('Water Level', water_level_missing_df),
    ('Rain Gauge 0', rg0_missing_df),
    ('Rain Gauge 1', rg1_missing_df),
)
with pd.ExcelWriter('Missing Values.xlsx') as writer:
    for sheet_name, frame in missing_sheets:
        frame.to_excel(writer, sheet_name=sheet_name)

#workbook of the duplicated rows that were removed, indexed by timestamp
duplicate_sheets = (
    ('Water Level', tag_n_duplicates),
    ('Rain Gauge 0', rg0_n_duplicates),
    ('Rain Gauge 1', rg1_n_duplicates),
)
with pd.ExcelWriter('Duplicated Values.xlsx') as writer:
    for sheet_name, frame in duplicate_sheets:
        frame.set_index('Timestamp').to_excel(writer, sheet_name=sheet_name)