In [57]:
import pandas as pd
from pathlib import Path
import os

In [58]:
base_path = Path(r'Rain Gauge\Uncleaned Cagayan Data\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data')
all_files = os.listdir(base_path)

def xl_file_sorter(all_files):
    xl_files = []
    for file in all_files:
        if file.split('.')[-1] == 'xlsx':
            xl_files.append(os.path.join(base_path,file))
    return xl_files

xl_files = xl_file_sorter(all_files)

## Convert Excel to Pandas Dataframe then to CSV

In [59]:
from openpyxl import load_workbook

In [60]:
import numpy as np
import datetime

def ws_to_df(ws):
    arry = []
    for r_index, row in enumerate(ws.iter_rows(min_row=10, min_col=1, max_col = 2)):
        row_arry=[]
        for c_index, cell in enumerate(row):
            row_arry.append(cell.value)
        arry.append(row_arry)
    df = pd.DataFrame(arry, columns =['Timestamp','Sensor Value'])
    df.iloc[:,0] = df.iloc[:,0].apply(lambda x: pd.to_datetime(x).strftime("%m/%d/%Y %H:%M:%S"))
    df.iloc[:,1] = df.iloc[:,1].apply(lambda x: float("{:.5f}".format(x)))
    return df


In [61]:
df_list = []
for index, file in enumerate(xl_files):
    wb = load_workbook(file)    #load excel using openpyxl
    ws = wb.active    #select active worksheet
    df_list.append(ws_to_df(ws))

In [62]:
xl_files

['Rain Gauge\\Uncleaned Cagayan Data\\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data\\g4.areaAvgTimeSeries.GPM_3IMERGHHE_06_precipitationCal.20210101-20210131.124E_8N_124E_8N.csv.xlsx',
 'Rain Gauge\\Uncleaned Cagayan Data\\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data\\g4.areaAvgTimeSeries.GPM_3IMERGHHE_06_precipitationCal.20210201-20210228.124E_8N_124E_8N.csv.xlsx',
 'Rain Gauge\\Uncleaned Cagayan Data\\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data\\g4.areaAvgTimeSeries.GPM_3IMERGHHE_06_precipitationCal.20210301-20210331.124E_8N_124E_8N.csv.xlsx',
 'Rain Gauge\\Uncleaned Cagayan Data\\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data\\g4.areaAvgTimeSeries.GPM_3IMERGHHE_06_precipitationCal.20210401-20210430.124E_8N_124E_8N.csv.xlsx',
 'Rain Gauge\\Uncleaned Cagayan Data\\CDORB (Ugiaban) Rain Data GPM IMERG Half-hourly data\\g4.areaAvgTimeSeries.GPM_3IMERGHHE_06_precipitationCal.20210501-20210531.124E_8N_124E_8N.csv.xlsx',
 'Rain Gauge\\Uncleaned Cagayan Data\\CD

In [63]:
main_df = pd.concat(df_list,ignore_index=True)

In [64]:
main_df['Timestamp_New'] = main_df['Timestamp'].apply(lambda x: pd.to_datetime(x))
main_df = main_df.sort_values(by='Timestamp_New').reset_index(drop=True).drop(columns=['Timestamp_New'])

In [65]:
main_df

Unnamed: 0,Timestamp,Sensor Value
0,12/31/2020 00:00:00,0.00000
1,12/31/2020 00:30:00,0.00000
2,12/31/2020 01:00:00,0.00000
3,12/31/2020 01:30:00,0.00000
4,12/31/2020 02:00:00,0.01515
...,...,...
26203,06/30/2022 21:30:00,0.00000
26204,06/30/2022 22:00:00,0.00000
26205,06/30/2022 22:30:00,0.00000
26206,06/30/2022 23:00:00,0.00000


In [66]:
#returns string start date YearMonthDate
start = main_df.iloc[0,0].split()[0].split('/')
start[2] = start[2][2:] #Extracts only the last 2 ends of the year
start = "".join([start[i] for i in [2,1,0]]) #joins and rearranges list to specified date

#returns string end date Month_Year
end = main_df.iloc[-1,0].split()[0].split('/')
end[2] = end[2][2:] #Extracts only the last 2 ends of the year
end = "".join([end[i] for i in [2,1,0]]) #joins and rearranges list to specified date

print(start,end)

203112 223006


In [67]:
filename = f'CAG Rain Gauge {start} to {end}.xlsx'
main_df.to_excel(Path(f'Rain Gauge/Cagayan/Rain Gauge 1/{filename}'))
main_df.to_excel(Path(f'Rain Gauge/Cagayan/Rain Gauge 2/{filename}'))