Import ONI data

In [55]:
import pandas as pd
import os
import numpy as np

# Load the ONI table from a whitespace-delimited text file.
# The separator is a regex, so it is written as a raw string: "\s" in a plain
# string literal is an invalid escape sequence and triggers a SyntaxWarning on
# recent Python versions.
oni_data = pd.read_csv("oni.txt", sep=r"\s+")
oni_data

Unnamed: 0,YR,MON,TOTAL,ClimAdjust,ANOM
0,1950,1,24.56,26.18,-1.62
1,1950,2,25.07,26.39,-1.32
2,1950,3,25.88,26.95,-1.07
3,1950,4,26.29,27.39,-1.11
4,1950,5,26.19,27.56,-1.37
...,...,...,...,...,...
893,2024,6,27.91,27.73,0.18
894,2024,7,27.34,27.29,0.05
895,2024,8,26.74,26.86,-0.11
896,2024,9,26.47,26.72,-0.25


Import DID/MET data

In [56]:
import pandas as pd
import os
import re
folder_path = "miri"

# Pick up every "rainfall-data<N>.csv" file in the station folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the concatenated row order reproducible between runs and machines.
# NOTE: the sort is lexicographic (e.g. "...10.csv" sorts before "...2.csv"),
# so rows are not guaranteed to be in chronological order.
rainfall_files = sorted(
    file
    for file in os.listdir(folder_path)
    if re.search(r"rainfall-data\d*\.csv", file)
)

# Read all files first and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies the accumulated rows on every iteration.
rainfall_frames = [pd.read_csv(f"{folder_path}/{file}") for file in rainfall_files]

# Fall back to an empty frame when nothing matched (pd.concat rejects an
# empty list), which is what the original loop produced in that case.
precipitation_data = (
    pd.concat(rainfall_frames, ignore_index=True)
    if rainfall_frames
    else pd.DataFrame()
)

In [57]:
# Parse the "YYYYMMDD HHMM" text timestamps, keep only the timestamp and the
# rainfall reading, and shorten the rainfall column name.
parsed_timestamps = pd.to_datetime(precipitation_data["DateTime"], format="%Y%m%d %H%M")
precipitation_data = (
    precipitation_data
    .assign(DateTime=parsed_timestamps)
    .loc[:, ["DateTime", "Rainfall (mm)"]]
    .rename(columns={"Rainfall (mm)": "Rainfall"})
)
precipitation_data

Unnamed: 0,DateTime,Rainfall
0,1997-11-23 00:00:00,1.5
1,1997-11-23 01:00:00,0.0
2,1997-11-23 02:00:00,0.5
3,1997-11-23 03:00:00,0.5
4,1997-11-23 04:00:00,0.0
...,...,...
223028,2023-05-03 20:00:00,0.0
223029,2023-05-03 21:00:00,0.0
223030,2023-05-03 22:00:00,0.0
223031,2023-05-03 23:00:00,0.0


Merge DID and ONI data

In [58]:
# Attach the monthly ONI values to every hourly rainfall row by matching on
# calendar year and month.
#
# The year/month keys are materialised as temporary "YR"/"MON" columns so the
# merge can use plain column names on both sides.
# validate="many_to_one" makes the merge fail loudly if the ONI table ever
# contains the same (YR, MON) twice, which would otherwise silently duplicate
# hourly rainfall rows.
# how="inner" is the pandas default, spelled out here: rainfall rows whose
# month has no ONI entry are dropped.
precipitation_data = (
    precipitation_data
    .assign(
        YR=precipitation_data["DateTime"].dt.year,
        MON=precipitation_data["DateTime"].dt.month,
    )
    .merge(
        oni_data[["YR", "MON", "TOTAL", "ClimAdjust", "ANOM"]],
        on=["YR", "MON"],
        how="inner",
        validate="many_to_one",
    )
    .drop(columns=["YR", "MON"])
    .set_index("DateTime")
)
precipitation_data


Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997-11-23 00:00:00,1.5,29.11,26.63,2.48
1997-11-23 01:00:00,0.0,29.11,26.63,2.48
1997-11-23 02:00:00,0.5,29.11,26.63,2.48
1997-11-23 03:00:00,0.5,29.11,26.63,2.48
1997-11-23 04:00:00,0.0,29.11,26.63,2.48
...,...,...,...,...
2023-05-03 20:00:00,0.0,28.39,27.94,0.46
2023-05-03 21:00:00,0.0,28.39,27.94,0.46
2023-05-03 22:00:00,0.0,28.39,27.94,0.46
2023-05-03 23:00:00,0.0,28.39,27.94,0.46


Import Wunderground data

In [59]:
# Load the Wunderground feature table and normalise it in one pass:
#   1. rename "Time" to "DateTime" and parse it (the text carries a UTC offset),
#   2. strip the timezone so the index is timezone-naive wall-clock time,
#   3. index by timestamp,
#   4. treat missing wind speed as 0, then forward-fill every remaining gap,
#   5. rename the multi-word columns to identifier-style names.
feature_data = (
    pd.read_csv(f"{folder_path}/rainfall-feature-wunderground.csv")
    .rename(columns={"Time": "DateTime"})
    .assign(
        DateTime=lambda frame: pd.to_datetime(
            frame["DateTime"], format="%Y-%m-%d %H:%M:%S%z"
        ).dt.tz_localize(None)
    )
    .set_index("DateTime")
    # Disabled in the original notebook:
    # .drop(columns={"Wind Gust", "Precip."})
    .fillna({"Wind Speed": 0})
    .ffill()
    .rename(columns={"Dew Point": "DewPoint", "Wind Speed": "WindSpeed", "Wind Direction": "WindDir"})
)
feature_data

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,WindDir
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002-08-01 00:00:00,75.0,73.0,94.0,6.0,1.0,29.72,40.0
2002-08-01 01:00:00,75.0,75.0,100.0,6.0,0.0,29.72,40.0
2002-08-01 02:00:00,75.0,73.0,94.0,6.0,3.0,29.69,40.0
2002-08-01 03:00:00,75.0,73.0,94.0,6.0,0.0,29.69,40.0
2002-08-01 04:00:00,73.0,73.0,100.0,6.0,9.0,29.66,40.0
...,...,...,...,...,...,...,...
2024-12-31 19:00:00,84.0,77.0,79.0,6.0,6.0,29.67,270.0
2024-12-31 20:00:00,84.0,77.0,79.0,6.0,6.0,29.70,270.0
2024-12-31 21:00:00,82.0,77.0,84.0,6.0,6.0,29.73,260.0
2024-12-31 22:00:00,84.0,77.0,79.0,6.0,6.0,29.76,270.0


Add Topographic data

In [60]:
# Static site metadata keyed by station folder name: latitude, longitude and
# elevation ("lat" / "lon" / "elev").
topo_loc = {
    "lawas":
    {
        "lat": 4.847301, "lon": 115.406703, "elev": 6.5,
    },
    "mulu":
    {
        "lat": 4.049213, "lon": 114.810996, "elev": 28.7,
    },
    "kuching":
    {
        "lat": 1.487123, "lon": 110.341599, "elev": 22.3,
    },
    "kota-kinabalu":
    {
        "lat": 5.923283, "lon": 116.051239, "elev": 5.4,
    },
    "kuantan":
    {
        "lat": 3.780726, "lon": 103.215062, "elev": 16,
    },
    "miri":
    {
        "lat": 4.322991, "lon": 113.987416, "elev": 19,
    }

}

# Add topographic data to the DataFrame.
# The resampling step later in the notebook aggregates Latitude, Longitude and
# Elevation, so a station that is missing from topo_loc must be reported here
# rather than being skipped silently and failing later with an opaque
# missing-column error.
location = folder_path.lower()
if location not in topo_loc:
    raise KeyError(
        f"No topographic data for location {location!r}; "
        f"known locations: {sorted(topo_loc)}"
    )

site = topo_loc[location]
feature_data["Latitude"] = site["lat"]
feature_data["Longitude"] = site["lon"]
feature_data["Elevation"] = site["elev"]
print(feature_data.head())

                     Temperature  DewPoint  Humidity  Visibility  WindSpeed  \
DateTime                                                                      
2002-08-01 00:00:00         75.0      73.0      94.0         6.0        1.0   
2002-08-01 01:00:00         75.0      75.0     100.0         6.0        0.0   
2002-08-01 02:00:00         75.0      73.0      94.0         6.0        3.0   
2002-08-01 03:00:00         75.0      73.0      94.0         6.0        0.0   
2002-08-01 04:00:00         73.0      73.0     100.0         6.0        9.0   

                     Pressure  WindDir  Latitude   Longitude  Elevation  
DateTime                                                                 
2002-08-01 00:00:00     29.72     40.0  4.322991  113.987416         19  
2002-08-01 01:00:00     29.72     40.0  4.322991  113.987416         19  
2002-08-01 02:00:00     29.69     40.0  4.322991  113.987416         19  
2002-08-01 03:00:00     29.69     40.0  4.322991  113.987416         19  
20

Combine all data

In [61]:
# Index-on-index left join: every rainfall/ONI row is kept, and weather
# features are NaN wherever the feature table has no matching timestamp.
# (The variable keeps the name "lawas_rainfall" regardless of which station
# folder_path points at.)
lawas_rainfall = precipitation_data.join(feature_data, how="left")
# Disabled in the original notebook:
# lawas_rainfall.dropna(inplace=True)
lawas_rainfall

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,WindDir,Latitude,Longitude,Elevation
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1997-11-23 00:00:00,1.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 01:00:00,0.0,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 02:00:00,0.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 03:00:00,0.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 04:00:00,0.0,29.11,26.63,2.48,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-03 20:00:00,0.0,28.39,27.94,0.46,86.0,79.0,79.0,6.0,5.0,29.70,280.0,4.322991,113.987416,19.0
2023-05-03 21:00:00,0.0,28.39,27.94,0.46,86.0,79.0,79.0,6.0,5.0,29.73,300.0,4.322991,113.987416,19.0
2023-05-03 22:00:00,0.0,28.39,27.94,0.46,84.0,79.0,84.0,6.0,2.0,29.76,300.0,4.322991,113.987416,19.0
2023-05-03 23:00:00,0.0,28.39,27.94,0.46,82.0,79.0,89.0,6.0,2.0,29.76,300.0,4.322991,113.987416,19.0


Check for duplicate data

In [62]:
# Collapse repeated timestamps so the index is unique, and make sure the index
# ends up sorted in either branch.
duplicates = lawas_rainfall.index[lawas_rainfall.index.duplicated()]
if not duplicates.empty:
    print("Duplicate timestamps found:", duplicates)
    numeric_cols = lawas_rainfall.select_dtypes(include=['number']).columns
    non_numeric_cols = lawas_rainfall.select_dtypes(exclude=['number']).columns

    # Handle duplicate index by grouping
    # For numeric columns, take the mean
    # For non-numeric columns, take the first occurrence (or other appropriate method)
    # groupby also returns the rows sorted by timestamp.
    lawas_rainfall = (
        lawas_rainfall.groupby(lawas_rainfall.index)
        .agg({**{col: 'mean' for col in numeric_cols},
            **{col: 'first' for col in non_numeric_cols}})
    )
else:
    # Without duplicates no groupby runs, so sort explicitly: the later
    # reindex(..., method='ffill') requires a monotonic index, and the source
    # files are not guaranteed to be concatenated in chronological order.
    lawas_rainfall = lawas_rainfall.sort_index()

lawas_rainfall

Duplicate timestamps found: DatetimeIndex(['2006-08-02 17:00:00', '2006-08-02 18:00:00',
               '2006-08-02 19:00:00'],
              dtype='datetime64[ns]', name='DateTime', freq=None)


Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,WindDir,Latitude,Longitude,Elevation
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1997-11-23 00:00:00,1.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 01:00:00,0.0,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 02:00:00,0.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 03:00:00,0.5,29.11,26.63,2.48,,,,,,,,,,
1997-11-23 04:00:00,0.0,29.11,26.63,2.48,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-03 20:00:00,0.0,28.39,27.94,0.46,86.0,79.0,79.0,6.0,5.0,29.70,280.0,4.322991,113.987416,19.0
2023-05-03 21:00:00,0.0,28.39,27.94,0.46,86.0,79.0,79.0,6.0,5.0,29.73,300.0,4.322991,113.987416,19.0
2023-05-03 22:00:00,0.0,28.39,27.94,0.46,84.0,79.0,84.0,6.0,2.0,29.76,300.0,4.322991,113.987416,19.0
2023-05-03 23:00:00,0.0,28.39,27.94,0.46,82.0,79.0,89.0,6.0,2.0,29.76,300.0,4.322991,113.987416,19.0


Find the data losses (Training Input)

In [63]:
# Build the complete hourly grid between the first and last observed
# timestamps, then list the grid points that have no row in the data.
first_stamp = lawas_rainfall.index.min()
last_stamp = lawas_rainfall.index.max()
datetime_range = pd.date_range(start=first_stamp, end=last_stamp, freq='1h')

missing_date = datetime_range.difference(lawas_rainfall.index)
missing_date

DatetimeIndex([], dtype='datetime64[ns]', freq='h')

In [81]:
# Alternative window, disabled in the original notebook:
# start_date_input = "2020-01-01"
# end_date_input = "2022-07-17 11:00:00"

# Per the author's note there is a major data loss from Feb 2021 to Dec 2021,
# so the training window starts in Aug 2002 and stops at the dates below.
start_date_input = "2002-08-01"
end_date_input = "2021-02-03"

# Keep only the missing hours that fall inside the training window
# (both bounds inclusive).
in_training_window = (missing_date >= start_date_input) & (missing_date <= end_date_input)
missing_date_input = missing_date[in_training_window]
missing_date_input

DatetimeIndex([], dtype='datetime64[ns]', freq='h')

In [65]:
# Re-grid the data onto a strict hourly index covering the training window.
# Hours with no source row take the values of the most recent earlier row.
# NOTE(review): the forward fill applies to every column, including Rainfall,
# so a rainy reading is repeated across any gap that follows it; confirm
# this is intended before the daily/weekly sums are trusted.
input_hours = pd.date_range(start=start_date_input, end=end_date_input, freq='1h')
lawas_rainfall_input = (
    lawas_rainfall
    .reindex(input_hours, method='ffill')
    .rename_axis("DateTime")
)
lawas_rainfall_input

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,WindDir,Latitude,Longitude,Elevation
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,1.0,29.72,40.0,4.322991,113.987416,19.0
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,75.0,75.0,100.0,6.0,0.0,29.72,40.0,4.322991,113.987416,19.0
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,3.0,29.69,40.0,4.322991,113.987416,19.0
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,0.0,29.69,40.0,4.322991,113.987416,19.0
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,73.0,73.0,100.0,6.0,9.0,29.66,40.0,4.322991,113.987416,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-02 20:00:00,0.0,25.76,26.76,-1.00,79.0,77.0,94.0,6.0,9.0,29.76,30.0,4.322991,113.987416,19.0
2021-02-02 21:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,8.0,29.76,40.0,4.322991,113.987416,19.0
2021-02-02 22:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,7.0,29.79,40.0,4.322991,113.987416,19.0
2021-02-02 23:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,8.0,29.79,30.0,4.322991,113.987416,19.0


In [66]:
# Sanity check: print the first and last rows of the hourly training frame.
head_rows = lawas_rainfall_input.head()
tail_rows = lawas_rainfall_input.tail()
print(f"Head Data:\n{head_rows}\n")
print(f"Tail Data:\n{tail_rows}")

Head Data:
                     Rainfall  TOTAL  ClimAdjust  ANOM  Temperature  DewPoint  \
DateTime                                                                        
2002-08-01 00:00:00       0.0  27.72       26.91  0.81         75.0      73.0   
2002-08-01 01:00:00       0.0  27.72       26.91  0.81         75.0      75.0   
2002-08-01 02:00:00       0.0  27.72       26.91  0.81         75.0      73.0   
2002-08-01 03:00:00       0.0  27.72       26.91  0.81         75.0      73.0   
2002-08-01 04:00:00       0.0  27.72       26.91  0.81         73.0      73.0   

                     Humidity  Visibility  WindSpeed  Pressure  WindDir  \
DateTime                                                                  
2002-08-01 00:00:00      94.0         6.0        1.0     29.72     40.0   
2002-08-01 01:00:00     100.0         6.0        0.0     29.72     40.0   
2002-08-01 02:00:00      94.0         6.0        3.0     29.69     40.0   
2002-08-01 03:00:00      94.0         6.0     

In [67]:
# List the column labels of the hourly training frame; the resampling step
# below refers to these names.
lawas_rainfall_input.columns

Index(['Rainfall', 'TOTAL', 'ClimAdjust', 'ANOM', 'Temperature', 'DewPoint',
       'Humidity', 'Visibility', 'WindSpeed', 'Pressure', 'WindDir',
       'Latitude', 'Longitude', 'Elevation'],
      dtype='object')

In [68]:
def resultant_wind_direction(x):
    """Return the circular mean of wind directions, in degrees within [0, 360).

    Each direction is treated as a unit vector; the sines and cosines are
    averaged and the angle of the averaged vector is returned. This avoids
    the wrap-around error of an arithmetic mean (350 and 10 average to about
    0, not 180).

    Parameters
    ----------
    x : array-like of float
        Wind directions in degrees.

    Returns
    -------
    float
        Mean direction in degrees. The mean is unweighted (wind speed is not
        taken into account). When the directions cancel out, the averaged
        vector is close to zero length and the returned angle is not
        meaningful.
    """
    radians = np.deg2rad(x)

    mean_sin = np.mean(np.sin(radians))
    mean_cos = np.mean(np.cos(radians))

    resultant_radians = np.arctan2(mean_sin, mean_cos)
    resultant_degrees = np.rad2deg(resultant_radians) % 360

    # A tiny negative angle (floating-point noise around north) is rounded by
    # the modulo to exactly 360.0; wrap it so the result stays inside [0, 360).
    if resultant_degrees >= 360.0:
        resultant_degrees = resultant_degrees - 360.0

    return resultant_degrees

Downsample data - daily, weekly

In [69]:
# Per-column aggregation shared by the daily and weekly downsampling, kept in
# one place so the two resolutions cannot drift apart.
# Rainfall is accumulated, the other measurements are averaged, and wind
# direction uses the circular mean because a plain average of angles is wrong
# across the 0/360 boundary. The key order sets the output column order.
resample_aggregations = {
    'Rainfall': 'sum',
    'TOTAL': 'mean',
    'ClimAdjust': 'mean',
    'ANOM': 'mean',
    'Temperature': 'mean',
    'DewPoint': 'mean',
    'Humidity': 'mean',
    'Visibility': 'mean',
    'WindSpeed': 'mean',
    'Pressure': 'mean',
    'Latitude': 'mean',
    'Longitude': 'mean',
    'Elevation': 'mean',
    'WindDir': resultant_wind_direction,
    # 'Condition': lambda x: x.mode()[0]
}

# 'D' gives one row per calendar day; 'W' gives one row per week.
lawas_rainfall_input_daily = lawas_rainfall_input.resample('D').agg(resample_aggregations)

lawas_rainfall_input_weekly = lawas_rainfall_input.resample('W').agg(resample_aggregations)

In [70]:
# Display the daily aggregate (one row per calendar day).
lawas_rainfall_input_daily

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,WindDir
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-08-01,0.0,27.72,26.91,0.81,81.500000,74.541667,81.000000,6.000000,3.791667,29.67250,4.322991,113.987416,19.0,9.124025
2002-08-02,29.5,27.72,26.91,0.81,77.791667,75.833333,94.125000,5.416667,3.625000,29.69000,4.322991,113.987416,19.0,67.441467
2002-08-03,0.5,27.72,26.91,0.81,80.666667,75.083333,83.958333,6.000000,5.791667,29.71750,4.322991,113.987416,19.0,213.818221
2002-08-04,0.5,27.72,26.91,0.81,82.083333,74.541667,79.500000,6.000000,6.500000,29.71500,4.322991,113.987416,19.0,168.345097
2002-08-05,0.0,27.72,26.91,0.81,83.291667,75.666667,79.708333,6.000000,6.458333,29.72625,4.322991,113.987416,19.0,336.902931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-30,0.0,25.56,26.55,-0.99,81.583333,77.583333,88.416667,6.000000,6.666667,29.71875,4.322991,113.987416,19.0,49.423316
2021-01-31,0.0,25.56,26.55,-0.99,81.000000,77.250000,88.875000,6.000000,4.916667,29.72750,4.322991,113.987416,19.0,63.839766
2021-02-01,9.5,25.76,26.76,-1.00,80.125000,77.416667,91.833333,5.750000,5.541667,29.76750,4.322991,113.987416,19.0,51.879803
2021-02-02,4.5,25.76,26.76,-1.00,79.458333,76.250000,90.541667,6.000000,7.875000,29.76750,4.322991,113.987416,19.0,63.753665


In [71]:
# Display the weekly aggregate (one row per 'W' resampling bin).
lawas_rainfall_input_weekly

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,WindDir
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-08-04,30.5,27.720000,26.910000,0.810000,80.510417,75.000000,84.645833,5.854167,4.927083,29.698750,4.322991,113.987416,19.0,84.265860
2002-08-11,4.0,27.720000,26.910000,0.810000,81.875000,74.529762,79.761905,6.000000,5.535714,29.754286,4.322991,113.987416,19.0,164.799802
2002-08-18,14.5,27.720000,26.910000,0.810000,82.273810,72.553571,74.648810,3.702381,7.625000,29.696429,4.322991,113.987416,19.0,208.972324
2002-08-25,58.0,27.720000,26.910000,0.810000,82.303030,75.309091,81.103030,3.848485,3.993939,29.748727,4.322991,113.987416,19.0,42.779401
2002-09-01,71.0,27.732857,26.894286,0.838571,80.688623,75.502994,85.365269,5.406766,3.874251,29.753593,4.322991,113.987416,19.0,131.859657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-10,3.0,25.560000,26.550000,-0.990000,79.821429,76.714286,90.702381,5.851190,4.696429,29.671905,4.322991,113.987416,19.0,85.081288
2021-01-17,0.0,25.560000,26.550000,-0.990000,78.722581,76.612903,93.645161,5.729032,5.122581,29.706000,4.322991,113.987416,19.0,80.170371
2021-01-24,0.0,25.560000,26.550000,-0.990000,78.273810,75.714286,92.511905,5.714286,3.500000,29.732143,4.322991,113.987416,19.0,52.843410
2021-01-31,0.0,25.560000,26.550000,-0.990000,80.401198,76.988024,89.856287,5.922156,5.299401,29.718144,4.322991,113.987416,19.0,43.907331


In [72]:
# Display the hourly frame again; resample() returned new frames and left
# this one unchanged.
lawas_rainfall_input

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,WindDir,Latitude,Longitude,Elevation
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,1.0,29.72,40.0,4.322991,113.987416,19.0
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,75.0,75.0,100.0,6.0,0.0,29.72,40.0,4.322991,113.987416,19.0
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,3.0,29.69,40.0,4.322991,113.987416,19.0
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,75.0,73.0,94.0,6.0,0.0,29.69,40.0,4.322991,113.987416,19.0
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,73.0,73.0,100.0,6.0,9.0,29.66,40.0,4.322991,113.987416,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-02 20:00:00,0.0,25.76,26.76,-1.00,79.0,77.0,94.0,6.0,9.0,29.76,30.0,4.322991,113.987416,19.0
2021-02-02 21:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,8.0,29.76,40.0,4.322991,113.987416,19.0
2021-02-02 22:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,7.0,29.79,40.0,4.322991,113.987416,19.0
2021-02-02 23:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,8.0,29.79,30.0,4.322991,113.987416,19.0


Data for validation

In [73]:
# start_date_valid = "2020-01-01"
# end_date_valid = "2022-07-16"
# # There are data losses from Feb 2021 to Dec 2021 (a major gap). The data will be used from Jan 2002 until Apr 2021

# missing_date_valid = missing_date[(missing_date >= start_date_valid) & (missing_date <= end_date_valid)]

In [74]:
# lawas_rainfall_valid = lawas_rainfall.reindex(
#                     pd.date_range(start=start_date_valid, end=end_date_valid,
#                     freq='1h'),
#                     method='ffill'
#                     )
# lawas_rainfall_valid.index.name = "DateTime"
# lawas_rainfall_valid

In [75]:
# # Check on start and end data
# print(f"Head Data:\n{lawas_rainfall_valid.head()}\n")
# print(f"Tail Data:\n{lawas_rainfall_valid.tail()}")

In [76]:
# lawas_rainfall_valid.columns

Downsample for validation data

In [77]:
# lawas_rainfall_valid_daily = lawas_rainfall_valid.resample('D').agg({
#     'Rainfall': 'sum',
#     'TOTAL': 'mean',
#     'ClimAdjust': 'mean',
#     'ANOM': 'mean',
#     'Temperature': 'mean',
#     'DewPoint': 'mean',
#     'Humidity': 'mean',
#     'Visibility': 'mean',
#     'WindSpeed': 'mean',
#     'Pressure': 'mean',
#     'Latitude': 'mean',
#     'Longitude': 'mean',
#     'Elevation': 'mean',
#     'WindDir': resultant_wind_direction,
#     # 'Condition': lambda x: x.mode()[0]
# })

# lawas_rainfall_valid_weekly = lawas_rainfall_valid.resample('W').agg({
#     'Rainfall': 'sum',
#     'TOTAL': 'mean',
#     'ClimAdjust': 'mean',
#     'ANOM': 'mean',
#     'Temperature': 'mean',
#     'DewPoint': 'mean',
#     'Humidity': 'mean',
#     'Visibility': 'mean',
#     'WindSpeed': 'mean',
#     'Pressure': 'mean',
#     'Latitude': 'mean',
#     'Longitude': 'mean',
#     'Elevation': 'mean',
#     'WindDir': resultant_wind_direction,
#     # 'Condition': lambda x: x.mode()[0]
# })

In [78]:
# lawas_rainfall_valid_daily

In [79]:
# lawas_rainfall_valid_weekly

Export as .csv file

In [80]:
# input data: write the hourly, daily and weekly training frames into the
# station folder, keeping the DateTime index as the first CSV column.
export_targets = [
    (lawas_rainfall_input, f"{folder_path}/{folder_path}-rainfall.csv"),
    (lawas_rainfall_input_daily, f"{folder_path}/{folder_path}-rainfall-daily.csv"),
    (lawas_rainfall_input_weekly, f"{folder_path}/{folder_path}-rainfall-weekly.csv"),
]
for frame, destination in export_targets:
    frame.to_csv(destination, index=True)

# # valid data
# lawas_rainfall_valid.to_csv(f"valid/{folder_path}-rainfall-valid.csv",index=True)
# lawas_rainfall_valid_daily.to_csv(f"valid/{folder_path}-rainfall-valid-daily.csv",index=True)
# lawas_rainfall_valid_weekly.to_csv(f"valid/{folder_path}-rainfall-valid-weekly.csv",index=True)