Import ONI data

In [148]:
import pandas as pd
import os

# Load the ONI table from a whitespace-delimited text file.
# The separator is a raw string: in a normal string literal "\s" is an invalid
# escape sequence, which newer Python versions report as a SyntaxWarning.
oni_data = pd.read_csv("oni.txt", sep=r"\s+")
oni_data

Unnamed: 0,YR,MON,TOTAL,ClimAdjust,ANOM
0,1950,1,24.56,26.18,-1.62
1,1950,2,25.07,26.39,-1.32
2,1950,3,25.88,26.95,-1.07
3,1950,4,26.29,27.39,-1.11
4,1950,5,26.19,27.56,-1.37
...,...,...,...,...,...
893,2024,6,27.91,27.73,0.18
894,2024,7,27.34,27.29,0.05
895,2024,8,26.74,26.86,-0.11
896,2024,9,26.47,26.72,-0.25


Import DID/MET data

In [149]:
import pandas as pd
import os
import re
# Station folder; also reused later as the lookup key for topographic data
# and as the prefix of the exported CSV files.
folder_path = "lawas"

# Collect the matching rainfall exports first. Sorting makes the row order
# reproducible, because os.listdir() returns entries in arbitrary order.
rainfall_files = sorted(
    file for file in os.listdir(folder_path)
    if re.search(r"rainfall-data\d+\.csv", file)
)

# Concatenate once instead of growing the frame inside the loop, which would
# re-copy all previously loaded rows on every iteration.
# An empty DataFrame is kept as the result when no file matches.
precipitation_data = (
    pd.concat(
        [pd.read_csv(f"{folder_path}/{file}") for file in rainfall_files],
        ignore_index=True,
    )
    if rainfall_files
    else pd.DataFrame()
)

In [150]:
# Parse the timestamp text, keep only the timestamp and rainfall columns,
# and give the rainfall column a unit-free name.
precipitation_data = (
    precipitation_data
    .assign(
        DateTime=lambda frame: pd.to_datetime(frame["DateTime"], format="%Y%m%d %H%M")
    )
    .loc[:, ["DateTime", "Rainfall (mm)"]]
    .rename(columns={"Rainfall (mm)": "Rainfall"})
)
precipitation_data

Unnamed: 0,DateTime,Rainfall
0,1998-07-13 12:00:00,1.0
1,1998-07-13 13:00:00,2.0
2,1998-07-13 14:00:00,0.0
3,1998-07-13 15:00:00,0.0
4,1998-07-13 16:00:00,0.0
...,...,...
200096,2022-04-07 20:00:00,1.0
200097,2022-04-07 21:00:00,0.0
200098,2022-04-07 22:00:00,0.0
200099,2022-04-07 23:00:00,0.0


Merge DID and ONI data

In [151]:
# ONI is one row per (year, month); attach it to every hourly rainfall row
# whose timestamp falls in that month.
oni_columns = oni_data[["YR", "MON", "TOTAL", "ClimAdjust", "ANOM"]]
obs_year = precipitation_data["DateTime"].dt.year
obs_month = precipitation_data["DateTime"].dt.month

precipitation_data = (
    precipitation_data
    # Default (inner) merge: rainfall rows without a matching ONI month are dropped.
    .merge(oni_columns, left_on=[obs_year, obs_month], right_on=["YR", "MON"])
    # The join keys are no longer needed once the ONI values are attached.
    .drop(columns=["YR", "MON"])
    .set_index("DateTime")
)
precipitation_data


Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998-07-13 12:00:00,1.0,26.32,27.18,-0.86
1998-07-13 13:00:00,2.0,26.32,27.18,-0.86
1998-07-13 14:00:00,0.0,26.32,27.18,-0.86
1998-07-13 15:00:00,0.0,26.32,27.18,-0.86
1998-07-13 16:00:00,0.0,26.32,27.18,-0.86
...,...,...,...,...
2022-04-07 20:00:00,1.0,26.72,27.83,-1.11
2022-04-07 21:00:00,0.0,26.72,27.83,-1.11
2022-04-07 22:00:00,0.0,26.72,27.83,-1.11
2022-04-07 23:00:00,0.0,26.72,27.83,-1.11


Import Wunderground data

In [152]:
# Load the Wunderground feature export and normalise it in a single chain.
feature_data = (
    pd.read_csv(f"{folder_path}/rainfall-feature-wunderground.csv")
    .rename(columns={"Time": "DateTime"})
    .assign(
        # Parse the offset-aware timestamps, then strip the timezone so the
        # index is timezone-naive like the rainfall index it is joined to.
        DateTime=lambda frame: pd.to_datetime(
            frame["DateTime"], format="%Y-%m-%d %H:%M:%S%z"
        ).dt.tz_localize(None)
    )
    .set_index("DateTime")
    .drop(columns=["Wind Gust", "Precip."])
    # Missing wind speed is replaced by 0 first; every other gap is then
    # filled with the previous row's value.
    .fillna({"Wind Speed": 0})
    .ffill()
    .rename(columns={"Dew Point": "DewPoint", "Wind Speed": "WindSpeed"})
)
feature_data

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2002-08-01 00:00:00,79.0,75.0,89.0,6.0,3.0,29.69,E,Mostly Cloudy
2002-08-01 01:00:00,79.0,73.0,83.0,6.0,2.0,29.69,E,Mostly Cloudy
2002-08-01 02:00:00,77.0,75.0,94.0,6.0,2.0,29.66,W,Mostly Cloudy
2002-08-01 03:00:00,77.0,73.0,89.0,6.0,0.0,29.66,CALM,Mostly Cloudy
2002-08-01 04:00:00,79.0,73.0,83.0,6.0,1.0,29.63,NE,Mostly Cloudy
...,...,...,...,...,...,...,...,...
2021-02-03 19:00:00,79.0,75.0,89.0,4.0,10.0,29.69,WNW,Light Rain
2021-02-03 20:00:00,77.0,75.0,94.0,5.0,7.0,29.72,WNW,Light Rain
2021-02-03 21:00:00,79.0,75.0,89.0,6.0,6.0,29.72,WNW,Light Rain
2021-02-03 22:00:00,79.0,75.0,89.0,6.0,6.0,29.75,W,Partly Cloudy


Add Topographic data

In [153]:
# Per-station coordinates and elevation, keyed by lower-case folder name.
# NOTE(review): units are not stated here; presumably decimal degrees and metres - confirm.
topo_loc = {
    "lawas": {"lat": 4.847301, "lon": 115.406703, "elev": 6.5},
    "mulu": {"lat": 4.049213, "lon": 114.810996, "elev": 28.7},
    "kuching": {"lat": 1.487123, "lon": 110.341599, "elev": 22.3},
    "kota-kinabalu": {"lat": 5.923283, "lon": 116.051239, "elev": 5.4},
    "kuantan": {"lat": 3.780726, "lon": 103.215062, "elev": 16},
}

# Broadcast the station's constants onto every feature row.
# Unknown stations are skipped, leaving the frame without these columns.
location = folder_path.lower()
site = topo_loc.get(location)
if site is not None:
    for column, key in (("Latitude", "lat"), ("Longitude", "lon"), ("Elevation", "elev")):
        feature_data[column] = site[key]
print(feature_data.head())

                     Temperature  DewPoint  Humidity  Visibility  WindSpeed  \
DateTime                                                                      
2002-08-01 00:00:00         79.0      75.0      89.0         6.0        3.0   
2002-08-01 01:00:00         79.0      73.0      83.0         6.0        2.0   
2002-08-01 02:00:00         77.0      75.0      94.0         6.0        2.0   
2002-08-01 03:00:00         77.0      73.0      89.0         6.0        0.0   
2002-08-01 04:00:00         79.0      73.0      83.0         6.0        1.0   

                     Pressure  Wind      Condition  Latitude   Longitude  \
DateTime                                                                   
2002-08-01 00:00:00     29.69     E  Mostly Cloudy  4.847301  115.406703   
2002-08-01 01:00:00     29.69     E  Mostly Cloudy  4.847301  115.406703   
2002-08-01 02:00:00     29.66     W  Mostly Cloudy  4.847301  115.406703   
2002-08-01 03:00:00     29.66  CALM  Mostly Cloudy  4.847301  115.

Combine all data

In [154]:
# Left-join the weather features onto the rainfall/ONI rows by timestamp,
# then discard every row that has a missing value in any column.
lawas_rainfall = precipitation_data.join(feature_data, how="left").dropna()
lawas_rainfall

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Wind,Condition,Latitude,Longitude,Elevation
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,79.0,75.0,89.0,6.0,3.0,29.69,E,Mostly Cloudy,4.847301,115.406703,6.5
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,2.0,29.69,E,Mostly Cloudy,4.847301,115.406703,6.5
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,77.0,75.0,94.0,6.0,2.0,29.66,W,Mostly Cloudy,4.847301,115.406703,6.5
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,77.0,73.0,89.0,6.0,0.0,29.66,CALM,Mostly Cloudy,4.847301,115.406703,6.5
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,1.0,29.63,NE,Mostly Cloudy,4.847301,115.406703,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-03 08:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,9.0,29.75,NW,Fair,4.847301,115.406703,6.5
2021-02-03 09:00:00,0.0,25.76,26.76,-1.00,81.0,75.0,84.0,6.0,7.0,29.75,NW,Fair,4.847301,115.406703,6.5
2021-02-03 10:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,9.0,29.75,NNW,Fair,4.847301,115.406703,6.5
2021-02-03 11:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,10.0,29.75,NW,Fair,4.847301,115.406703,6.5


Check for duplicate data

In [155]:
# Flag every repeated timestamp (the first occurrence is not flagged).
is_repeat = lawas_rainfall.index.duplicated(keep="first")
duplicates = lawas_rainfall.index[is_repeat]
if len(duplicates) > 0:
    print("Duplicate timestamps found:", duplicates)

Duplicate timestamps found: DatetimeIndex(['2006-08-02 17:00:00', '2006-08-02 18:00:00',
               '2006-08-02 19:00:00', '2017-01-14 01:00:00',
               '2020-12-03 00:00:00', '2020-12-03 01:00:00',
               '2020-12-03 02:00:00', '2020-12-03 03:00:00',
               '2020-12-03 04:00:00', '2020-12-03 05:00:00',
               '2020-12-03 06:00:00', '2020-12-03 07:00:00',
               '2020-12-03 08:00:00', '2020-12-03 09:00:00',
               '2020-12-03 10:00:00', '2020-12-03 11:00:00',
               '2020-12-03 12:00:00', '2020-12-03 13:00:00',
               '2020-12-03 14:00:00', '2020-12-03 15:00:00',
               '2020-12-03 16:00:00'],
              dtype='datetime64[ns]', name='DateTime', freq=None)


In [156]:
numeric_cols = lawas_rainfall.select_dtypes(include=['number']).columns
non_numeric_cols = lawas_rainfall.select_dtypes(exclude=['number']).columns

# Collapse rows that share a timestamp into one row:
#   - numeric columns are averaged,
#   - non-numeric columns keep their first value.
# Columns come out in the order of this mapping (numeric first).
dedupe_rules = {
    **dict.fromkeys(numeric_cols, 'mean'),
    **dict.fromkeys(non_numeric_cols, 'first'),
}
lawas_rainfall = lawas_rainfall.groupby(level=0).agg(dedupe_rules)
lawas_rainfall

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,79.0,75.0,89.0,6.0,3.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,2.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,77.0,75.0,94.0,6.0,2.0,29.66,4.847301,115.406703,6.5,W,Mostly Cloudy
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,77.0,73.0,89.0,6.0,0.0,29.66,4.847301,115.406703,6.5,CALM,Mostly Cloudy
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,1.0,29.63,4.847301,115.406703,6.5,NE,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-03 08:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 09:00:00,0.0,25.76,26.76,-1.00,81.0,75.0,84.0,6.0,7.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 10:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NNW,Fair
2021-02-03 11:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,10.0,29.75,4.847301,115.406703,6.5,NW,Fair


Find the data losses (Training Input)

In [157]:
# Build the complete hourly grid spanning the observed data and list the
# hours for which no row exists.
observed_index = lawas_rainfall.index
datetime_range = pd.date_range(observed_index.min(), observed_index.max(), freq='1h')

missing_date = datetime_range.difference(observed_index)
missing_date

DatetimeIndex(['2002-08-21 09:00:00', '2002-08-21 10:00:00',
               '2002-08-21 11:00:00', '2002-09-10 08:00:00',
               '2002-09-10 12:00:00', '2002-09-10 13:00:00',
               '2002-09-10 14:00:00', '2002-09-10 15:00:00',
               '2002-09-10 16:00:00', '2002-09-18 14:00:00',
               ...
               '2020-11-08 23:00:00', '2020-11-10 01:00:00',
               '2020-11-16 01:00:00', '2021-01-12 00:00:00',
               '2021-01-12 01:00:00', '2021-01-12 02:00:00',
               '2021-01-12 03:00:00', '2021-01-12 04:00:00',
               '2021-01-12 05:00:00', '2021-01-28 20:00:00'],
              dtype='datetime64[ns]', length=2213, freq=None)

In [158]:
# Alternative window (disabled):
# start_date_input = "2020-01-01"
# end_date_input = "2022-07-17 11:00:00"

# There are major data losses from Feb 2021 to Dec 2021, so the training
# window runs from Aug 2002 up to the bounds set below.
start_date_input = "2002-08-01"
end_date_input = "2021-02-03 12:00:00"

# Keep only the missing hours that fall inside the training window (inclusive).
window_start = pd.Timestamp(start_date_input)
window_end = pd.Timestamp(end_date_input)
inside_window = (missing_date >= window_start) & (missing_date <= window_end)
missing_date_input = missing_date[inside_window]
missing_date_input

DatetimeIndex(['2002-08-21 09:00:00', '2002-08-21 10:00:00',
               '2002-08-21 11:00:00', '2002-09-10 08:00:00',
               '2002-09-10 12:00:00', '2002-09-10 13:00:00',
               '2002-09-10 14:00:00', '2002-09-10 15:00:00',
               '2002-09-10 16:00:00', '2002-09-18 14:00:00',
               ...
               '2020-11-08 23:00:00', '2020-11-10 01:00:00',
               '2020-11-16 01:00:00', '2021-01-12 00:00:00',
               '2021-01-12 01:00:00', '2021-01-12 02:00:00',
               '2021-01-12 03:00:00', '2021-01-12 04:00:00',
               '2021-01-12 05:00:00', '2021-01-28 20:00:00'],
              dtype='datetime64[ns]', length=2213, freq=None)

In [159]:
# Put the data on a gap-free hourly grid over the training window; hours with
# no observation take the values of the most recent earlier row.
# NOTE(review): this also forward-fills Rainfall, which the daily/weekly
# downsampling sums - gaps following a wet hour may inflate those totals; confirm
# this is intended.
hourly_grid = pd.date_range(start=start_date_input, end=end_date_input, freq='1h')
lawas_rainfall_input = (
    lawas_rainfall
    .reindex(hourly_grid, method='ffill')
    .rename_axis("DateTime")
)
lawas_rainfall_input

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,79.0,75.0,89.0,6.0,3.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,2.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,77.0,75.0,94.0,6.0,2.0,29.66,4.847301,115.406703,6.5,W,Mostly Cloudy
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,77.0,73.0,89.0,6.0,0.0,29.66,4.847301,115.406703,6.5,CALM,Mostly Cloudy
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,1.0,29.63,4.847301,115.406703,6.5,NE,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-03 08:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 09:00:00,0.0,25.76,26.76,-1.00,81.0,75.0,84.0,6.0,7.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 10:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NNW,Fair
2021-02-03 11:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,10.0,29.75,4.847301,115.406703,6.5,NW,Fair


In [160]:
# Sanity-check both ends of the training window.
head_preview = lawas_rainfall_input.head()
tail_preview = lawas_rainfall_input.tail()
print(f"Head Data:\n{head_preview}\n")
print(f"Tail Data:\n{tail_preview}")

Head Data:
                     Rainfall  TOTAL  ClimAdjust  ANOM  Temperature  DewPoint  \
DateTime                                                                        
2002-08-01 00:00:00       0.0  27.72       26.91  0.81         79.0      75.0   
2002-08-01 01:00:00       0.0  27.72       26.91  0.81         79.0      73.0   
2002-08-01 02:00:00       0.0  27.72       26.91  0.81         77.0      75.0   
2002-08-01 03:00:00       0.0  27.72       26.91  0.81         77.0      73.0   
2002-08-01 04:00:00       0.0  27.72       26.91  0.81         79.0      73.0   

                     Humidity  Visibility  WindSpeed  Pressure  Latitude  \
DateTime                                                                   
2002-08-01 00:00:00      89.0         6.0        3.0     29.69  4.847301   
2002-08-01 01:00:00      83.0         6.0        2.0     29.69  4.847301   
2002-08-01 02:00:00      94.0         6.0        2.0     29.66  4.847301   
2002-08-01 03:00:00      89.0         6.0

In [161]:
# List the column names; the downsampling step names each of them explicitly.
lawas_rainfall_input.columns

Index(['Rainfall', 'TOTAL', 'ClimAdjust', 'ANOM', 'Temperature', 'DewPoint',
       'Humidity', 'Visibility', 'WindSpeed', 'Pressure', 'Latitude',
       'Longitude', 'Elevation', 'Wind', 'Condition'],
      dtype='object')

Downsample data - daily, weekly

In [162]:
def _most_frequent(values):
    """Return the most frequent value of a Series, or NaN if it has none.

    ``Series.mode()`` returns an empty Series when every value is missing or
    the bin is empty; indexing it with ``[0]`` would then raise. Ties resolve
    to the first entry of ``mode()``, as with ``mode()[0]``.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float("nan")


# One aggregation rule per column, shared by every downsampling frequency so
# the daily and weekly outputs cannot drift apart:
#   - Rainfall is totalled over the period,
#   - the remaining numeric columns are averaged,
#   - categorical columns take their most frequent value.
downsample_agg = {
    'Rainfall': 'sum',
    'TOTAL': 'mean',
    'ClimAdjust': 'mean',
    'ANOM': 'mean',
    'Temperature': 'mean',
    'DewPoint': 'mean',
    'Humidity': 'mean',
    'Visibility': 'mean',
    'WindSpeed': 'mean',
    'Pressure': 'mean',
    'Latitude': 'mean',
    'Longitude': 'mean',
    'Elevation': 'mean',
    'Wind': _most_frequent,
    'Condition': _most_frequent,
}

lawas_rainfall_input_daily = lawas_rainfall_input.resample('D').agg(downsample_agg)

# NOTE(review): the first and last weekly bins may cover fewer than seven days,
# so their Rainfall sums are not directly comparable with full weeks - confirm
# how downstream code treats them.
lawas_rainfall_input_weekly = lawas_rainfall_input.resample('W').agg(downsample_agg)

In [163]:
# Display the daily aggregate produced by resample('D').
lawas_rainfall_input_daily

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-01,0.5,27.72,26.91,0.81,83.500000,73.500000,73.208333,6.000000,3.625000,29.632500,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-02,46.0,27.72,26.91,0.81,82.666667,75.416667,80.375000,5.583333,6.666667,29.652500,4.847301,115.406703,6.5,NE,Mostly Cloudy
2002-08-03,23.0,27.72,26.91,0.81,81.958333,74.250000,78.833333,5.916667,6.041667,29.678750,4.847301,115.406703,6.5,SSE,Mostly Cloudy
2002-08-04,9.5,27.72,26.91,0.81,82.791667,73.000000,73.458333,5.916667,5.750000,29.676250,4.847301,115.406703,6.5,WNW,Mostly Cloudy
2002-08-05,1.5,27.72,26.91,0.81,81.708333,74.125000,79.500000,5.083333,6.083333,29.697500,4.847301,115.406703,6.5,NW,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-30,9.0,25.56,26.55,-0.99,81.041667,74.250000,81.125000,6.000000,6.583333,29.695000,4.847301,115.406703,6.5,NW,Fair
2021-01-31,0.0,25.56,26.55,-0.99,81.750000,74.500000,80.041667,6.000000,5.166667,29.701250,4.847301,115.406703,6.5,NW,Fair
2021-02-01,13.5,25.76,26.76,-1.00,81.333333,75.166667,82.708333,5.875000,7.583333,29.732500,4.847301,115.406703,6.5,NW,Fair
2021-02-02,4.0,25.76,26.76,-1.00,80.458333,74.666667,83.833333,5.583333,6.666667,29.738750,4.847301,115.406703,6.5,NW,Fair


In [164]:
# Display the weekly aggregate produced by resample('W').
lawas_rainfall_input_weekly

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-04,79.0,27.720000,26.910000,0.810000,82.729167,74.041667,76.468750,5.854167,5.520833,29.660000,4.847301,115.406703,6.5,SSE,Mostly Cloudy
2002-08-11,71.0,27.720000,26.910000,0.810000,82.898810,73.226190,73.726190,5.801310,6.255952,29.714821,4.847301,115.406703,6.5,W,Mostly Cloudy
2002-08-18,30.5,27.720000,26.910000,0.810000,82.416667,71.273810,70.779762,4.613810,9.464286,29.648929,4.847301,115.406703,6.5,SW,Mostly Cloudy
2002-08-25,12.0,27.720000,26.910000,0.810000,82.910714,74.166667,76.488095,4.244048,4.940476,29.715536,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-09-01,66.5,27.732857,26.894286,0.838571,81.261905,74.047619,80.154762,5.553571,5.017857,29.718214,4.847301,115.406703,6.5,S,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-10,65.5,25.560000,26.550000,-0.990000,81.196429,75.726190,84.285714,5.838869,3.482143,29.639464,4.847301,115.406703,6.5,VAR,Fair
2021-01-17,143.5,25.560000,26.550000,-0.990000,78.541667,74.458333,88.113095,5.477917,4.541667,29.671250,4.847301,115.406703,6.5,VAR,Fair
2021-01-24,124.0,25.560000,26.550000,-0.990000,78.494048,74.482143,88.434524,5.530774,2.059524,29.693036,4.847301,115.406703,6.5,VAR,Fair
2021-01-31,80.5,25.560000,26.550000,-0.990000,81.470238,74.958333,81.744048,5.886488,5.511905,29.685714,4.847301,115.406703,6.5,NW,Fair


In [165]:
# Display the hourly frame that the daily and weekly aggregates were built from.
lawas_rainfall_input

Unnamed: 0_level_0,Rainfall,TOTAL,ClimAdjust,ANOM,Temperature,DewPoint,Humidity,Visibility,WindSpeed,Pressure,Latitude,Longitude,Elevation,Wind,Condition
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2002-08-01 00:00:00,0.0,27.72,26.91,0.81,79.0,75.0,89.0,6.0,3.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 01:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,2.0,29.69,4.847301,115.406703,6.5,E,Mostly Cloudy
2002-08-01 02:00:00,0.0,27.72,26.91,0.81,77.0,75.0,94.0,6.0,2.0,29.66,4.847301,115.406703,6.5,W,Mostly Cloudy
2002-08-01 03:00:00,0.0,27.72,26.91,0.81,77.0,73.0,89.0,6.0,0.0,29.66,4.847301,115.406703,6.5,CALM,Mostly Cloudy
2002-08-01 04:00:00,0.0,27.72,26.91,0.81,79.0,73.0,83.0,6.0,1.0,29.63,4.847301,115.406703,6.5,NE,Mostly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-03 08:00:00,0.0,25.76,26.76,-1.00,79.0,75.0,89.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 09:00:00,0.0,25.76,26.76,-1.00,81.0,75.0,84.0,6.0,7.0,29.75,4.847301,115.406703,6.5,NW,Fair
2021-02-03 10:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,9.0,29.75,4.847301,115.406703,6.5,NNW,Fair
2021-02-03 11:00:00,0.0,25.76,26.76,-1.00,82.0,75.0,79.0,6.0,10.0,29.75,4.847301,115.406703,6.5,NW,Fair


Data for validation

In [166]:
# start_date_valid = "2022-01-01"
# end_date_valid = "2022-04-07 23:00:00"
# # There are major data losses from Feb 2021 to Dec 2021, so the data will be used from Jan 2002 until Apr 2021

# missing_date_valid = missing_date[(missing_date >= start_date_valid) & (missing_date <= end_date_valid)]

In [167]:
# lawas_rainfall_valid = lawas_rainfall.reindex(
#                     pd.date_range(start=start_date_valid, end=end_date_valid,
#                     freq='1h'),
#                     method='ffill'
#                     )
# lawas_rainfall_valid.index.name = "DateTime"
# lawas_rainfall_valid

In [168]:
# # Check on start and end data
# print(f"Head Data:\n{lawas_rainfall_valid.head()}\n")
# print(f"Tail Data:\n{lawas_rainfall_valid.tail()}")

In [169]:
# lawas_rainfall_valid.columns

Downsample for validation data

In [170]:
# lawas_rainfall_valid_daily = lawas_rainfall_valid.resample('D').agg({
#     'Rainfall': 'sum',
#     'TOTAL': 'mean',
#     'ClimAdjust': 'mean',
#     'ANOM': 'mean',
#     'Temperature': 'mean',
#     'DewPoint': 'mean',
#     'Humidity': 'mean',
#     'Visibility': 'mean',
#     'WindSpeed': 'mean',
#     'Pressure': 'mean',
#     'Latitude': 'mean',
#     'Longitude': 'mean',
#     'Elevation': 'mean',
#     'Wind': lambda x: x.mode()[0],
#     'Condition': lambda x: x.mode()[0]
# })

# lawas_rainfall_valid_weekly = lawas_rainfall_valid.resample('W').agg({
#     'Rainfall': 'sum',
#     'TOTAL': 'mean',
#     'ClimAdjust': 'mean',
#     'ANOM': 'mean',
#     'Temperature': 'mean',
#     'DewPoint': 'mean',
#     'Humidity': 'mean',
#     'Visibility': 'mean',
#     'WindSpeed': 'mean',
#     'Pressure': 'mean',
#     'Latitude': 'mean',
#     'Longitude': 'mean',
#     'Elevation': 'mean',
#     'Wind': lambda x: x.mode()[0],
#     'Condition': lambda x: x.mode()[0]
# })

In [171]:
# lawas_rainfall_valid_daily

In [172]:
# lawas_rainfall_valid_weekly

Export as .csv file

In [173]:
# input data: write the hourly, daily and weekly frames next to the raw
# files, each named after the station folder.
export_targets = (
    (lawas_rainfall_input, ""),
    (lawas_rainfall_input_daily, "-daily"),
    (lawas_rainfall_input_weekly, "-weekly"),
)
for frame, suffix in export_targets:
    frame.to_csv(f"{folder_path}/{folder_path}-rainfall{suffix}.csv", index=True)

# valid data
# lawas_rainfall_valid.to_csv(f"valid/{folder_path}-rainfall-valid.csv",index=True)
# lawas_rainfall_valid_daily.to_csv(f"valid/{folder_path}-rainfall-valid-daily.csv",index=True)
# lawas_rainfall_valid_weekly.to_csv(f"valid/{folder_path}-rainfall-valid-weekly.csv",index=True)