In [2]:
pip install openmeteo-requests requests-cache retry-requests pandas numpy scikit-learn joblib streamlit matplotlib


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# ===============================
# IMPORTS
# ===============================
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry
import os

# ===============================
# SETUP OPEN-METEO CLIENT
# ===============================
# Cached HTTP session (cache name ".cache"); expire_after=-1 is passed so
# stored responses are kept indefinitely rather than refreshed.
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
# Wrap the cached session so requests are retried (5 retries, 0.2 backoff factor).
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# ===============================
# FETCH HISTORICAL RAINFALL DATA
# (Shah Alam, Selangor)
# ===============================
# Query settings collected in one place so they are easy to find and tune.
LATITUDE = 3.043092
LONGITUDE = 101.441392
START_DATE = "2020-01-01"
END_DATE = "2025-12-31"

url = "https://archive-api.open-meteo.com/v1/archive"

# Only one hourly variable ("rain") is requested, so it is the variable at
# index 0 of the hourly block in the response.
params = dict(
    latitude=LATITUDE,
    longitude=LONGITUDE,
    start_date=START_DATE,
    end_date=END_DATE,
    hourly="rain",
)

# A single coordinate pair is queried; the first element of the returned
# sequence is used as the response for that location.
responses = openmeteo.weather_api(url, params=params)
response = responses[0]

# ===============================
# BUILD BASE DATAFRAME
# ===============================
hourly = response.Hourly()
# Values of the hourly variable at index 0, as a NumPy array.
rain = hourly.Variables(0).ValuesAsNumpy()

# One timestamp per step, rebuilt from the response's start/end epoch seconds
# and its step size. The end bound is excluded (inclusive="left").
hourly_index = pd.date_range(
    start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
    end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
    freq=pd.Timedelta(seconds=hourly.Interval()),
    inclusive="left",
)

df = pd.DataFrame({"datetime": hourly_index, "rain_1h": rain})

# Calendar date (YYYY-MM-DD) of each timestamp.
# NOTE(review): the timestamps are constructed as UTC, so this is the UTC
# date rather than the local date at the queried location - confirm that is
# what downstream consumers expect.
df["date"] = df["datetime"].dt.date

# ===============================
# FEATURE ENGINEERING
# ===============================
# Trailing rainfall totals over 3, 6 and 12 rows (one row per hourly step),
# stored as rain_3h_sum, rain_6h_sum and rain_12h_sum.
for window_hours in (3, 6, 12):
    df[f"rain_{window_hours}h_sum"] = df["rain_1h"].rolling(window_hours).sum()

# The first rows of each rolling window are NaN until the window is full.
# dropna() removes every row holding a NaN in any column, then the index is
# renumbered from 0.
df = df.dropna().reset_index(drop=True)

# ===============================
# TARGET VARIABLE (FLOOD RISK PROXY)
# ===============================
RAIN_6H_THRESHOLD = 30.0  # mm in 6 hours

# 1 when the 6-hour rainfall total reaches the threshold, otherwise 0.
# NOTE(review): the label is computed directly from rain_6h_sum, which is
# also exported as a feature column - confirm this is intended for the
# downstream models.
exceeds_threshold = df["rain_6h_sum"] >= RAIN_6H_THRESHOLD
df["target"] = exceeds_threshold.astype(int)

# ===============================
# FINAL MODEL-READY DATASET
# (Include Date column)
# ===============================
# Columns written to the dataset, in output order. The full "datetime"
# column is not part of the selection; only the derived "date" is kept.
MODEL_COLUMNS = [
    "date",
    "rain_1h",
    "rain_3h_sum",
    "rain_6h_sum",
    "rain_12h_sum",
    "target",
]
final_df = df[MODEL_COLUMNS]

# ===============================
# SAVE DATASET (EXPECTED BY MODELS)
# ===============================
output_dir = "data"
output_path = "data/dataset.csv"

# Create the target directory if it does not exist yet, then write the
# dataset as CSV without the DataFrame index column.
os.makedirs(output_dir, exist_ok=True)
final_df.to_csv(path_or_buf=output_path, index=False)

# ===============================
# QUICK VERIFICATION
# ===============================
# Status lines: the emoji were previously stored as mis-decoded bytes and
# printed as garbage characters; they are restored here.
print("✅ Dataset created successfully")
print("📁 Saved to:", output_path)

# Size of the exported frame as (rows, columns).
print("\nDataset shape:", final_df.shape)

# Class balance of the binary target.
print("\nTarget distribution:")
print(final_df["target"].value_counts())

# First few rows as a visual sanity check.
print("\nSample rows:")
print(final_df.head())


✅ Dataset created successfully
📁 Saved to: data/dataset.csv

Dataset shape: (52597, 6)

Target distribution:
target
0    52427
1      170
Name: count, dtype: int64

Sample rows:
         date  rain_1h  rain_3h_sum  rain_6h_sum  rain_12h_sum  target
0  2020-01-01      0.0          0.2          0.5           0.5       0
1  2020-01-01      0.0          0.2          0.4           0.5       0
2  2020-01-01      0.0          0.0          0.3           0.5       0
3  2020-01-01      0.0          0.0          0.2           0.5       0
4  2020-01-01      0.0          0.0          0.2           0.5       0
