In [5]:
pip install openmeteo-requests requests-cache retry-requests pandas numpy scikit-learn joblib streamlit matplotlib


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
Collecting joblib
  Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Collecting streamlit
  Using cached streamlit-1.50.0-py3-none-any.whl (10.1 MB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
Collecting click<9,>=7.0
  Using cached click-8.1.8-py3-none-any.whl (98 kB)
Collecting toml<2,>=0.10.1
  Using cached toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting protobuf<7,>=3.20
  Using cached protobuf-6.33.3-cp39-abi3-macosx_10_9_universal2.whl (427 kB)
Collecting pyarrow>=7.0
  Using cached pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl (31.2 MB)
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Using cached gitpython-3.1.46-py3-none-any.whl (208 kB)
Collecting altair!=5.4.0

In [8]:
# ===============================
# IMPORTS
# ===============================
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry
import os

# ===============================
# SETUP OPEN-METEO CLIENT
# ===============================
# expire_after=-1 tells requests-cache to never expire entries, so re-running
# this cell reuses the stored archive response instead of downloading it again.
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
# Failed requests are retried up to 5 times with a backoff factor of 0.2.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# ===============================
# FETCH HISTORICAL RAINFALL DATA
# (Shah Alam, Selangor)
# ===============================
url = "https://archive-api.open-meteo.com/v1/archive"

params = {
    "latitude": 3.043092,
    "longitude": 101.441392,
    "start_date": "2020-01-01",
    "end_date": "2024-12-31",
    "hourly": "rain",
}

responses = openmeteo.weather_api(url, params=params)
response = responses[0]  # a single location is requested, so only the first entry is used

# ===============================
# HELPERS
# ===============================
ROLLING_WINDOWS_H = (3, 6, 12)  # trailing window lengths, counted in hourly rows


def add_rolling_rain_sums(frame, windows=ROLLING_WINDOWS_H):
    """Return a copy of ``frame`` with one trailing rainfall sum per window.

    For every ``w`` in ``windows`` a column ``rain_{w}h_sum`` is added, holding
    the sum of ``rain_1h`` over the current row and the ``w - 1`` rows before it.
    Rows containing NaN in any column (in particular the leading rows that do
    not yet have a full window) are dropped and the index is renumbered from 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``rain_1h`` column. It is not modified.
    windows : iterable of int
        Window lengths in rows; columns are appended in this order.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``rain_{w}h_sum`` columns.
    """
    enriched = frame.copy()
    for hours in windows:
        enriched[f"rain_{hours}h_sum"] = enriched["rain_1h"].rolling(hours).sum()
    return enriched.dropna().reset_index(drop=True)


def label_flood_risk(frame, threshold, rain_col="rain_6h_sum"):
    """Return a copy of ``frame`` with a binary ``target`` column appended.

    ``target`` is 1 where ``frame[rain_col] >= threshold`` and 0 elsewhere.
    """
    return frame.assign(target=(frame[rain_col] >= threshold).astype(int))


# ===============================
# BUILD BASE DATAFRAME
# ===============================
hourly = response.Hourly()
rain = hourly.Variables(0).ValuesAsNumpy()  # index 0: "rain" is the only hourly variable requested

# Timestamps are rebuilt from the block's start, end and interval (all in
# seconds) as UTC; inclusive="left" leaves out the end bound itself.
hourly_rain = pd.DataFrame({
    "datetime": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    ),
    "rain_1h": rain,
})

# ===============================
# FEATURE ENGINEERING
# ===============================
rain_features = add_rolling_rain_sums(hourly_rain)

# ===============================
# TARGET VARIABLE (FLOOD RISK PROXY)
# ===============================
# Flood proxy threshold (domain-based assumption)
RAIN_6H_THRESHOLD = 30.0  # mm in 6 hours

# NOTE(review): `target` is computed directly from `rain_6h_sum`, which is also
# exported as a feature below, so a model can reproduce the label from that one
# column. Confirm this is intended, or label a future window instead.
df = label_flood_risk(rain_features, RAIN_6H_THRESHOLD)

# Fail loudly instead of saving an empty dataset and reporting success.
if df.empty:
    raise ValueError(
        "No usable hourly rainfall rows were returned for the requested period"
    )

# ===============================
# FINAL MODEL-READY DATASET
# ===============================
# `datetime` is deliberately left out: only the numeric columns are exported.
MODEL_COLUMNS = [
    "rain_1h",
    "rain_3h_sum",
    "rain_6h_sum",
    "rain_12h_sum",
    "target",
]
final_df = df[MODEL_COLUMNS]

# ===============================
# SAVE DATASET (EXPECTED BY MODELS)
# ===============================
os.makedirs("data", exist_ok=True)
output_path = "data/dataset.csv"
final_df.to_csv(output_path, index=False)

# ===============================
# QUICK VERIFICATION
# ===============================
print("✅ Dataset created successfully")
print("📁 Saved to:", output_path)
print("\nDataset shape:", final_df.shape)
print("\nTarget distribution:")
print(final_df["target"].value_counts())
print("\nSample rows:")
final_df.head()


✅ Dataset created successfully
📁 Saved to: data/dataset.csv

Dataset shape: (43837, 5)

Target distribution:
target
0    43679
1      158
Name: count, dtype: int64

Sample rows:


Unnamed: 0,rain_1h,rain_3h_sum,rain_6h_sum,rain_12h_sum,target
0,0.0,0.2,0.5,0.5,0
1,0.0,0.2,0.4,0.5,0
2,0.0,0.0,0.3,0.5,0
3,0.0,0.0,0.2,0.5,0
4,0.0,0.0,0.2,0.5,0
