## 1. Downloading rainfall in India

### Downloading rainfall data

We will first download rainfall data for Bihar for 10 years (during the monsoon season of June to September) from the Indian Meteorological Department. 


In [20]:
#!pip install imdlib
#!pip install netCDF4
#!pip install scikit-learn
#!pip install lightgbm
!pip install openmeteo_requests retry_requests


Defaulting to user installation because normal site-packages is not writeable
Collecting openmeteo_requests
  Downloading openmeteo_requests-1.7.4-py3-none-any.whl (7.0 kB)
Collecting retry_requests
  Downloading retry_requests-2.0.0-py3-none-any.whl (15 kB)
Collecting niquests>=3.15.2
  Downloading niquests-3.15.2-py3-none-any.whl (167 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.1/167.1 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
Collecting openmeteo-sdk>=1.22.0
  Downloading openmeteo_sdk-1.23.0-py3-none-any.whl (18 kB)
Collecting wassima<3,>=1.0.1
  Downloading wassima-2.0.2-py3-none-any.whl (145 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.8/145.8 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3-future<3,>=2.13.903
  Downloading urllib3_future-2.14.908-py3-none-any.whl (683 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m683.2/683.2 KB[0m 

In [5]:
import imdlib as imd

# Downloading 8 years of rainfall data for India
start_yr = 2014
end_yr = 2023
variable = 'rain' # other options are ('tmin'/ 'tmax')
file_dir = 'dataFolder'
data = imd.get_data(variable, start_yr, end_yr, fn_format='yearwise', file_dir=file_dir)


Downloading: rain for year 2014
Downloading: rain for year 2015
Downloading: rain for year 2016
Downloading: rain for year 2017
Downloading: rain for year 2018
Downloading: rain for year 2019
Downloading: rain for year 2020
Downloading: rain for year 2021
Downloading: rain for year 2022
Downloading: rain for year 2023
Download Successful !!!


In [18]:
import xarray as xr

ds = data.get_xarray()
type(ds)
lat_min, lat_max = 24.5, 27.5
lon_min, lon_max = 83.0, 88.0
spatial_subset = ds.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, lon_max))

months = [6, 7, 8, 9]
time_subset = spatial_subset.where(spatial_subset['time.month'].isin(months), drop=True)

time_subset.to_netcdf('Bihar_June-Sept.nc')
time_subset

In [9]:
import xarray as xr
ds = xr.open_dataset('Bihar_June-Sept.nc')
# See dataset information
print(ds)

<xarray.Dataset>
Dimensions:  (time: 1220, lat: 13, lon: 21)
Coordinates:
  * time     (time) datetime64[ns] 2014-06-01 2014-06-02 ... 2023-09-30
  * lon      (lon) float64 83.0 83.25 83.5 83.75 84.0 ... 87.25 87.5 87.75 88.0
  * lat      (lat) float64 24.5 24.75 25.0 25.25 25.5 ... 26.75 27.0 27.25 27.5
Data variables:
    rain     (time, lat, lon) float64 ...
Attributes:
    Conventions:  CF-1.7
    title:        IMD gridded data
    source:       https://imdpune.gov.in/
    history:      2025-12-08 18:48:21.455196 Python
    references:   
    comment:      
    crs:          epsg:4326


In [1]:
import netCDF4 as nc
import pandas as pd
import numpy as np

def netcdf_to_csv_netcdf4(input_file, output_file):
    # Open the NetCDF file
    dataset = nc.Dataset(input_file, 'r')
    
    # Print available variables and their shapes to understand the data structure
    print("Available variables:")
    for var_name in dataset.variables.keys():
        var = dataset.variables[var_name]
        print(f"  {var_name}: shape = {var.shape}, dimensions = {var.dimensions}")
    
    # Get the data - adjust variable names based on your actual file
    # Common names: 'lat', 'latitude', 'lon', 'longitude', 'time', 'rainfall', 'precipitation'
    
    try:
        # Try to get coordinate variables
        lat = dataset.variables['lat'][:]
        lon = dataset.variables['lon'][:]
        
        # Try to get rainfall variable (adjust name if needed)
        if 'rainfall' in dataset.variables:
            rainfall = dataset.variables['rainfall'][:]
        elif 'precipitation' in dataset.variables:
            rainfall = dataset.variables['precipitation'][:]
        else:
            # Use the first data variable that's not a coordinate
            data_vars = [v for v in dataset.variables.keys() 
                        if v not in ['lat', 'lon', 'latitude', 'longitude', 'time']]
            rainfall = dataset.variables[data_vars[0]][:]
        
        # Handle different data structures
        if rainfall.ndim == 3:  # (time, lat, lon)
            # Create meshgrid for all combinations
            times = np.arange(rainfall.shape[0])
            lon_grid, lat_grid, time_grid = np.meshgrid(lon, lat, times, indexing='ij')
            
            df = pd.DataFrame({
                'longitude': lon_grid.flatten(),
                'latitude': lat_grid.flatten(),
                'time': time_grid.flatten(),
                'rainfall': rainfall.transpose(2, 1, 0).flatten()  # Reorder to match meshgrid
            })
            
        elif rainfall.ndim == 2:  # (lat, lon)
            lon_grid, lat_grid = np.meshgrid(lon, lat, indexing='ij')
            
            df = pd.DataFrame({
                'longitude': lon_grid.flatten(),
                'latitude': lat_grid.flatten(),
                'rainfall': rainfall.flatten()
            })
            
        else:  # 1D or other structure
            # Simple case: just flatten everything
            min_length = min(len(lat), len(lon), rainfall.size)
            df = pd.DataFrame({
                'latitude': lat.flatten()[:min_length],
                'longitude': lon.flatten()[:min_length],
                'rainfall': rainfall.flatten()[:min_length]
            })
        
        # Save to CSV
        df.to_csv(output_file, index=False)
        print(f"\nSuccessfully converted to {output_file}")
        print(f"Output shape: {df.shape}")
        print(f"\nFirst few rows:\n{df.head()}")
        
    except KeyError as e:
        print(f"\nError: Could not find variable {e}")
        print("Please check the variable names printed above and adjust the code accordingly.")
    
    finally:
        dataset.close()

# Usage
netcdf_to_csv_netcdf4("Bihar_June-Sept.nc", "bihar_rainfall.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Bihar_June-Sept.nc'

## 2. Using a hugging face model to look at rainfall predictions

In [13]:
import sys
import sys
import sklearn
from pathlib import Path

# Use the current working directory instead
ROOT = Path.cwd().parent  # Gets current working directory

# If you need to go up one or more directories:
# ROOT = Path.cwd().parent  # Go up one level
# ROOT = Path.cwd().parent.parent  # Go up two levels

sys.path.insert(0, str(ROOT / "src"))

import pandas as pd
from rainfallprediction.model import load_model, predict
from rainfallprediction.features import prepare_features
from rainfallprediction.evaluate import compute_metrics, compute_metrics_by_month

# Paths
DATA_DIR = ROOT / "data"
INPUT_FILE = DATA_DIR / "processed" / "bihar_spatial_avg.csv"
OUTPUT_DIR = ROOT / "outputs"
OUTPUT_FILE = OUTPUT_DIR / "bihar_predictions.csv"


def main():
    # Ensure output directory exists
    OUTPUT_DIR.mkdir(exist_ok=True)

    # Load model
    model = load_model()

    # Load Bihar data
    df = pd.read_csv(INPUT_FILE, parse_dates=["date"])
    print(f"Bihar rainfall data: {len(df)} days")
    print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
    print()

    # Prepare features
    df_pred = prepare_features(df, rainfall_col="rainfall_mm", date_col="date")

    # Predict
    df_pred["predicted_rainfall"] = predict(model, df_pred)

    # Results
    print("=" * 60)
    print("PREDICTION RESULTS")
    print("=" * 60)
    print(f"Predictions made for: {len(df_pred)} days")
    print()

    # Sample predictions
    print("Sample predictions (first 20 days):")
    print(df_pred[["date", "rainfall_mm", "predicted_rainfall"]].head(20).to_string(index=False))
    print()

    # Metrics
    metrics = compute_metrics(df_pred["rainfall_mm"], df_pred["predicted_rainfall"])

    print("=" * 60)
    print("MODEL PERFORMANCE ON BIHAR DATA")
    print("=" * 60)
    print(f"RMSE: {metrics['rmse']:.2f} mm")
    print(f"MAE:  {metrics['mae']:.2f} mm")
    print(f"R²:   {metrics['r2']:.3f}")
    print()

    # By month
    print("Performance by Month:")
    month_names = {6: "June", 7: "July", 8: "August", 9: "September"}
    monthly = compute_metrics_by_month(df_pred, "rainfall_mm", "predicted_rainfall")
    for _, row in monthly.iterrows():
        name = month_names.get(row["month"], str(row["month"]))
        print(f"  {name:>10}: RMSE={row['rmse']:.2f} mm, MAE={row['mae']:.2f} mm")
    print()

    # Save predictions
    df_pred[["date", "rainfall_mm", "predicted_rainfall"]].to_csv(OUTPUT_FILE, index=False)
    print(f"Predictions saved to: {OUTPUT_FILE}")

    # Distribution stats
    print()
    print("=" * 60)
    print("RAINFALL DISTRIBUTION")
    print("=" * 60)
    actual = df_pred["rainfall_mm"]
    predicted = df_pred["predicted_rainfall"]
    print(f"Actual:    mean={actual.mean():.2f}, std={actual.std():.2f}, max={actual.max():.2f} mm")
    print(f"Predicted: mean={predicted.mean():.2f}, std={predicted.std():.2f}, max={predicted.max():.2f} mm")


if __name__ == "__main__":
    main()


Bihar rainfall data: 1220 days
Date range: 2014-06-01 to 2023-09-30

PREDICTION RESULTS
Predictions made for: 1213 days

Sample predictions (first 20 days):
      date  rainfall_mm  predicted_rainfall
2014-06-08     2.362717            4.433908
2014-06-09     2.537074            4.943960
2014-06-10     9.617831            4.639105
2014-06-11     2.244312            7.329529
2014-06-12     1.470683            6.404959
2014-06-13     0.658616            4.349388
2014-06-14     0.243957            5.200445
2014-06-15     0.146356            5.448652
2014-06-16     1.352018            3.778902
2014-06-17     6.002461            5.717600
2014-06-18     3.693360            6.458920
2014-06-19     8.277402            7.024574
2014-06-20     6.910142            8.444232
2014-06-21    19.565249            8.099051
2014-06-22    13.647746            7.799876
2014-06-23     8.772097           10.207117
2014-06-24     5.742474            9.354973
2014-06-25     3.059098            8.369146
2014-06

In [16]:
""" Plotting data """

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

OUTPUT_DIR = ROOT / "outputs"
PREDICTIONS_FILE = OUTPUT_DIR / "bihar_predictions.csv"


def main():
    # Load predictions
    df = pd.read_csv(PREDICTIONS_FILE, parse_dates=["date"])

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle("Bihar Monsoon Rainfall: Model Predictions vs Actual (2014-2023)",
                 fontsize=14, fontweight="bold")

    # 1. Time series for 2023 (most recent year)
    ax1 = axes[0, 0]
    df_2023 = df[df["date"].dt.year == 2023]
    ax1.plot(df_2023["date"], df_2023["rainfall_mm"], label="Actual", alpha=0.8, linewidth=1.5)
    ax1.plot(df_2023["date"], df_2023["predicted_rainfall"], label="Predicted", alpha=0.8, linewidth=1.5)
    ax1.set_xlabel("Date")
    ax1.set_ylabel("Rainfall (mm)")
    ax1.set_title("2023 Monsoon Season")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Scatter plot: Actual vs Predicted
    ax2 = axes[0, 1]
    ax2.scatter(df["rainfall_mm"], df["predicted_rainfall"], alpha=0.3, s=10)
    max_val = max(df["rainfall_mm"].max(), df["predicted_rainfall"].max())
    ax2.plot([0, max_val], [0, max_val], "r--", label="Perfect prediction")
    ax2.set_xlabel("Actual Rainfall (mm)")
    ax2.set_ylabel("Predicted Rainfall (mm)")
    ax2.set_title("Actual vs Predicted")
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Monthly comparison
    ax3 = axes[1, 0]
    df["month"] = df["date"].dt.month
    monthly = df.groupby("month").agg({"rainfall_mm": "mean", "predicted_rainfall": "mean"}).reset_index()
    x = np.arange(len(monthly))
    width = 0.35
    ax3.bar(x - width/2, monthly["rainfall_mm"], width, label="Actual", alpha=0.8)
    ax3.bar(x + width/2, monthly["predicted_rainfall"], width, label="Predicted", alpha=0.8)
    ax3.set_xlabel("Month")
    ax3.set_ylabel("Mean Rainfall (mm)")
    ax3.set_title("Average Rainfall by Month")
    ax3.set_xticks(x)
    ax3.set_xticklabels(["June", "July", "August", "September"])
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis="y")

    # 4. Error distribution
    ax4 = axes[1, 1]
    errors = df["rainfall_mm"] - df["predicted_rainfall"]
    ax4.hist(errors, bins=50, alpha=0.7, edgecolor="black")
    ax4.axvline(x=0, color="red", linestyle="--", label="Zero error")
    ax4.axvline(x=errors.mean(), color="green", linestyle="--", label=f"Mean error: {errors.mean():.2f}")
    ax4.set_xlabel("Prediction Error (mm)")
    ax4.set_ylabel("Frequency")
    ax4.set_title("Error Distribution")
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    output_path = OUTPUT_DIR / "bihar_prediction_comparison.png"
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Plot saved to: {output_path}")

    # Yearly comparison
    fig2, ax = plt.subplots(figsize=(12, 6))
    df["year"] = df["date"].dt.year
    yearly = df.groupby("year").agg({
        "rainfall_mm": ["mean", "sum"],
        "predicted_rainfall": ["mean", "sum"]
    }).reset_index()
    yearly.columns = ["year", "actual_mean", "actual_sum", "pred_mean", "pred_sum"]

    x = np.arange(len(yearly))
    width = 0.35
    ax.bar(x - width/2, yearly["actual_mean"], width, label="Actual", alpha=0.8)
    ax.bar(x + width/2, yearly["pred_mean"], width, label="Predicted", alpha=0.8)
    ax.set_xlabel("Year")
    ax.set_ylabel("Mean Daily Rainfall (mm)")
    ax.set_title("Bihar Monsoon: Yearly Average Daily Rainfall")
    ax.set_xticks(x)
    ax.set_xticklabels(yearly["year"])
    ax.legend()
    ax.grid(True, alpha=0.3, axis="y")

    plt.tight_layout()
    output_path = OUTPUT_DIR / "bihar_yearly_comparison.png"
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Plot saved to: {output_path}")


if __name__ == "__main__":
    main()

Plot saved to: /home/chaiti/rainfall-prediction/rainfall-prediction/outputs/bihar_prediction_comparison.png
Plot saved to: /home/chaiti/rainfall-prediction/rainfall-prediction/outputs/bihar_yearly_comparison.png


In [18]:
"""Extract and plot June 15-25, 2023 predictions."""
import pandas as pd
import matplotlib.pyplot as plt
OUTPUT_DIR = ROOT / "outputs"

# Load predictions
df = pd.read_csv(OUTPUT_DIR / "bihar_predictions.csv", parse_dates=["date"])

# Filter June 15-25, 2023
mask = (df["date"] >= "2023-06-15") & (df["date"] <= "2023-06-25")
df_june = df[mask].copy()

print("June 15-25, 2023 Predictions:")
print(df_june.to_string(index=False))
print()

# Save to file
output_file = OUTPUT_DIR / "june_2023_15_25.csv"
df_june.to_csv(output_file, index=False)
print(f"Saved to: {output_file}")

# Plot
fig, ax = plt.subplots(figsize=(10, 5))

x = df_june["date"]
ax.plot(x, df_june["rainfall_mm"], "o-", label="Actual", linewidth=2, markersize=8)
ax.plot(x, df_june["predicted_rainfall"], "s--", label="Predicted", linewidth=2, markersize=8)

ax.set_xlabel("Date")
ax.set_ylabel("Rainfall (mm)")
ax.set_title("Bihar Rainfall: June 15-25, 2023")
ax.legend()
ax.grid(True, alpha=0.3)

# Rotate x-axis labels
plt.xticks(rotation=45)
plt.tight_layout()

plot_file = OUTPUT_DIR / "june_2023_15_25.png"
plt.savefig(plot_file, dpi=150, bbox_inches="tight")
plt.close()
print(f"Plot saved to: {plot_file}")


June 15-25, 2023 Predictions:
      date  rainfall_mm  predicted_rainfall
2023-06-15     1.245295            6.207970
2023-06-16     1.038100            5.333638
2023-06-17     0.534527            5.421577
2023-06-18     1.918757            3.584134
2023-06-19     5.118173            6.470734
2023-06-20     2.312770            5.949299
2023-06-21     3.694649            6.515938
2023-06-22     3.028354            5.999797
2023-06-23     8.423565            5.015070
2023-06-24     0.870222            7.238396
2023-06-25     1.565293            3.934512

Saved to: /home/chaiti/rainfall-prediction/rainfall-prediction/outputs/june_2023_15_25.csv
Plot saved to: /home/chaiti/rainfall-prediction/rainfall-prediction/outputs/june_2023_15_25.png


## 3. Comparing predictions to OpenMeteo predictions

In [13]:
import openmeteo_requests
import requests_cache
from retry_requests import retry
import pandas as pd

# Setup cache and retry client
cache_session = requests_cache.CachedSession('.weather_cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Patna coordinates
latitude = 25.6
longitude = 85.1

# Full API URL
url = "https://api.open-meteo.com/v1/forecast"
params = {
    "latitude": latitude,
    "longitude": longitude,
    "daily": "precipitation_sum", # <----- daily rainfall (sum in mm)
    "forecast_days": 3            # e.g., next 3 days
}

response = openmeteo.weather_api(url, params)

# Get dates and values
dates = pd.to_datetime(daily.Time())  # Returns a list or array of datetime strings
rainfall_mm = daily.Variables(0).ValuesAsNumpy()  # Precipitation data

daily_data = {
    "date": dates,
    "rainfall_mm": rainfall_mm,
    "latitude": latitude,
    "longitude": longitude
}

df = pd.DataFrame(daily_data)
print(df)

                           date  rainfall_mm  latitude  longitude
0 1970-01-01 00:00:01.765238400          0.0      25.6       85.1
1 1970-01-01 00:00:01.765238400          0.0      25.6       85.1
2 1970-01-01 00:00:01.765238400          0.0      25.6       85.1


In [19]:
import openmeteo_requests
import requests_cache
from retry_requests import retry
import pandas as pd

# Setup cache and retry client
cache_session = requests_cache.CachedSession('.weather_cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Patna coordinates
latitude = 25.6
longitude = 85.1

# Use archive endpoint for historical data
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": "2023-06-15",
    "end_date": "2023-06-25",
    "daily": "precipitation_sum",
    "timezone": "Asia/Kolkata",  # get local time
}

response = openmeteo.weather_api(url, params)
daily = response[0].Daily()
dates = pd.to_datetime(daily.Time())
rainfall_mm = daily.Variables(0).ValuesAsNumpy()

df = pd.DataFrame({
    "date": dates,
    "rainfall_mm": rainfall_mm,
    "latitude": latitude,
    "longitude": longitude
})
print(df)


ModuleNotFoundError: No module named 'openmeteo_requests'