### Obtain Bias For Predicted Data With Respect To IMD Data

In [16]:
import os
import rasterio
import numpy as np
import re

def extract_date(filename):
    """Pull the year/month token out of a raster filename.

    Two naming schemes are recognised, tried in this order:

    * ``monthly_rain_YYYY_MM`` (IMD-style) -> returns ``"YYYY_MM"``
    * ``rainfall_YYYY-MM`` (ERA5-style)    -> returns ``"YYYY-MM"``

    Returns ``None`` when neither pattern is found in *filename*.
    """
    found = re.search(r'monthly_rain_(\d{4}_\d{2})', filename)
    if found is not None:
        return found.group(1)

    found = re.search(r'rainfall_(\d{4})-(\d{2})', filename)
    if found is None:
        return None

    # The pattern captures exactly two groups: year and month.
    return "-".join(found.groups())

def standardize_data(date_str):
    """Return *date_str* with every hyphen replaced by an underscore.

    A falsy input (``None`` or an empty string) yields ``None``.
    """
    return date_str.replace("-", "_") if date_str else None

def compare_raster_props(src1, src2):
    """List the georeferencing properties on which two rasters disagree.

    Compares, in order, the ``transform``, ``crs``, ``shape``, ``res`` and
    ``bounds`` attributes of *src1* and *src2*.

    Args:
        src1: First raster; must expose the attributes above plus ``name``.
        src2: Second raster, with the same attributes.

    Returns:
        A list of human-readable messages, one per differing property, each
        naming both rasters. The list is empty when every property matches.
    """
    # (attribute to compare, message prefix) in the order they are reported.
    checks = (
        ("transform", "Transform differs"),
        ("crs", "CRS differs"),
        ("shape", "Shape differs"),
        ("res", "Resolution differs"),
        ("bounds", "Bounds differ"),
    )

    # The messages are real f-strings so the raster names are interpolated
    # instead of the literal text "{src1.name}" being emitted.
    return [
        f"{label} between rasters {src1.name} and {src2.name}"
        for attr, label in checks
        if getattr(src1, attr) != getattr(src2, attr)
    ]

def compute_daily_bias(f1, f2, out_dir):
    """Write one bias GeoTIFF per date that has a raster in both folders.

    Every ``.tif`` in *f1* is paired with the ``.tif`` in *f2* whose filename
    yields the same standardized date (see ``extract_date`` and
    ``standardize_data``). For each pair, band 1 of the *f2* raster is
    subtracted from band 1 of the *f1* raster and the result is written to
    ``out_dir/bias_<date>.tif`` using the *f1* raster's profile.

    NOTE(review): despite the name, the filename patterns handled by
    ``extract_date`` carry only year and month -- confirm the intended
    temporal resolution.

    Args:
        f1: Directory of reference rasters (the IMD files).
        f2: Directory of rasters to compare against (the ERA5/predicted files).
        out_dir: Directory for the output rasters; created if missing.

    Returns:
        None. Progress and a final processed/error summary are printed.
        Any exception raised while handling a file is caught, reported and
        counted, and processing continues with the next file.
    """
    os.makedirs(out_dir, exist_ok=True)

    imd_files = [f for f in os.listdir(f1) if f.endswith('.tif')]
    era5_files = [f for f in os.listdir(f2) if f.endswith('.tif')]

    # Index the f2 rasters by standardized date for O(1) pairing below.
    era5_files_dict = {}
    for f in era5_files:
        date = extract_date(f)
        print(f"Extracted woowowow date: {date}")
        if date:
            era5_files_dict[standardize_data(date)] = f

    processed_count, error_count = 0, 0

    for imd_file in imd_files:
        try:
            imd_date = extract_date(imd_file)
            if not imd_date:
                print(f"Date not found in {imd_file}")
                continue

            std_date = standardize_data(imd_date)
            print(f"Standardized date: {std_date}")
            era5_file = era5_files_dict.get(std_date)
            print(f"ERA5 date: {era5_file}")

            if not era5_file:
                print(f"No matching ERA5 file for {imd_file}")
                continue

            imd_path = os.path.join(f1, imd_file)
            era5_path = os.path.join(f2, era5_file)

            print(f"Processing {imd_path} and {era5_path}")

            with rasterio.open(imd_path) as src1, rasterio.open(era5_path) as src2:
                diffs = compare_raster_props(src1, src2)
                era5_data = src2.read(1)

                if diffs:
                    print(f"Props dont match for date: {std_date}")
                    for diff in diffs:
                        print(diff)
                    # NOTE(review): any property mismatch triggers a vertical
                    # flip of the f2 raster, presumably because the two
                    # sources store rows in opposite order -- confirm.
                    era5_data = np.flipud(era5_data)

                # Cast explicitly so the array matches the float32 dtype
                # declared in the output profile below.
                bias = (src1.read(1) - era5_data).astype(np.float32)

                output_file = os.path.join(out_dir, f"bias_{std_date}.tif")
                profile = src1.profile
                profile.update(nodata=None, description="Bias", dtype=rasterio.float32)

                with rasterio.open(output_file, 'w', **profile) as dst:
                    dst.write(bias, 1)

            processed_count += 1
            if processed_count % 100 == 0:
                print(f"Processed {processed_count} files")

        except Exception as e:
            # Best-effort batch job: report the failure and keep going.
            print(f"Error processing {imd_file}: {e}")
            error_count += 1

    print(f"Processed {processed_count} files with {error_count} errors.")

def main():
    """Compute bias rasters for the 0.25 prediction set using fixed local paths."""
    reference_dir = r"/home/stormej/dev/rainscale/data/rain/rain_tif_monthly"
    prediction_dir = r"/home/stormej/dev/rainscale/data/ml/prediction/prediction_0.25"
    bias_dir = r"/home/stormej/dev/rainscale/data/bias/bias_0.25"

    # f1 is the reference (IMD) folder, f2 the predicted rasters.
    compute_daily_bias(f1=reference_dir, f2=prediction_dir, out_dir=bias_dir)

# Run the bias computation as soon as this cell/script is executed.
main()

Extracted woowowow date: None
Extracted woowowow date: 2000-02
Extracted woowowow date: 2000-03
Extracted woowowow date: 2000-04
Extracted woowowow date: 2000-05
Extracted woowowow date: 2000-06
Extracted woowowow date: 2000-07
Extracted woowowow date: 2000-08
Extracted woowowow date: 2000-09
Extracted woowowow date: 2000-10
Extracted woowowow date: 2000-11
Extracted woowowow date: 2000-12
Extracted woowowow date: 2001-01
Extracted woowowow date: 2001-02
Extracted woowowow date: 2001-03
Extracted woowowow date: 2001-04
Extracted woowowow date: 2001-05
Extracted woowowow date: 2001-06
Extracted woowowow date: 2001-07
Extracted woowowow date: 2001-08
Extracted woowowow date: 2001-09
Extracted woowowow date: 2001-10
Extracted woowowow date: 2001-11
Extracted woowowow date: 2001-12
Extracted woowowow date: 2002-01
Extracted woowowow date: 2002-02
Extracted woowowow date: 2002-03
Extracted woowowow date: 2002-04
Extracted woowowow date: 2002-05
Extracted woowowow date: 2002-06
Extracted woo