In [4]:
import os
import rasterio
import numpy as np
import glob
from rasterio.warp import reproject, Resampling
def parse_rainfall_filename(filename):
    """Extract the year and month encoded in a rainfall raster file name.

    Three layouts are tried in order, using the part of the base name that
    precedes the first ``.``:

    1. ``rainfall_YYYY-MM``
    2. any name with at least three ``_``-separated tokens whose last two
       tokens are numeric (taken as year and month)
    3. any four-digit token immediately followed by a numeric token

    Args:
        filename: File name or path of the rainfall raster.

    Returns:
        A ``(underscore_date, hyphen_date)`` tuple such as
        ``("2000_02", "2000-02")``; the month is zero-padded to two digits.

    Raises:
        ValueError: If no layout matches, or if anything else goes wrong
            while parsing (the underlying exception is chained).
    """
    def _as_pair(year, month):
        # Both spellings share the same zero-padded month.
        padded = month.zfill(2)
        return f"{year}_{padded}", f"{year}-{padded}"

    try:
        stem = os.path.basename(filename).split('.')[0]

        # Layout 1: rainfall_YYYY-MM
        if stem.startswith("rainfall_") and "-" in stem:
            # Unpacking fails (and is reported as unparseable) when the date
            # part holds more than one hyphen.
            year, month = stem.split("rainfall_")[1].split("-")
            if len(year) == 4 and year.isdigit() and month.isdigit():
                return _as_pair(year, month)

        tokens = stem.split('_')

        # Layout 2: ..._<year>_<month> at the end of the name
        if len(tokens) >= 3 and tokens[-2].isdigit() and tokens[-1].isdigit():
            return _as_pair(tokens[-2], tokens[-1])

        # Layout 3: first four-digit token directly followed by a number
        for candidate, follower in zip(tokens, tokens[1:]):
            if len(candidate) == 4 and candidate.isdigit() and follower.isdigit():
                return _as_pair(candidate, follower)

        raise ValueError(f"Date pattern not found in filename: {filename}")
    except Exception as e:
        raise ValueError(f"Unable to parse date from filename: {filename}") from e

def resample_bias_to_rainfall(bias_dataset, rainfall_dataset):
    """Resample band 1 of the bias raster onto the rainfall raster's grid.

    Args:
        bias_dataset: Open rasterio dataset whose band 1 holds the bias.
        rainfall_dataset: Open rasterio dataset whose shape, transform and
            CRS define the target grid.

    Returns:
        A float32 array with the rainfall raster's shape holding the
        bilinearly resampled bias. Cells that receive no bias data are NaN.
    """
    # Start from NaN rather than zero and declare NaN as the destination
    # nodata value. Callers identify missing bias with np.isnan(); a
    # zero-filled destination would make uncovered cells look like a valid
    # bias of 0.
    resampled_data = np.full(rainfall_dataset.shape, np.nan, dtype='float32')

    reproject(
        source=rasterio.band(bias_dataset, 1),
        destination=resampled_data,
        src_transform=bias_dataset.transform,
        src_crs=bias_dataset.crs,
        dst_transform=rainfall_dataset.transform,
        dst_crs=rainfall_dataset.crs,
        dst_nodata=np.nan,
        resampling=Resampling.bilinear,
    )

    return resampled_data

def validate_and_correct_data(rainfall_data, bias_data, rainfall_nodata,
                              max_reasonable_rainfall=2000):
    """Add the bias to the rainfall and clamp implausible results.

    A pixel is corrected only when its rainfall value is not nodata and its
    bias value is not NaN; every other pixel is set to the nodata value.
    Corrected values below zero are raised to zero and values above
    ``max_reasonable_rainfall`` are capped, with a warning printed in each
    case.

    Args:
        rainfall_data: Array of rainfall values.
        bias_data: Array of bias values with the same shape; NaN marks
            missing bias.
        rainfall_nodata: Nodata value of the rainfall array, or ``None`` if
            there is none (``-999`` is then used as the output fill value).
            A NaN nodata value is supported.
        max_reasonable_rainfall: Upper cap applied to corrected values.
            Defaults to 2000 (mm/month in the original comment; adjust to
            the data's units and region).

    Returns:
        A floating-point array of the corrected rainfall, at least float32,
        with the same shape as ``rainfall_data``.
    """
    fill_value = rainfall_nodata if rainfall_nodata is not None else -999

    if rainfall_nodata is None:
        rainfall_mask = np.zeros_like(rainfall_data, dtype=bool)
    elif np.isnan(rainfall_nodata):
        # NaN never compares equal to itself, so `==` would mask nothing.
        rainfall_mask = np.isnan(rainfall_data)
    else:
        rainfall_mask = rainfall_data == rainfall_nodata
    bias_mask = np.isnan(bias_data)

    valid_mask = ~(rainfall_mask | bias_mask)

    # Always accumulate in a floating dtype: with an integer rainfall raster
    # an output of the same dtype would silently truncate the bias.
    out_dtype = np.result_type(rainfall_data.dtype, np.float32)
    corrected_data = np.full(rainfall_data.shape, fill_value, dtype=out_dtype)

    corrected_data[valid_mask] = rainfall_data[valid_mask] + bias_data[valid_mask]

    negative_rainfall = (corrected_data < 0) & valid_mask
    if np.any(negative_rainfall):
        negative_count = np.sum(negative_rainfall)
        total_valid = np.sum(valid_mask)
        print(f"Warning: {negative_count} pixels ({negative_count/total_valid*100:.2f}%) would have negative rainfall after bias correction")
        print("Setting these values to zero")
        corrected_data[negative_rainfall] = 0

    excessive_rainfall = (corrected_data > max_reasonable_rainfall) & valid_mask

    if np.any(excessive_rainfall):
        excessive_count = np.sum(excessive_rainfall)
        print(f"Warning: {excessive_count} pixels have extremely high rainfall values after correction")
        print(f"Range of these values: {np.min(corrected_data[excessive_rainfall]):.2f} to {np.max(corrected_data[excessive_rainfall]):.2f}")
        corrected_data[excessive_rainfall] = max_reasonable_rainfall
        print(f"Capped extremely high values at {max_reasonable_rainfall}")

    return corrected_data

def apply_bias_correction(rainfall_dir, bias_dir, output_dir):
    """Bias-correct every rainfall GeoTIFF found in a directory.

    For each ``*.tif`` in ``rainfall_dir`` the year and month are parsed from
    the file name, the matching ``bias_{year}_{month}.tif`` is looked up in
    ``bias_dir``, resampled onto the rainfall grid and added to the rainfall
    values. The result is written to ``output_dir`` as
    ``corrected_{original name}`` (float32, LZW-compressed). Diagnostic
    statistics are printed along the way.

    Files without a matching bias raster are skipped. Any exception raised
    while handling one file is printed with its traceback and counted; the
    remaining files are still processed.

    Args:
        rainfall_dir: Directory containing the rainfall ``*.tif`` files.
        bias_dir: Directory containing the ``bias_{year}_{month}.tif`` files.
        output_dir: Directory for the corrected rasters; created if missing.

    Returns:
        None. A processed/skipped/error summary is printed at the end.
    """
    import traceback  # only used for error reporting below

    def _valid_pixels(data, nodata):
        # NaN never equals itself, so a NaN nodata value needs isnan().
        if np.isnan(nodata):
            return ~np.isnan(data)
        return data != nodata

    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort so that runs (and their
    # logs) are reproducible.
    rainfall_files = sorted(glob.glob(os.path.join(rainfall_dir, "*.tif")))
    print(f"Found {len(rainfall_files)} rainfall files to process")

    processed = 0
    errors = 0
    skipped = 0

    for rainfall_file in rainfall_files:
        rainfall_filename = os.path.basename(rainfall_file)
        try:
            print(f"\nProcessing {rainfall_filename}...")

            date_str, date_str_hyphen = parse_rainfall_filename(rainfall_filename)
            year, month = date_str.split('_')
            print(f"Looking for bias file with date: {date_str_hyphen}")

            bias_file = os.path.join(bias_dir, f"bias_{year}_{month}.tif")
            if not os.path.exists(bias_file):
                print(f"Skipping {date_str_hyphen}, no matching bias file found.")
                skipped += 1
                continue

            with rasterio.open(rainfall_file) as rainfall_dataset, rasterio.open(bias_file) as bias_dataset:
                rainfall_data = rainfall_dataset.read(1)

                # -999 is the fallback sentinel when the raster declares no
                # nodata value; it is also written to the output profile.
                nodata_value = rainfall_dataset.nodata if rainfall_dataset.nodata is not None else -999

                print("Rainfall data statistics:")
                valid_rainfall = rainfall_data[_valid_pixels(rainfall_data, nodata_value)]
                have_rainfall = valid_rainfall.size > 0
                if have_rainfall:
                    print(f"  Min: {np.min(valid_rainfall):.2f}, Max: {np.max(valid_rainfall):.2f}, Mean: {np.mean(valid_rainfall):.2f}")

                print("Resampling bias data...")
                bias_data = resample_bias_to_rainfall(bias_dataset, rainfall_dataset)

                print("Bias data statistics:")
                valid_bias = bias_data[~np.isnan(bias_data)]
                if valid_bias.size > 0:
                    mean_bias = np.mean(valid_bias)
                    print(f"  Min: {np.min(valid_bias):.2f}, Max: {np.max(valid_bias):.2f}, Mean: {mean_bias:.2f}")
                    # The mean of an empty array is NaN (with a warning), so
                    # only compare against rainfall when there is some.
                    if have_rainfall:
                        mean_rainfall = np.mean(valid_rainfall)
                        if mean_bias < 0 and mean_rainfall > 0:
                            print("  Note: Bias is predominantly negative while rainfall is positive")
                        if abs(mean_bias) > mean_rainfall:
                            print("  Warning: Average bias magnitude exceeds average rainfall. Check bias calculation.")

                print("Applying bias correction with validation...")
                corrected_rainfall = validate_and_correct_data(
                    rainfall_data,
                    bias_data,
                    rainfall_dataset.nodata
                )

                valid_mask = _valid_pixels(corrected_rainfall, nodata_value)
                if np.any(valid_mask):
                    print("Valid corrected data statistics:")
                    print(f"  Min: {np.min(corrected_rainfall[valid_mask]):.2f}")
                    print(f"  Max: {np.max(corrected_rainfall[valid_mask]):.2f}")
                    print(f"  Mean: {np.mean(corrected_rainfall[valid_mask]):.2f}")

                    if have_rainfall:
                        total_rainfall_before = np.sum(valid_rainfall)
                        total_rainfall_after = np.sum(corrected_rainfall[valid_mask])

                        print(f"  Total rainfall before correction: {total_rainfall_before:.2f}")
                        print(f"  Total rainfall after correction: {total_rainfall_after:.2f}")

                        if total_rainfall_before != 0:
                            change_percent = ((total_rainfall_after - total_rainfall_before) / total_rainfall_before) * 100
                            print(f"  Change in total rainfall: {change_percent:.2f}%")

                            if abs(change_percent) > 50:
                                print("  WARNING: Very large change in total rainfall after bias correction!")
                        else:
                            # A relative change is undefined for a zero total.
                            print("  Change in total rainfall: undefined (total before correction is zero)")

                output_profile = rainfall_dataset.profile.copy()
                output_profile.update({
                    'compress': 'lzw',
                    'nodata': nodata_value,
                    'dtype': 'float32'
                })

                output_file = os.path.join(output_dir, f"corrected_{rainfall_filename}")
                with rasterio.open(output_file, 'w', **output_profile) as dst:
                    dst.write(corrected_rainfall.astype('float32'), 1)

                processed += 1
                print(f"Successfully processed: {output_file}")

        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            print(f"Error processing {rainfall_filename}: {str(e)}")
            errors += 1
            print("Detailed error:")
            print(traceback.format_exc())

    print("\nProcessing Summary:")
    print(f"Total files: {len(rainfall_files)}")
    print(f"Successfully processed: {processed}")
    print(f"Skipped: {skipped}")
    print(f"Errors: {errors}")

def main(resolution="0.01", data_root=r"/home/stormej/dev/rainscale/data"):
    """Run the bias correction for one grid resolution.

    Args:
        resolution: Resolution suffix of the input and output directory
            names (for example ``"0.01"``).
        data_root: Directory containing the ``ml/prediction``, ``bias`` and
            ``bias_corrected`` trees.
    """
    # The three directories differ only by the resolution suffix, so derive
    # them instead of copying this function once per resolution.
    rainfall_dir = f"{data_root}/ml/prediction/prediction_{resolution}"
    bias_dir = f"{data_root}/bias/bias_kriging_{resolution}"
    output_dir = f"{data_root}/bias_corrected/bias_corrected_kriging_{resolution}"

    print("Starting rainfall bias correction...")
    apply_bias_correction(rainfall_dir, bias_dir, output_dir)
    print("Correction completed!")


# Still runs when the cell is executed (a notebook's __name__ is "__main__"),
# but no longer fires as a side effect if this code is imported as a module.
if __name__ == "__main__":
    main()

Starting rainfall bias correction...
Found 277 rainfall files to process

Processing rainfall_2000-02.tif...
Looking for bias file with date: 2000-02
Rainfall data statistics:
  Min: 9.63, Max: 25.11, Mean: 14.54
Resampling bias data...
Bias data statistics:
  Min: 6.48, Max: 6.48, Mean: 6.48
Applying bias correction with validation...
Valid corrected data statistics:
  Min: 16.11
  Max: 31.59
  Mean: 21.01
  Total rainfall before correction: 85760.03
  Total rainfall after correction: 123963.45
  Change in total rainfall: 44.55%
Successfully processed: /home/stormej/dev/rainscale/data/bias_corrected/bias_corrected_kriging_0.01/corrected_rainfall_2000-02.tif

Processing rainfall_2000-03.tif...
Looking for bias file with date: 2000-03
Rainfall data statistics:
  Min: 9.83, Max: 44.98, Mean: 14.17
Resampling bias data...
Bias data statistics:
  Min: -11.62, Max: -4.02, Mean: -5.02
  Note: Bias is predominantly negative while rainfall is positive
Applying bias correction with validation..

In [6]:
import os
import rasterio
import numpy as np
import glob
from rasterio.warp import reproject, Resampling
def parse_rainfall_filename(filename):
    """Extract the year and month encoded in a rainfall raster file name.

    Three layouts are tried in order, using the part of the base name that
    precedes the first ``.``:

    1. ``rainfall_YYYY-MM``
    2. any name with at least three ``_``-separated tokens whose last two
       tokens are numeric (taken as year and month)
    3. any four-digit token immediately followed by a numeric token

    Args:
        filename: File name or path of the rainfall raster.

    Returns:
        A ``(underscore_date, hyphen_date)`` tuple such as
        ``("2000_02", "2000-02")``; the month is zero-padded to two digits.

    Raises:
        ValueError: If no layout matches, or if anything else goes wrong
            while parsing (the underlying exception is chained).
    """
    def _as_pair(year, month):
        # Both spellings share the same zero-padded month.
        padded = month.zfill(2)
        return f"{year}_{padded}", f"{year}-{padded}"

    try:
        stem = os.path.basename(filename).split('.')[0]

        # Layout 1: rainfall_YYYY-MM
        if stem.startswith("rainfall_") and "-" in stem:
            # Unpacking fails (and is reported as unparseable) when the date
            # part holds more than one hyphen.
            year, month = stem.split("rainfall_")[1].split("-")
            if len(year) == 4 and year.isdigit() and month.isdigit():
                return _as_pair(year, month)

        tokens = stem.split('_')

        # Layout 2: ..._<year>_<month> at the end of the name
        if len(tokens) >= 3 and tokens[-2].isdigit() and tokens[-1].isdigit():
            return _as_pair(tokens[-2], tokens[-1])

        # Layout 3: first four-digit token directly followed by a number
        for candidate, follower in zip(tokens, tokens[1:]):
            if len(candidate) == 4 and candidate.isdigit() and follower.isdigit():
                return _as_pair(candidate, follower)

        raise ValueError(f"Date pattern not found in filename: {filename}")
    except Exception as e:
        raise ValueError(f"Unable to parse date from filename: {filename}") from e

def resample_bias_to_rainfall(bias_dataset, rainfall_dataset):
    """Resample band 1 of the bias raster onto the rainfall raster's grid.

    Args:
        bias_dataset: Open rasterio dataset whose band 1 holds the bias.
        rainfall_dataset: Open rasterio dataset whose shape, transform and
            CRS define the target grid.

    Returns:
        A float32 array with the rainfall raster's shape holding the
        bilinearly resampled bias. Cells that receive no bias data are NaN.
    """
    # Start from NaN rather than zero and declare NaN as the destination
    # nodata value. Callers identify missing bias with np.isnan(); a
    # zero-filled destination would make uncovered cells look like a valid
    # bias of 0.
    resampled_data = np.full(rainfall_dataset.shape, np.nan, dtype='float32')

    reproject(
        source=rasterio.band(bias_dataset, 1),
        destination=resampled_data,
        src_transform=bias_dataset.transform,
        src_crs=bias_dataset.crs,
        dst_transform=rainfall_dataset.transform,
        dst_crs=rainfall_dataset.crs,
        dst_nodata=np.nan,
        resampling=Resampling.bilinear,
    )

    return resampled_data

def validate_and_correct_data(rainfall_data, bias_data, rainfall_nodata,
                              max_reasonable_rainfall=2000):
    """Add the bias to the rainfall and clamp implausible results.

    A pixel is corrected only when its rainfall value is not nodata and its
    bias value is not NaN; every other pixel is set to the nodata value.
    Corrected values below zero are raised to zero and values above
    ``max_reasonable_rainfall`` are capped, with a warning printed in each
    case.

    Args:
        rainfall_data: Array of rainfall values.
        bias_data: Array of bias values with the same shape; NaN marks
            missing bias.
        rainfall_nodata: Nodata value of the rainfall array, or ``None`` if
            there is none (``-999`` is then used as the output fill value).
            A NaN nodata value is supported.
        max_reasonable_rainfall: Upper cap applied to corrected values.
            Defaults to 2000 (mm/month in the original comment; adjust to
            the data's units and region).

    Returns:
        A floating-point array of the corrected rainfall, at least float32,
        with the same shape as ``rainfall_data``.
    """
    fill_value = rainfall_nodata if rainfall_nodata is not None else -999

    if rainfall_nodata is None:
        rainfall_mask = np.zeros_like(rainfall_data, dtype=bool)
    elif np.isnan(rainfall_nodata):
        # NaN never compares equal to itself, so `==` would mask nothing.
        rainfall_mask = np.isnan(rainfall_data)
    else:
        rainfall_mask = rainfall_data == rainfall_nodata
    bias_mask = np.isnan(bias_data)

    valid_mask = ~(rainfall_mask | bias_mask)

    # Always accumulate in a floating dtype: with an integer rainfall raster
    # an output of the same dtype would silently truncate the bias.
    out_dtype = np.result_type(rainfall_data.dtype, np.float32)
    corrected_data = np.full(rainfall_data.shape, fill_value, dtype=out_dtype)

    corrected_data[valid_mask] = rainfall_data[valid_mask] + bias_data[valid_mask]

    negative_rainfall = (corrected_data < 0) & valid_mask
    if np.any(negative_rainfall):
        negative_count = np.sum(negative_rainfall)
        total_valid = np.sum(valid_mask)
        print(f"Warning: {negative_count} pixels ({negative_count/total_valid*100:.2f}%) would have negative rainfall after bias correction")
        print("Setting these values to zero")
        corrected_data[negative_rainfall] = 0

    excessive_rainfall = (corrected_data > max_reasonable_rainfall) & valid_mask

    if np.any(excessive_rainfall):
        excessive_count = np.sum(excessive_rainfall)
        print(f"Warning: {excessive_count} pixels have extremely high rainfall values after correction")
        print(f"Range of these values: {np.min(corrected_data[excessive_rainfall]):.2f} to {np.max(corrected_data[excessive_rainfall]):.2f}")
        corrected_data[excessive_rainfall] = max_reasonable_rainfall
        print(f"Capped extremely high values at {max_reasonable_rainfall}")

    return corrected_data

def apply_bias_correction(rainfall_dir, bias_dir, output_dir):
    """Bias-correct every rainfall GeoTIFF found in a directory.

    For each ``*.tif`` in ``rainfall_dir`` the year and month are parsed from
    the file name, the matching ``bias_{year}_{month}.tif`` is looked up in
    ``bias_dir``, resampled onto the rainfall grid and added to the rainfall
    values. The result is written to ``output_dir`` as
    ``corrected_{original name}`` (float32, LZW-compressed). Diagnostic
    statistics are printed along the way.

    Files without a matching bias raster are skipped. Any exception raised
    while handling one file is printed with its traceback and counted; the
    remaining files are still processed.

    Args:
        rainfall_dir: Directory containing the rainfall ``*.tif`` files.
        bias_dir: Directory containing the ``bias_{year}_{month}.tif`` files.
        output_dir: Directory for the corrected rasters; created if missing.

    Returns:
        None. A processed/skipped/error summary is printed at the end.
    """
    import traceback  # only used for error reporting below

    def _valid_pixels(data, nodata):
        # NaN never equals itself, so a NaN nodata value needs isnan().
        if np.isnan(nodata):
            return ~np.isnan(data)
        return data != nodata

    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort so that runs (and their
    # logs) are reproducible.
    rainfall_files = sorted(glob.glob(os.path.join(rainfall_dir, "*.tif")))
    print(f"Found {len(rainfall_files)} rainfall files to process")

    processed = 0
    errors = 0
    skipped = 0

    for rainfall_file in rainfall_files:
        rainfall_filename = os.path.basename(rainfall_file)
        try:
            print(f"\nProcessing {rainfall_filename}...")

            date_str, date_str_hyphen = parse_rainfall_filename(rainfall_filename)
            year, month = date_str.split('_')
            print(f"Looking for bias file with date: {date_str_hyphen}")

            bias_file = os.path.join(bias_dir, f"bias_{year}_{month}.tif")
            if not os.path.exists(bias_file):
                print(f"Skipping {date_str_hyphen}, no matching bias file found.")
                skipped += 1
                continue

            with rasterio.open(rainfall_file) as rainfall_dataset, rasterio.open(bias_file) as bias_dataset:
                rainfall_data = rainfall_dataset.read(1)

                # -999 is the fallback sentinel when the raster declares no
                # nodata value; it is also written to the output profile.
                nodata_value = rainfall_dataset.nodata if rainfall_dataset.nodata is not None else -999

                print("Rainfall data statistics:")
                valid_rainfall = rainfall_data[_valid_pixels(rainfall_data, nodata_value)]
                have_rainfall = valid_rainfall.size > 0
                if have_rainfall:
                    print(f"  Min: {np.min(valid_rainfall):.2f}, Max: {np.max(valid_rainfall):.2f}, Mean: {np.mean(valid_rainfall):.2f}")

                print("Resampling bias data...")
                bias_data = resample_bias_to_rainfall(bias_dataset, rainfall_dataset)

                print("Bias data statistics:")
                valid_bias = bias_data[~np.isnan(bias_data)]
                if valid_bias.size > 0:
                    mean_bias = np.mean(valid_bias)
                    print(f"  Min: {np.min(valid_bias):.2f}, Max: {np.max(valid_bias):.2f}, Mean: {mean_bias:.2f}")
                    # The mean of an empty array is NaN (with a warning), so
                    # only compare against rainfall when there is some.
                    if have_rainfall:
                        mean_rainfall = np.mean(valid_rainfall)
                        if mean_bias < 0 and mean_rainfall > 0:
                            print("  Note: Bias is predominantly negative while rainfall is positive")
                        if abs(mean_bias) > mean_rainfall:
                            print("  Warning: Average bias magnitude exceeds average rainfall. Check bias calculation.")

                print("Applying bias correction with validation...")
                corrected_rainfall = validate_and_correct_data(
                    rainfall_data,
                    bias_data,
                    rainfall_dataset.nodata
                )

                valid_mask = _valid_pixels(corrected_rainfall, nodata_value)
                if np.any(valid_mask):
                    print("Valid corrected data statistics:")
                    print(f"  Min: {np.min(corrected_rainfall[valid_mask]):.2f}")
                    print(f"  Max: {np.max(corrected_rainfall[valid_mask]):.2f}")
                    print(f"  Mean: {np.mean(corrected_rainfall[valid_mask]):.2f}")

                    if have_rainfall:
                        total_rainfall_before = np.sum(valid_rainfall)
                        total_rainfall_after = np.sum(corrected_rainfall[valid_mask])

                        print(f"  Total rainfall before correction: {total_rainfall_before:.2f}")
                        print(f"  Total rainfall after correction: {total_rainfall_after:.2f}")

                        if total_rainfall_before != 0:
                            change_percent = ((total_rainfall_after - total_rainfall_before) / total_rainfall_before) * 100
                            print(f"  Change in total rainfall: {change_percent:.2f}%")

                            if abs(change_percent) > 50:
                                print("  WARNING: Very large change in total rainfall after bias correction!")
                        else:
                            # A relative change is undefined for a zero total.
                            print("  Change in total rainfall: undefined (total before correction is zero)")

                output_profile = rainfall_dataset.profile.copy()
                output_profile.update({
                    'compress': 'lzw',
                    'nodata': nodata_value,
                    'dtype': 'float32'
                })

                output_file = os.path.join(output_dir, f"corrected_{rainfall_filename}")
                with rasterio.open(output_file, 'w', **output_profile) as dst:
                    dst.write(corrected_rainfall.astype('float32'), 1)

                processed += 1
                print(f"Successfully processed: {output_file}")

        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            print(f"Error processing {rainfall_filename}: {str(e)}")
            errors += 1
            print("Detailed error:")
            print(traceback.format_exc())

    print("\nProcessing Summary:")
    print(f"Total files: {len(rainfall_files)}")
    print(f"Successfully processed: {processed}")
    print(f"Skipped: {skipped}")
    print(f"Errors: {errors}")

def main(resolution="0.1", data_root=r"/home/stormej/dev/rainscale/data"):
    """Run the bias correction for one grid resolution.

    Args:
        resolution: Resolution suffix of the input and output directory
            names (for example ``"0.1"``).
        data_root: Directory containing the ``ml/prediction``, ``bias`` and
            ``bias_corrected`` trees.
    """
    # The three directories differ only by the resolution suffix, so derive
    # them instead of copying this function once per resolution.
    rainfall_dir = f"{data_root}/ml/prediction/prediction_{resolution}"
    bias_dir = f"{data_root}/bias/bias_kriging_{resolution}"
    output_dir = f"{data_root}/bias_corrected/bias_corrected_kriging_{resolution}"

    print("Starting rainfall bias correction...")
    apply_bias_correction(rainfall_dir, bias_dir, output_dir)
    print("Correction completed!")


# Still runs when the cell is executed (a notebook's __name__ is "__main__"),
# but no longer fires as a side effect if this code is imported as a module.
if __name__ == "__main__":
    main()

Starting rainfall bias correction...
Found 277 rainfall files to process

Processing rainfall_2000-02.tif...
Looking for bias file with date: 2000-02
Rainfall data statistics:
  Min: 10.66, Max: 20.04, Mean: 14.22
Resampling bias data...
Bias data statistics:
  Min: 6.48, Max: 6.48, Mean: 6.48
Applying bias correction with validation...
Valid corrected data statistics:
  Min: 17.13
  Max: 26.51
  Mean: 20.69
  Total rainfall before correction: 710.89
  Total rainfall after correction: 1034.64
  Change in total rainfall: 45.54%
Successfully processed: /home/stormej/dev/rainscale/data/bias_corrected/bias_corrected_kriging_0.1/corrected_rainfall_2000-02.tif

Processing rainfall_2000-03.tif...
Looking for bias file with date: 2000-03
Rainfall data statistics:
  Min: 9.89, Max: 19.66, Mean: 13.67
Resampling bias data...
Bias data statistics:
  Min: -8.31, Max: -4.20, Mean: -4.94
  Note: Bias is predominantly negative while rainfall is positive
Applying bias correction with validation...
Val