# Integrating MODIS Snow Cover Data with Station Measurements for Training and Analysis

Here we will explore the process of mapping snow cover data from MODIS GeoTIFF files with station locations, and merge the results into a comprehensive CSV file.

In [6]:
import os
import pandas as pd
import rasterio
from pyproj import Transformer
from rasterio.enums import Resampling
import concurrent.futures
from datetime import datetime, timedelta
import dask.dataframe as dd
import numpy as np

- os: For file and directory operations.
- pandas: For data manipulation and analysis.
- rasterio: For reading and writing raster data.
- pyproj: For coordinate transformations.
- rasterio.enums.Resampling: For resampling options in rasterio.
- concurrent.futures: For parallel execution.
- datetime, timedelta: For date and time operations.
- dask.dataframe: For handling large datasets.
- numpy: For numerical operations.

In [10]:
work_dir = '../data/'
train_start_date = "2023-01-01"
train_end_date = "2023-01-31"

In [21]:
working_dir = f"../data/fsca"
folder_path = f"../data/fsca/final_output/"
new_base_station_list_file = f"{work_dir}/all_snotel_cdec_stations_active_in_westus.csv"
cell_to_modis_mapping = f"{working_dir}/training_cell_to_modis_mapper_original_snotel_stations.csv"
non_station_random_points_file = f"{work_dir}/non_station_random_points_in_westus.csv"
only_active_ghcd_station_in_west_conus_file = f"{working_dir}/active_ghcnd_station_only_list.csv"
ghcd_station_to_modis_mapper_file = f"{working_dir}/active_ghcnd_mapper_modis.csv"
all_training_points_with_snotel_ghcnd_file = f"{working_dir}/all_training_points_snotel_ghcnd_in_westus.csv"
modis_day_wise = f"{working_dir}/final_output/"
os.makedirs(modis_day_wise, exist_ok=True)

- Defines working directories and file paths.
- Creates necessary directories if they don't exist.

In [5]:
def map_modis_to_station(row, src):
    drow, dcol = src.index(row["lon"], row["lat"])
    return drow, dcol


Above method Maps MODIS pixel coordinates to station coordinates.

## Generate Random Points in the MODIS Grid

In [7]:
def generate_random_non_station_points():
    sample_modis_tif = f"{modis_day_wise}/2022-10-01__snow_cover.tif"
    print(f"loading geotiff {sample_modis_tif}")
    with rasterio.open(sample_modis_tif) as src:
        bounds = src.bounds
        transform = src.transform
        width = src.width
        height = src.height
        raster_array = src.read(1)

        random_points = []
        while len(random_points) < 4000:
            random_x = np.random.uniform(bounds.left, bounds.right)
            random_y = np.random.uniform(bounds.bottom, bounds.top)
            col, row = ~transform * (random_x, random_y)

            if 0 <= row < height and 0 <= col < width:
                value = raster_array[int(row), int(col)]
                if value != 239 and value != 255:
                    random_points.append((random_x, random_y, col, row))

        random_points = [(lat, lon, col, row) for lon, lat, col, row in random_points]
        random_points_df = pd.DataFrame(random_points, columns=['latitude', 'longitude', 'modis_x', 'modis_y'])
        random_points_df.to_csv(non_station_random_points_file, index=False)
        print(f"random points are saved to {non_station_random_points_file}")


- Loads a sample GeoTIFF file.
- Generates random points within the bounds of the raster, ensuring they are valid points.
- Saves the generated points to a CSV file.

## Prepare MODIS Grid Mapper for Training

In [8]:
def prepare_modis_grid_mapper_training():
    if os.path.exists(cell_to_modis_mapping):
        print(f"The file {cell_to_modis_mapping} exists. skip.")
    else:
        print(f"start to generate {cell_to_modis_mapping}")
        station_df = pd.read_csv(new_base_station_list_file)
        print("original station_df describe() = ", station_df.describe())

        sample_modis_tif = f"{modis_day_wise}/2022-10-01__snow_cover.tif"

        with rasterio.open(sample_modis_tif) as src:
            transform = src.transform
            station_df['modis_y'], station_df['modis_x'] = rasterio.transform.rowcol(
                src.transform, 
                station_df["longitude"], 
                station_df["latitude"])
            station_df.to_csv(cell_to_modis_mapping, index=False, columns=['latitude', 'longitude', 'modis_x', 'modis_y'])
            print("after mapped modis station_df.describe() = ", station_df.describe())


The goal of this function is to create a mapping between snotel station locations and a MODIS satellite image. This involves transforming the geographical coordinates of the stations into pixel coordinates on the MODIS image.

Uses rasterio to read the modis image file and get information about its spatial extent (how much of the Earth it covers) and its transformation matrix (how to convert pixel coordinates to geographic coordinates).

Converts the geographic coordinates (longitude and latitude) of the snotel stations into pixel coordinates (modis_x, modis_y) for the MODIS image.

Uses the transformation matrix from the MODIS image to map each station's geographic coordinates to its corresponding pixel location in the image.

## Merge Station and Non-Station Points

In [9]:
def merge_station_and_non_station_to_one_csv():
    df1 = pd.read_csv(cell_to_modis_mapping)
    df2 = pd.read_csv(non_station_random_points_file)
    combined_df = pd.concat([df1, df2], ignore_index=True)
    combined_df.to_csv(all_training_points_with_station_and_non_station_file, index=False)
    print(f"Combined CSV saved to {all_training_points_with_station_and_non_station_file}")

Merges station data and non-station random points into one CSV file.

## Merge SNOTEL and GHCND Stations

In [13]:
def merge_snotel_ghcnd_station_to_one_csv():
    df1 = pd.read_csv(cell_to_modis_mapping)
    df2 = pd.read_csv(ghcd_station_to_modis_mapper_file)
    combined_df = pd.concat([df1, df2], ignore_index=True)
    combined_df.to_csv(all_training_points_with_snotel_ghcnd_file, index=False)
    print(f"Combined CSV saved to {all_training_points_with_snotel_ghcnd_file}")

## Prepare GHCND Station Mapping for Training

In [14]:
def prepare_ghcnd_station_mapping_training():
    if os.path.exists(ghcd_station_to_modis_mapper_file):
        print(f"The file {ghcd_station_to_modis_mapper_file} exists. skip.")
    else:
        print(f"start to generate {ghcd_station_to_modis_mapper_file}")
        station_df = pd.read_csv(only_active_ghcd_station_in_west_conus_file)
        station_df = station_df.rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})
        print("original station_df describe() = ", station_df.describe())

        sample_modis_tif = f"{modis_day_wise}/2022-10-01__snow_cover.tif"

        with rasterio.open(sample_modis_tif) as src:
            transform = src.transform
            station_df['modis_y'], station_df['modis_x'] = rasterio.transform.rowcol(
                src.transform, 
                station_df["longitude"],
                station_df["latitude"])
            station_df.to_csv(ghcd_station_to_modis_mapper_file, index=False, columns=['latitude', 'longitude', 'modis_x', 'modis_y'])
            print(f"the new mapper to the ghcnd is saved to {ghcd_station_to_modis_mapper_file}")
            print("after mapped modis station_df.describe() = ", station_df.describe())


- Loads GHCND station data and a sample MODIS GeoTIFF file.
- Maps GHCND station coordinates to MODIS grid coordinates.
- Saves the mapping to a CSV file.

## Get Band Value for a Row

In [12]:
def get_band_value(row, src):
    if (row["modis_y"] < src.height) and (row["modis_x"] < src.width):
        valid_value =  src.read(1, window=((int(row["modis_y"]), int(row["modis_y"])+1), (int(row["modis_x"]), int(row["modis_x"])+1)))
        return valid_value[0,0]
    else:
        return None


The get_band_value function is used to get the value of a pixel from a satellite image (in this case, from a MODIS raster image) based on specific coordinates.

**Inputs**:

**row**: This contains the coordinates where you want to get the pixel value. Specifically, it has modis_x and modis_y, which tell you the column and row of the pixel in the image.

**src**: This is the MODIS raster image you are working with. It's like a map where each point (pixel) has a value (like color or temperature).

Retrieves the value from the MODIS raster at the coordinates specified in the row.

In [17]:
def process_file(file_path, current_date_str, outfile):
  print(f"processing {file_path}")
  station_df = pd.read_csv(all_training_points_with_snotel_ghcnd_file)
  # print("station_df.head() = ", station_df.head())

  # Apply get_band_value for each row in the DataFrame
  with rasterio.open(file_path) as src:
    # Apply get_band_value for each row in the DataFrame
    # Get the affine transformation matrix
    transform = src.transform

    # Extract the spatial extent using the affine transformation
    left, bottom, right, top = rasterio.transform.array_bounds(src.height, src.width, transform)

    # Print the spatial extent
    # print("Spatial Extent (Bounding Box):", (left, bottom, right, top))

    station_df['fsca'] = station_df.apply(get_band_value, axis=1, args=(src,))

    
  # Prepare final data
  station_df['date'] = current_date_str
  station_df.to_csv(outfile, index=False, 
                    columns=['date', 'latitude', 'longitude', 'fsca'])
  print(f"Saved to csv: {outfile}")


In [None]:
def merge_csv(start_date, end_date):
  import glob
  # Find CSV files within the specified date range
  csv_files = glob.glob(folder_path + '*_training_output_station_with_ghcnd.csv')
  relevant_csv_files = []

  for c in csv_files:
    # Extract the date from the file name
    print("c = ", c)
    file_name = os.path.basename(c)
    date_str = file_name.split('_')[0]  # Assuming the date is part of the file name
    print("date_str = ", date_str)
    file_date = datetime.strptime(date_str, "%Y-%m-%d")

    # Check if the file date is within the specified range
    if start_date <= file_date <= end_date:
      relevant_csv_files.append(c)
#       # Read and concatenate only relevant CSV files
#       df = []
#       for c in relevant_csv_files:
#         tmp = pd.read_csv(c, low_memory=False, usecols=['date', 'latitude', 'longitude', 'fsca'])
#         df.append(tmp)

#         combined_df = pd.concat(df, ignore_index=True)

  # Initialize a Dask DataFrame
  print("start to use dask to read all csv files")
  dask_df = dd.read_csv(relevant_csv_files)

  # Save the merged DataFrame to a CSV file
  output_file = f'{working_dir}/fsca_final_training_all.csv'
  # Write the Dask DataFrame to a single CSV file
  print(f"saving all csvs into one file: {output_file}")
  dask_df.to_csv(output_file, index=False, single_file=True)
  #combined_df.to_csv(output_file, index=False)

  #print(combined_df.describe())
  print(f"Merged data saved to {output_file}")

In [27]:
def main():
  
  start_date = datetime.strptime(train_start_date, "%Y-%m-%d")
  
  end_date = datetime.strptime(train_end_date, "%Y-%m-%d")
  
  prepare_modis_grid_mapper_training()
  prepare_ghcnd_station_mapping_training()
  # running this function will generate a new set of random points
  # generate_random_non_station_points()
  #merge_station_and_non_station_to_one_csv()
  merge_snotel_ghcnd_station_to_one_csv()
  
  date_list = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]
  for i in date_list:
    current_date = i.strftime("%Y-%m-%d")
    #print(f"extracting data for {current_date}")
    outfile = os.path.join(modis_day_wise, f'{current_date}_training_output_station_with_ghcnd.csv')
    if os.path.exists(outfile):
      print(f"The file {outfile} exists. skip.")
      pass
    else:
      process_file(f'{modis_day_wise}/{current_date}__snow_cover.tif', current_date, outfile)
  
  merge_csv(start_date, end_date)

In [28]:
main()
print("fsca Data extraction complete.")

The file ../data/fsca/training_cell_to_modis_mapper_original_snotel_stations.csv exists. skip.
The file ../data/fsca/active_ghcnd_mapper_modis.csv exists. skip.
Combined CSV saved to ../data/fsca/all_training_points_snotel_ghcnd_in_westus.csv
The file ../data/fsca/final_output/2023-01-01_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-02_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-03_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-04_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-05_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-06_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-07_training_output_station_with_ghcnd.csv exists. skip.
The file ../data/fsca/final_output/2023-01-08_training_output_st

ValueError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: empty urlpath sequence