# get data from csv

In [4]:
# Import only the entry point this cell needs (no wildcard import), so it is
# obvious where the name comes from and the kernel namespace stays clean.
from shroomradar.src.climate import download_climate_data_from_csv

# Observation table used to decide which climate files to download.
csv_file_path = "..//data//Summer Bolete.csv"

# Kept as the last expression of the cell so the function's return value is
# displayed as the cell output.
download_climate_data_from_csv(csv_file_path,
                               rclone_path="..//shroomradar//data//rclone.exe")

Step 1: Generating file structure from CSV...


Reading CSV: 100%|██████████| 4479/4479 [00:00<00:00, 201429.13it/s]


📅 Found 1205 unique observation dates


Building file list: 100%|██████████| 1205/1205 [00:00<00:00, 21753.19it/s]


📝 Generated 52656 unique file entries
✅ File structure saved to: climate_files_from_csv.txt
Step 2: Checking for existing files and updating download list...


Checking for existing files: 100%|██████████| 52656/52656 [00:03<00:00, 14059.96it/s]


✅ Found and removed 26184 existing files from the download list.
Step 3: Downloading climate data files...
▶️ Running command: ..//shroomradar//data//rclone.exe sync -v --filter-from climate_files_from_csv.txt --drive-shared-with-me google:/MSWX_V100 ..//climate_data
2025/09/19 12:06:03 INFO  : Past/Pres/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/P/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/Tmax/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/SpecHum/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/RelHum/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/Temp/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/Tmin/Daily: Set directory modification time (using SetModTime)
2025/09/19 12:06:03 INFO  : Past/Wind/Daily: Set directory

'climate_files_from_csv.txt'

# append to csv

In [6]:
from shroomradar.src.climate import append_climate_data_csv
import os

# --- Configuration ---
# Input table and the path where the climate-enriched copy is written;
# both live in the sibling "data" folder.
data_folder = os.path.join("..", "data")
input_path = os.path.join(data_folder, "Psilocybe cubensis_xy.csv")
output_path = os.path.join(data_folder, "Psilocybe cubensis_xy_climate.csv")
# Folder handed to the appender as `climate_base`.
climate_data_folder = "..//climate_data"

# --- Run ---
print("Starting climate data appending process...")
append_climate_data_csv(input_csv=input_path,
                        output_csv=output_path,
                        climate_base=climate_data_folder)
print("Process finished.")


Starting climate data appending process...
Loaded 3390 rows from ..\data\Psilocybe cubensis_xy.csv


Processing observations: 100%|██████████| 3390/3390 [21:24<00:00,  2.64it/s]


✅ Data has been updated and saved to ..\data\Psilocybe cubensis_xy_climate.csv
Process finished.


# Append to csv

In [None]:
from shroomradar.src.climate import append_climate_data_csv
import os

# --- Configuration ---
# The output file name is the input file name with a "_climate" suffix.
sample_stem = "negative_samples_within_land_10k_with_coords_topography"
input_path = os.path.join("..", "data", f"{sample_stem}.csv")
output_path = os.path.join("..", "data", f"{sample_stem}_climate.csv")
# Folder handed to the appender as `climate_base`.
climate_data_folder = "..//climate_data"

# --- Run ---
print("Starting climate data appending process...")
append_climate_data_csv(input_csv=input_path,
                        output_csv=output_path,
                        climate_base=climate_data_folder)
print("Process finished.")


# append climate values to geojson (to be used in pipeline)

In [None]:
import os
import geopandas as gpd
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
from tqdm import tqdm


def sample_nc_point(file_path: str, lon: float, lat: float) -> float:
    """
    Sample a single NetCDF file at one (lon, lat) location.

    The file is opened with xarray (CF decoding enabled). The variable that is
    sampled is the first data variable that has both a latitude and a
    longitude dimension (named ``lat``/``latitude`` and ``lon``/``longitude``).
    Variables without a spatial grid that come earlier in the file are skipped
    rather than making the whole lookup return NaN. If the chosen variable has
    a ``time`` dimension, only its first step is used.

    The value is obtained by linear interpolation; if that yields NaN, a
    nearest-neighbour interpolation is tried instead.

    Parameters
    ----------
    file_path : str
        Path to the NetCDF file.
    lon : float
        Longitude of the query point (note the argument order: lon first).
    lat : float
        Latitude of the query point.

    Returns
    -------
    float
        The sampled value, or NaN when the file does not exist, no variable
        with lat/lon dimensions is found, or any error occurs while reading or
        interpolating (the error is printed, never raised).
    """
    nan = float('nan')
    if not os.path.isfile(file_path):
        return nan

    # Names that denote coordinates rather than data, and the long dimension
    # names that are normalised to the short ones used below.
    coord_like = ("lat", "latitude", "lon", "longitude", "time")
    dim_aliases = {"latitude": "lat", "longitude": "lon"}

    try:
        # decode_cf=True applies scale_factor/add_offset and masks _FillValue/missing_value
        with xr.open_dataset(file_path, engine="netcdf4", decode_cf=True) as ds:
            da = None
            for vname in ds.data_vars:
                if vname.lower() in coord_like:
                    continue
                candidate = ds[vname]

                # Standardize spatial dim names to ('lat', 'lon') if needed
                rename_map = {old: new for old, new in dim_aliases.items()
                              if old in candidate.dims}
                if rename_map:
                    candidate = candidate.rename(rename_map)

                # Only a variable defined on the lat/lon grid can be sampled
                if "lat" in candidate.dims and "lon" in candidate.dims:
                    da = candidate
                    break

            if da is None:
                return nan

            # If there is a time dimension, use the first step
            if "time" in da.dims:
                da = da.isel(time=0)

            # Linear interpolation first; fall back to nearest when it gives NaN
            val = da.interp(lat=lat, lon=lon, method="linear").values.item()
            if val is None or (isinstance(val, float) and np.isnan(val)):
                val = da.interp(lat=lat, lon=lon, method="nearest").values.item()

            # Final guard
            if val is None:
                return nan
            return float(val)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a long run.
        print(f"⚠️ Error reading/interpolating {file_path}: {e}")
        return nan


def get_environmental_timeseries(polygon, date, data_dir, variables, num_days):
    """
    Build, per variable, a list of daily values sampled at the polygon centroid.

    For every variable, the files for `date` and the `num_days - 1` preceding
    days are looked up under ``<data_dir>/<variable>/Daily/<YYYY><DDD>.nc``
    (DDD = zero-padded day of year) and sampled with `sample_nc_point`.
    Missing values are then filled by interpolating along the time axis
    (in both directions).

    Returns a dict mapping each variable name to its list of values, ordered
    newest first (index 0 corresponds to `date`).
    """
    centroid = polygon.centroid
    lon, lat = centroid.x, centroid.y

    timeseries = {}
    for variable in variables:
        # Sample newest → oldest: offset 0 is `date` itself.
        newest_first = []
        for offset in range(num_days):
            day = date - timedelta(days=offset)
            stamp = f"{day.strftime('%Y')}{str(day.timetuple().tm_yday).zfill(3)}"
            nc_path = os.path.join(data_dir, variable, "Daily", f"{stamp}.nc")
            newest_first.append(sample_nc_point(nc_path, lon, lat))

        # Interpolate in chronological order, then flip back to newest-first.
        chronological = pd.Series(newest_first[::-1])
        filled = chronological.interpolate(limit_direction="both")
        timeseries[variable] = filled[::-1].tolist()

    return timeseries


# === Main script ===
# Load the polygon layer and make sure it is expressed in WGS84 lon/lat,
# because the centroids are used directly as lon/lat query points.
spain = gpd.read_file("crop.geojson")
if spain.crs is None:
    # No CRS stored in the file: label the coordinates as WGS84 (no reprojection).
    spain = spain.set_crs("EPSG:4326")
else:
    spain = spain.to_crs("EPSG:4326")

# --- Configuration ---
data_dir = "new_data/NRT"
variables = ["P", "Pres", "RelHum", "SpecHum", "Temp", "Tmax", "Tmin"]
num_days = 14
test_date = datetime(2025, 9, 10)

# Sample every polygon; keep (index label, {variable: [values...]}) pairs.
results = []
for idx, row in tqdm(spain.iterrows(), total=len(spain), desc="Processing polygons"):
    ts_data = get_environmental_timeseries(row["geometry"], test_date, data_dir, variables, num_days)
    results.append((idx, ts_data))

# Flatten each polygon's time series into one record of new columns:
#   <variable>_<k>   value k days into the series (k = 1 is `test_date`)
#   <variable>_mean  NaN-aware mean over the whole period
records = []
for _, ts_data in results:
    record = {}
    for variable, values in ts_data.items():
        for day_index, value in enumerate(values, start=1):
            record[f"{variable}_{day_index}"] = value
        record[f"{variable}_mean"] = float(np.nanmean(values))
    records.append(record)

# Attach all new columns in a single operation instead of writing them cell by
# cell with .loc (which inserts columns one at a time and is very slow).
# `results` follows the row order of `spain`, so the records share its index.
climate_cols = pd.DataFrame(records, index=spain.index)
# If the input already carries columns with these names, replace them.
overlap = climate_cols.columns.intersection(spain.columns)
spain = spain.drop(columns=overlap).join(climate_cols)

# quick preview for first polygon
if results:
    first_idx, first_ts = results[0]
    print(f"--- Preview for polygon {first_idx} ---")
    for variable, values in first_ts.items():
        print(variable, values[:5], "...")

# Save output
output_file = "crop_new.geojson"
spain.to_file(output_file, driver="GeoJSON")
print("✅ Data has been updated and saved to", output_file)


In [None]:
"""
import os
import geopandas as gpd
import numpy as np
from netCDF4 import Dataset
from datetime import datetime, timedelta
import csv
import rasterio
from rasterio import mask

# Function to calculate the average value of environmental data within a polygon area
def get_average_environmental_data(polygon, date, data_dir, variable):
    # Define the number of days to consider for the average (including today)
    num_days = 5

    # Initialize a list to store the environmental data values
    data_values = []

    # Iterate over the number of days to consider
    for i in range(num_days):
        # Calculate the date for the current iteration
        current_date = date - timedelta(days=i)

        # Construct the filename for the current date
        file_date_str = current_date.strftime('%Y') + str(current_date.timetuple().tm_yday).zfill(3)
        data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")

        # Check if the data file exists
        if not os.path.isfile(data_file):
            print(f"File not found for {variable} on {file_date_str}")
            data_values.append(np.nan)
            continue

        # Read environmental data from the file
        nc = Dataset(data_file, 'r')

        # Extract environmental data variable
        var = None
        for var_name in nc.variables.keys():
            if var_name not in ['lon', 'lat', 'time']:
                var = nc.variables[var_name]
                break

        if var is None:
            print(f"No environmental data variable found in {data_file}")
            data_values.append(np.nan)
            nc.close()
            continue  # Skip this variable and move to the next one

        # Extract bounding box of the polygon
        min_lon, min_lat, max_lon, max_lat = polygon.bounds

        # Read the raster data and clip it to the polygon extent
        with rasterio.open(data_file) as src:
            out_image, out_transform = mask.mask(src, [polygon], crop=True)
            out_image = np.squeeze(out_image)  # Remove singleton dimension

            # Calculate the mean value for the current date and append to the list
            data_values.append(np.nanmean(out_image))

        # Close the netCDF file
        nc.close()

    # Calculate the average value
    average_value = np.nanmean(data_values)

    return average_value

# Load the GeoJSON file into a GeoDataFrame
spain = gpd.read_file('spain_3km_ready.geojson')
spain = spain.to_crs('4623')
# Define the directory containing the data files
data_dir = "new_data/NRT"

# Define the environmental variables you want to include
variables = ['P', 'Pres', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']

# Create a list to store the updated data
updated_data = []

# Define a fixed date for testing
test_date = datetime(2024, 5, 3)  # Replace with the desired date

# Iterate over each polygon in the GeoDataFrame
for index, row in spain.iterrows():
    # Iterate over each variable
    for variable in variables:
        # Calculate the average environmental data for the current polygon, date, and variable
        average_data = get_average_environmental_data(row['geometry'], test_date, data_dir, variable)

        # Add the average environmental data to the row
        row[f'{variable}_avg'] = average_data

    # Append the updated row to the list
    updated_data.append(row)


# Define the output CSV file path
output_file = "today_ready.csv"

# Write the updated data to a new CSV file
with open(output_file, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=updated_data[0].keys())
    writer.writeheader()
    writer.writerows(updated_data)

print("Data has been updated and saved to", output_file)

"""