In [None]:
import pandas as pd

# Load boletus.csv into a DataFrame.
# NOTE(review): `df` is not referenced by any of the other visible cells.
df = pd.read_csv("boletus.csv")

In [None]:
import os
from netCDF4 import Dataset
from datetime import datetime, timedelta
import numpy as np
import csv

# Read the CSV file and keep only the rows that have a non-empty 'location'.
# The file is opened with newline="" as the csv module requires, so that quoted
# fields containing line breaks are parsed correctly on every platform.
with open("negative_samples_within_polygons.csv", "r", newline="") as csvfile:
    data = [row for row in csv.DictReader(csvfile) if row.get('location')]

# Function to extract coordinates from string
def extract_coordinates(coord_str):
    """Parse a coordinate string such as ``"(1.5, 2.5)"`` into a tuple of floats.

    Args:
        coord_str: Comma-separated numbers, optionally wrapped in parentheses
            and optionally padded with whitespace. May be ``None``.

    Returns:
        A tuple with the parsed numbers in the order they appear, or ``None``
        when the input is missing, blank, or an empty pair of parentheses.

    Raises:
        ValueError: If a component cannot be converted to ``float``.
    """
    if not coord_str:
        return None

    inner = coord_str.strip().strip("()").strip()
    if not inner:
        return None

    # Split on the bare comma: float() ignores surrounding whitespace, so
    # "(1,2)", "(1, 2)" and "(1 ,  2)" all parse the same way.
    return tuple(float(part) for part in inner.split(","))

# Function to parse datetime strings with timezone information
def parse_datetime_with_timezone(datetime_str):
    """Return the calendar date of a timestamp string as a naive ``datetime``.

    Only the leading ``YYYY-MM-DD`` token is parsed. Any time-of-day or
    timezone suffix is discarded; no timezone conversion is performed.

    Args:
        datetime_str: A string starting with ``YYYY-MM-DD``, optionally
            followed by whitespace or an ISO-8601 ``T`` and further text.

    Returns:
        A ``datetime`` at midnight of the parsed date.

    Raises:
        ValueError: If the string is blank or the date part is not
            ``YYYY-MM-DD``.
    """
    # Treat the ISO-8601 "T" separator like a space so that both
    # "2023-05-01 10:00:00 +0200" and "2023-05-01T10:00:00+02:00" are accepted.
    tokens = datetime_str.strip().replace("T", " ").split()
    if not tokens:
        raise ValueError("empty datetime string")
    return datetime.strptime(tokens[0], "%Y-%m-%d")

# Function to get environmental data for a given coordinate and date
def get_environmental_data(coord, date, data_dir, variable):
    """Collect one grid value per day for the 14 days ending on ``date``.

    For each day the file ``<data_dir>/<variable>/Daily/<YYYYDDD>.nc`` is
    opened, the first variable that is not ``lon``/``lat``/``time`` is taken
    as the data variable, and the cell at the given coordinate is read from
    its first time step.

    Args:
        coord: Sequence whose element 0 is used as longitude and element 1 as
            latitude, in degrees.
        date: ``datetime`` of the most recent day; earlier days follow.
        data_dir: Root directory of the netCDF tree.
        variable: Name of the sub-directory (e.g. ``"Temp"``).

    Returns:
        A list of 14 values ordered from ``date`` backwards. A day yields
        ``np.nan`` when its file is missing, no data variable is found, or
        the coordinate falls outside the grid.
    """
    values = []

    for i in range(14):
        # %j is already zero-padded to three digits, giving e.g. "2024003".
        file_date_str = (date - timedelta(days=i)).strftime('%Y%j')
        data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")

        if not os.path.isfile(data_file):
            values.append(np.nan)
            print(f"File not found for {variable} on {file_date_str}")
            continue

        nc = Dataset(data_file, 'r')
        try:
            # The data variable is whichever one is not a coordinate axis.
            found_var_name = next(
                (name for name in nc.variables.keys() if name not in ('lon', 'lat', 'time')),
                None,
            )
            if found_var_name is None:
                print(f"No environmental data variable found in {data_file}")
                values.append(np.nan)
                continue

            # NOTE(review): this loads the whole grid to read one cell; indexing
            # the variable directly would be cheaper but may change the scalar
            # type that ends up in the CSV, so it is left as is.
            var_data = nc.variables[found_var_name][:]

            try:
                # Assumes a global 0.1-degree grid whose first row is at
                # latitude 90 and first column at longitude -180, with axis
                # order (time, lat, lon) -- TODO confirm against the files.
                lat_index = int((90 - coord[1]) / 0.1)
                lon_index = int((180 + coord[0]) / 0.1)
                if lat_index < 0 or lon_index < 0:
                    # A negative index would silently wrap to the far edge of
                    # the grid instead of failing, so reject it explicitly.
                    raise IndexError("coordinate outside the grid")
                data_value = var_data[0, lat_index, lon_index]
            except IndexError:
                print(f"Index out of range for {variable} on {file_date_str}")
                values.append(np.nan)
                continue

            values.append(data_value)
        finally:
            # Runs on every path, including the `continue` branches above,
            # so the file handle is never leaked.
            nc.close()

    return values

# Environmental variables to attach to every sample (one sub-directory each)
variables = ['P', 'Pres', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']

# Path to the directory containing the data files
data_dir = "MSWX_V100/Past"

# Enrich every sample in place with the per-day values of each variable.
rows_to_remove = []  # Indices of rows without usable coordinates
for idx, row in enumerate(data):
    coords = extract_coordinates(row['location'])
    if coords is None:
        print(f"Skipping row {idx+1}: Missing or invalid coordinates")
        rows_to_remove.append(idx)
        continue

    observed_date = parse_datetime_with_timezone(row['observed_on'])

    for variable in variables:
        environmental_data = get_environmental_data(coords, observed_date, data_dir, variable)

        # Column "<variable>_1" is the observation day, "<variable>_2" the day before, ...
        for i, value in enumerate(environmental_data):
            row[f"{variable}_{i+1}"] = value

# Rows listed in rows_to_remove are deliberately kept: they are written out
# with empty environmental columns rather than dropped.

# Write the updated data to a new CSV file
output_file = "negative_samples_within_polygons_updated.csv"

# Take the header from ALL rows (first-seen order), not just the first one:
# if the first row was skipped it has no environmental columns, and DictWriter
# would then raise ValueError on every enriched row.
fieldnames = list(dict.fromkeys(key for row in data for key in row))
with open(output_file, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print("Data has been updated and saved to", output_file)


In [None]:
import os
import geopandas as gpd
import numpy as np
from netCDF4 import Dataset
from datetime import datetime, timedelta
import csv
import rasterio
from rasterio import mask

# Function to calculate the average value of environmental data within a polygon area
def get_average_environmental_data(polygon, date, data_dir, variable):
    """Average a variable inside a polygon over the 5 days ending on ``date``.

    For each day the file ``<data_dir>/<variable>/Daily/<YYYYDDD>.nc`` is
    clipped to the polygon and the mean of the pixels inside it is taken;
    the daily means are then averaged, ignoring NaN.

    Args:
        polygon: Geometry accepted by ``rasterio.mask.mask`` (it must be in
            the raster's coordinate system).
        date: ``datetime`` of the most recent day.
        data_dir: Root directory of the netCDF tree.
        variable: Name of the sub-directory (e.g. ``"Temp"``).

    Returns:
        The NaN-ignoring mean of the daily means, or NaN when no day
        produced a value.

    Raises:
        ValueError: Propagated from ``rasterio.mask.mask`` when the polygon
            does not overlap the raster.
    """
    # Number of days to consider for the average (including `date` itself)
    num_days = 5
    data_values = []

    for i in range(num_days):
        current_date = date - timedelta(days=i)

        # %j is the zero-padded day of the year, giving e.g. "2024124".
        file_date_str = current_date.strftime('%Y%j')
        data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")

        if not os.path.isfile(data_file):
            print(f"File not found for {variable} on {file_date_str}")
            data_values.append(np.nan)
            continue

        # Check that the file holds something besides its coordinate axes.
        # The handle is closed before rasterio opens the file, and also if
        # inspecting it fails.
        nc = Dataset(data_file, 'r')
        try:
            has_data_var = any(
                name not in ('lon', 'lat', 'time') for name in nc.variables.keys()
            )
        finally:
            nc.close()

        if not has_data_var:
            print(f"No environmental data variable found in {data_file}")
            data_values.append(np.nan)
            continue  # Skip this day and move to the next one

        with rasterio.open(data_file) as src:
            # filled=False returns a masked array. With the default
            # (filled=True) pixels outside the polygon are overwritten with
            # the nodata value (or 0) and would be averaged in as real data.
            clipped, _ = mask.mask(src, [polygon], crop=True, filled=False)

        # Keep only the pixels that are inside the polygon and not nodata.
        # NOTE(review): with the default all_touched=False a polygon smaller
        # than a pixel may select no pixel at all, which yields NaN here.
        inside = np.ma.compressed(clipped)
        data_values.append(np.nanmean(inside) if inside.size else np.nan)

    return np.nanmean(data_values)

# Load the GeoJSON file into a GeoDataFrame
spain = gpd.read_file('spain_3km_ready.geojson')
# NOTE(review): this previously read '4623', which looks like a transposition
# of 4326; the later cell that loads the same file uses 'EPSG:4326'.
spain = spain.to_crs('EPSG:4326')

# Define the directory containing the data files
data_dir = "new_data/NRT"

# Define the environmental variables you want to include
variables = ['P', 'Pres', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']

# Create a list to store the updated data
updated_data = []

# Define a fixed date for testing
test_date = datetime(2024, 5, 3)  # Replace with the desired date

# Build one plain dict per polygon. csv.DictWriter expects dict-like rows;
# a pandas Series (what iterrows() yields) does not behave like one.
for index, row in spain.iterrows():
    record = row.to_dict()
    for variable in variables:
        record[f'{variable}_avg'] = get_average_environmental_data(
            row['geometry'], test_date, data_dir, variable
        )
    updated_data.append(record)


# Define the output CSV file path
output_file = "today_ready.csv"

# Write the updated data to a new CSV file (header only if there are no polygons)
fieldnames = list(updated_data[0].keys()) if updated_data else []
with open(output_file, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(updated_data)

print("Data has been updated and saved to", output_file)


In [1]:
import os
import geopandas as gpd
import numpy as np
from datetime import datetime, timedelta
import rasterio
from shapely.geometry import Point
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor  # Import ThreadPoolExecutor

# Function to calculate the average value of environmental data within a polygon area
def get_average_environmental_data(polygon, date, data_dir, variables, num_days):
    """Average each variable at a polygon's centre over the last ``num_days`` days.

    The sample point is the centre of the polygon's bounding box. For every
    variable and day, band 1 of ``<data_dir>/<variable>/Daily/<YYYYDDD>.nc``
    is read at the pixel containing that point.

    Args:
        polygon: Object exposing ``bounds`` as ``(min_x, min_y, max_x, max_y)``
            in the raster's coordinate system.
        date: ``datetime`` of the most recent day.
        data_dir: Root directory of the netCDF tree.
        variables: Iterable of variable (sub-directory) names.
        num_days: Number of days to average, counting ``date`` itself.

    Returns:
        A dict mapping each variable name to the NaN-ignoring mean of its
        daily values. Days whose file is missing, or whose sample point falls
        outside the raster, count as NaN.
    """
    # The sample point depends only on the polygon, so compute it once.
    min_x, min_y, max_x, max_y = polygon.bounds
    center_x = (min_x + max_x) / 2
    center_y = (min_y + max_y) / 2

    mean_values = {}
    for variable in variables:
        variable_values = []
        for i in range(num_days):
            current_date = date - timedelta(days=i)
            # %j is the zero-padded day of the year, giving e.g. "2024124".
            file_date_str = current_date.strftime('%Y%j')
            data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")
            if not os.path.isfile(data_file):
                print(f"File not found for {variable} on {file_date_str}")
                variable_values.append(np.nan)
                continue
            with rasterio.open(data_file, mode="r") as src:
                # index() returns (row, col) -- NOT (x, y) -- and a window is
                # ((row_start, row_stop), (col_start, col_stop)). Unpacking
                # the pair as (px, py) transposed the two and sampled the
                # wrong pixel.
                row, col = src.index(center_x, center_y)
                if 0 <= row < src.height and 0 <= col < src.width:
                    value = src.read(1, window=((row, row + 1), (col, col + 1)))
                    variable_values.append(value[0, 0])
                else:
                    # Sample point lies outside the raster extent.
                    variable_values.append(np.nan)
        mean_values[variable] = np.nanmean(variable_values)
    return mean_values

# Polygons to enrich, reprojected to EPSG:4326
spain = gpd.read_file('spain_3km_ready.geojson').to_crs('EPSG:4326')

# Inputs for the extraction below
test_date = datetime(2024, 5, 3)  # most recent day of the window
num_days = 14                     # length of the window, counting test_date
variables = ['P', 'Pres', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']
data_dir = "new_data/NRT"         # root of the <variable>/Daily/<YYYYDDD>.nc tree

# Define a function to process a single row

def process_row(row):
    """Compute the environmental columns for one polygon.

    Args:
        row: One row of ``spain`` (as yielded by ``iterrows``); only its
            ``'geometry'`` entry is read.

    Returns:
        A dict mapping ``"<variable>_<day>"`` (day = 1..num_days) to a value.
        It is returned instead of being written into ``spain`` so that worker
        threads never touch the shared GeoDataFrame.
    """
    average_data = get_average_environmental_data(row['geometry'], test_date, data_dir, variables, num_days)
    # NOTE(review): every "<variable>_<day>" column receives the same
    # num_days mean; no per-day value is available from the helper. Confirm
    # that this is what the consumer of these columns expects.
    return {
        f'{variable}_{day_index+1}': value
        for day_index in range(num_days)
        for variable, value in average_data.items()
    }

# Run the file reads on 8 worker threads. The workers only compute; results
# are collected in submission order, which is the row order of `spain`.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(process_row, row) for index, row in spain.iterrows()]
    results = [
        future.result()  # Wait for each task; re-raises any worker exception
        for future in tqdm(futures, total=len(spain), desc="Processing polygons")
    ]

# Add each column once, from the main thread, instead of enlarging the frame
# one cell at a time from several threads through spain.loc[...].
if results:
    for column in results[0]:
        spain[column] = [result[column] for result in results]

output_file = "spain_today.geojson"
spain.to_file(output_file, driver='GeoJSON')

print("Data has been updated and saved to", output_file)


  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
Processing polygons:  24%|██▍       | 13972/57840 [1:06:18<3:43:46,  3.27it/s]