In [None]:
import pandas as pd
import re

# Read the observation table and the topography table.
df = pd.read_csv('data//inaturalist_boletus_edulis_with_corine_climate_nonan.csv')
df2 = pd.read_csv('data//inaturalist_boletus_edulis_with_coords_topography.csv')

# Keep one topography record per location so the left join below cannot
# multiply observation rows when the same location occurs more than once
# in the topography table.
topo_columns = ['location', 'dem', 'slope', 'aspect', 'geomorphon']
topography = df2[topo_columns].drop_duplicates(subset='location')

# Attach the topography columns to every observation, matching on 'location'.
df = df.merge(topography, on='location', how='left', suffixes=('', '_df2'))

# Discard rows with any missing value, then rows whose 'dem' equals -32768
# (presumably the elevation raster's nodata sentinel -- TODO confirm).
df = df.dropna()
df = df[df['dem'] != -32768]

# Persist the merged table without the index column.
df.to_csv('data/inaturalist_boletus_edulis_with_corine_climate_topography.csv', index=False)

In [None]:
import matplotlib.pyplot as plt

# Precipitation columns P_1 ... P_14 that are present in df. Defined here so
# this cell runs on a fresh kernel instead of relying on a cell further down.
rain_columns = [f'P_{i}' for i in range(1, 15) if f'P_{i}' in df.columns]

# Per-column mean and standard deviation across all rows.
mean_rain = df[rain_columns].mean()
std_rain = df[rain_columns].std()

# Numeric x positions (P_1 -> 0, P_2 -> 1, ...) so the line, the band and the
# tick positions all share the same axis units.
days = list(range(len(rain_columns)))

plt.figure(figsize=(10,6))
plt.plot(days, mean_rain.values, color="blue", linewidth=3, label="Mean Rainfall")
plt.fill_between(days,
                 (mean_rain - std_rain).values,
                 (mean_rain + std_rain).values,
                 color="blue", alpha=0.2, label="±1 Std Dev")

plt.xticks(days, labels=[str(day) for day in days])
plt.xlabel("Days Ago (0 = today)")
plt.ylabel("Rainfall (mm)")
plt.title("Average Rainfall Trend with Variability")
plt.legend()
plt.show()


In [None]:
# Names P_1 ... P_14, in that order, restricted to those that exist as columns of df.
rain_columns = [name for name in (f'P_{day}' for day in range(1, 15)) if name in df.columns]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Precipitation columns P_1 ... P_14 that are present in df.
rain_columns = [f'P_{i}' for i in range(1, 15) if f'P_{i}' in df.columns]

# Long format: one row per (original row, day column) pair.
df_long = df[rain_columns].melt(var_name="Day", value_name="Rain")
# Raw string avoids the invalid "\d" escape warning; expand=False returns a
# Series rather than a one-column DataFrame. Subtracting 1 maps P_1 -> 0, P_2 -> 1, ...
df_long["Day"] = df_long["Day"].str.extract(r"(\d+)", expand=False).astype(int) - 1

# Mean line with a 95% bootstrapped confidence band. `errorbar` replaces the
# deprecated `ci` argument; `seed` makes the bootstrap reproducible.
plt.figure(figsize=(10,4))
sns.lineplot(
    data=df_long, x="Day", y="Rain",
    errorbar=("ci", 95), n_boot=1000, seed=42,
    color="navy"
)
plt.xlabel("Days Ago (0 = today)")
plt.ylabel("Rainfall (mm)")
plt.title("Rainfall Trends with 95% Bootstrapped CIs")
plt.grid()
plt.tight_layout()
plt.show()


In [None]:
import numpy as np

# Single-row heatmap: one annotated cell per rain column, coloured by its mean.
daily_mean = df[rain_columns].mean().values
day_labels = [str(day) for day in range(len(rain_columns))]

plt.figure(figsize=(10,2))
sns.heatmap(
    daily_mean[np.newaxis, :],  # heatmap needs 2-D input: shape (1, n_columns)
    cmap="Blues",
    annot=True,
    cbar=False,
    xticklabels=day_labels,
)
plt.xlabel("Days Ago (0 = today)")
plt.title("Average Rainfall Across Days Before Observation")
plt.show()


In [None]:
# Two-row matrix: row 0 holds the per-column means, row 1 the per-column
# standard deviations of the rain columns.
rain_frame = df[rain_columns]
stats = np.vstack((rain_frame.mean().values, rain_frame.std().values))

day_labels = [str(day) for day in range(len(rain_columns))]
row_labels = ["Mean", "Std Dev"]

plt.figure(figsize=(10,2.5))
sns.heatmap(
    stats,
    cmap="Blues",
    annot=True,
    fmt=".1f",
    cbar=True,
    xticklabels=day_labels,
    yticklabels=row_labels,
)
plt.xlabel("Days Ago (0 = today)")
plt.title("Rainfall Statistics Across Days")
plt.show()


In [None]:
import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_line, geom_ribbon, labs, theme_minimal, theme_xkcd
import random

# 1. Select the precipitation columns P_1 ... P_14 that are present in df
rain_columns = [f'P_{i}' for i in range(1, 15) if f'P_{i}' in df.columns]
rain_frame = df[rain_columns]  # sliced once, reused by every bootstrap draw

# 2. Compute the per-column mean values
mean_vals = rain_frame.mean().values

# 3. Bootstrap for confidence intervals: resample the rows with replacement
#    1000 times and record the column means of each resample. A seeded random
#    state makes the interval identical on every run.
bootstrap_rng = np.random.RandomState(42)
boot_means = np.array([
    rain_frame.sample(frac=1, replace=True, random_state=bootstrap_rng).mean().values
    for _ in range(1000)
])

# 2.5th and 97.5th percentiles across resamples give the 95% interval per column
ci_lower, ci_upper = np.percentile(boot_means, [2.5, 97.5], axis=0)

# 4. Create dataframe for plotting
ci_df = pd.DataFrame({
    "Day": range(len(rain_columns)),   # position of the column: P_1 -> 0, P_2 -> 1, ...
    "Mean": mean_vals,
    "Lower": ci_lower,
    "Upper": ci_upper
})

# 5. Plot with plotnine (ggplot2 style): mean line plus the CI ribbon
p = (
    ggplot(ci_df, aes(x="Day", y="Mean"))
    + geom_line(color="navy")
    + geom_ribbon(aes(ymin="Lower", ymax="Upper"), alpha=0.2, fill="skyblue")
    + labs(
        title="Rainfall Trends with 95% Bootstrapped CIs",
        x="Days Ago (0 = today)",
        y="Rainfall (mm)"
    )
    + theme_xkcd()   # xkcd (hand-drawn) theme; swap in theme_minimal() for a plain look
)

print(p)


# Append variables for inference (on vector file)

In [None]:
import pandas as pd
import re

# Read the CSV file into `df`. The code below reads its 'location' column and
# parses each value as a "(lat, lon)" string.
df = pd.read_csv('data/negative_samples_within_land_10k.csv')

# Function to extract coordinates from the location string
def parse_coordinates(location_str):
    """Split a "(lat, lon)" string into a pair of floats.

    Parameters
    ----------
    location_str : object
        Expected to be a string containing "(<lat>, <lon>)"; text before or
        after the parenthesised pair is ignored.

    Returns
    -------
    tuple
        ``(lat, lon)`` as floats, or ``(None, None)`` when the value is
        missing, is not a string, contains no parenthesised pair, or either
        part cannot be converted to float.
    """
    unparsed = (None, None)

    # Missing values and non-string inputs cannot be parsed.
    is_missing = pd.isna(location_str)
    if is_missing or not isinstance(location_str, str):
        return unparsed

    # Capture the two comma-separated parts inside the first "(...)" pair.
    found = re.search(r'\(([^,]+),\s*([^)]+)\)', location_str)
    if found is None:
        return unparsed

    try:
        lat, lon = (float(part.strip()) for part in found.groups())
    except ValueError:
        # At least one part is not a valid number.
        return unparsed
    return lat, lon

# Extract coordinates into separate columns. Each location is parsed once and
# the resulting (lat, lon) tuples are expanded in a single DataFrame, which
# avoids building a pd.Series per row and also works for an empty table.
# Unparseable locations become missing values in both columns.
parsed = df['location'].map(parse_coordinates).tolist()
df[['y', 'x']] = pd.DataFrame(parsed, index=df.index, columns=['y', 'x'])

# Optional: Remove the original location column if you don't need it
# df = df.drop('location', axis=1)

# Save the new CSV with coordinate columns
df.to_csv('data/negative_samples_within_land_10k_with_coords.csv', index=False)

# Summary of how many rows received usable coordinates
print("CSV file saved with separate x and y coordinate columns!")
print(f"Total rows processed: {len(df)}")
print(f"Rows with valid coordinates: {df[['x', 'y']].dropna().shape[0]}")
print(f"Rows with missing coordinates: {df[['x', 'y']].isna().any(axis=1).sum()}")
print("\nFirst few rows with new columns:")
print(df[['species', 'x', 'y', 'observed_on']].head())

In [None]:
import pandas as pd

# Load the table and set the 'species' column of every row to the string "None".
df = pd.read_csv('negative_samples_within_polygons_updated.csv')
df["species"] = "None"

# Drop every row that contains at least one NaN, then renumber the index from 0.
df_cleaned = df.dropna().reset_index(drop=True)

# Show the cleaned table
print(df_cleaned)


In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Load the GeoJSON polygon layer used as the spatial filter
spain_polygon = gpd.read_file('spain.geojson')

# Work on a copy so df_cleaned keeps its 'location' column and this cell can
# be re-run without a KeyError.
df = df_cleaned.copy()

# Split each "(lat,lon)" string of the 'location' column into two float columns
df[['latitude', 'longitude']] = df['location'].str.extract(r'\(([^,]+),([^)]+)\)').astype(float)

# The original "location" column is no longer needed
df = df.drop(columns=['location'])

# Build point geometries (x = longitude, y = latitude). The polygon layer's CRS
# is assigned to the points, not reprojected: this assumes the coordinates are
# already expressed in that CRS -- TODO confirm.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs=spain_polygon.crs,
)

# Spatial join keeping only the points that fall within a polygon.
# `predicate` is the current name of the deprecated `op` argument.
points_inside_polygon = gpd.sjoin(gdf, spain_polygon, predicate='within')

# Drop the helper column added by the join
points_inside_polygon = points_inside_polygon.drop(columns=['index_right'])
points_inside_polygon


In [None]:
import matplotlib.pyplot as plt

# Variable prefixes; each is expected to have columns <prefix>_1 ... <prefix>_14
variables_to_plot = ['P', 'Tmin', 'Temp', 'RelHum', 'SpecHum', 'Pres', 'Tmax']

# For every variable, the per-row mean over its 14 columns
aggregated_data = {
    var: points_inside_polygon[[f'{var}_{i}' for i in range(1, 15)]].mean(axis=1)
    for var in variables_to_plot
}

# One histogram per variable, stacked vertically
n_vars = len(variables_to_plot)
fig, axs = plt.subplots(n_vars, 1, figsize=(10, 6 * n_vars))
for ax, var in zip(axs, variables_to_plot):
    ax.hist(aggregated_data[var], bins=20, alpha=0.7, color='blue')
    ax.set_title(f'Distribution of {var} across all days')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Write the rows kept by the spatial join to CSV, omitting the index column.
points_inside_polygon.to_csv('boletus_spain_negative.csv', index=False)