In [None]:
import pandas as pd 
# Two input tables: the iNaturalist Boletus edulis observations and the
# negative samples (descriptions taken from the file names).
data = pd.read_csv("data/inaturalist_boletus_edulis_with_el_aspect_corine_weather.csv")
data2 = pd.read_csv("data/negative_samples_el_aspect_corine_weather.csv")

# Every row of the negative-sample table is labelled with the string "None".
data2 = data2.assign(species="None")

# Drop the "Unnamed: 0" column from the observation table.
data = data.drop(columns=["Unnamed: 0"])


In [None]:
# Display-only preview of both tables stacked row-wise; the result is not assigned.
pd.concat([data, data2], sort=False)

In [None]:
# Stack both tables row-wise (column order left unsorted), then discard every
# row that contains at least one NaN so model training never sees missing values.
df = pd.concat([data, data2], sort=False).dropna()


In [None]:
# Display the combined frame after NaN rows were dropped.
df

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Work on the combined, NaN-free frame built earlier in the notebook
data = df

# Feature columns: six per-period series with 15 periods each (P1..P15),
# followed by three static columns. Order matters and is kept stable.
series_prefixes = ['tmin', 'tmax', 'temp', 'rel_humidity', 'precipitation', 'wind_speed']
feature_columns = [
    f"{prefix}_P{period}"
    for prefix in series_prefixes
    for period in range(1, 16)
]
feature_columns += ['LC', 'elevation', 'aspect']

X = data[feature_columns]
y = data['species']

from sklearn.utils import resample

# Put features and label side by side so whole rows can be resampled together
df_combined = X.copy()
df_combined['species'] = y

# Size of the rarest class: every class is cut down to this many rows
class_counts = df_combined['species'].value_counts()
min_class_count = class_counts.min()

# Draw (without replacement) the same number of rows from each class
downsampled_parts = []
for label in class_counts.index:
    rows_for_label = df_combined[df_combined['species'] == label]
    downsampled_parts.append(
        resample(
            rows_for_label,
            replace=False,
            n_samples=min_class_count,
            random_state=43,
        )
    )
balanced_df = pd.concat(downsampled_parts)

# Shuffle so the classes are interleaved, and renumber the rows
balanced_df = balanced_df.sample(frac=1, random_state=43).reset_index(drop=True)

# Split back into features / target
y_balanced = balanced_df['species']
X_balanced = balanced_df.drop(columns='species')

# Stratified 85/15 train/test split of the balanced rows
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.15,
    random_state=43,
    stratify=y_balanced,
)

# Base estimator and the hyperparameter grid to search (2 x 2 x 2 combinations)
gb_clf = GradientBoostingClassifier(random_state=43)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

# 5-fold cross-validated grid search, scored by accuracy
grid_search = GridSearchCV(gb_clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Refit-on-all-training-data model for the best parameter combination
best_gb_clf = grid_search.best_estimator_

# Evaluate on the held-out rows
y_pred = best_gb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train on the combined, NaN-free frame built earlier in the notebook
data = df

# Feature columns: six per-period series with 14 periods each (P1..P14),
# followed by 'elevation' and 'aspect'. Order is significant for the fitted model.
required_variables = [
    f"{prefix}_P{period}"
    for prefix in ('tmin', 'tmax', 'temp', 'rel_humidity', 'precipitation', 'wind_speed')
    for period in range(1, 15)
] + ['elevation', 'aspect']

# Selecting relevant features and target variable.
# The target is taken from the same frame as X so that this cell does not
# depend on a `y` left behind by another cell (which may hold different rows
# or an already-encoded 0/1 target).
X = data[required_variables]
y = data['species']

# Hold out 10% of the rows for testing. Note: no class balancing or
# stratification is applied here; the split is a plain seeded random split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=45)

# Initialize Gradient Boosting Classifier with fixed hyperparameters
gb_clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=3, random_state=43)

# Train the model
gb_clf.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = gb_clf.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


In [None]:
import joblib

# Serialize the in-memory `gb_clf` estimator to a pickle file with joblib.
joblib.dump(gb_clf, 'docker//data//models//gradient_boosting_model_v5.pkl')


Logistic regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the dataset
data = df   # make sure df is already defined
target_col = "species"   # your target column

# Feature columns: six per-period series with 14 periods each (P1..P14),
# followed by 'elevation' and 'aspect'.
required_variables = [
    f"{prefix}_P{period}"
    for prefix in ('tmin', 'tmax', 'temp', 'rel_humidity', 'precipitation', 'wind_speed')
    for period in range(1, 15)
] + ['elevation', 'aspect']

# Extract features. Take an explicit copy: the columns are overwritten below,
# and assigning into a slice of `data` triggers pandas' SettingWithCopyWarning
# and leaves it ambiguous whether `data` itself is modified.
X = data[required_variables].copy()

# --- Step 1: Encode categorical features (None -> 0, others -> 1) ---
# Only object-dtype columns are touched; each value becomes 0 when it is
# missing or the literal string "None", and 1 otherwise.
categorical_columns = X.select_dtypes(include=["object"]).columns
for col in categorical_columns:
    X[col] = X[col].apply(lambda x: 0 if pd.isnull(x) or x == "None" else 1)

# --- Step 2: Encode target (None=0, rest=1) ---
y = data[target_col].apply(lambda x: 0 if x == "None" else 1)

# --- Step 3: Train-test split (stratified, 10% held out) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=45, stratify=y
)

# --- Step 4: Oversample minority with SMOTE ---
# Resampling is applied to the training rows only, so the test set keeps the
# original class proportions.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# --- Step 5: Logistic Regression ---
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_res, y_res)


# --- Step 6: Predictions ---
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1]  # probability of the encoded class 1

# --- Step 7: Evaluation ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=["None","Other"]))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


In [None]:
import joblib

# Serialize the in-memory `log_reg` estimator to a pickle file with joblib.
joblib.dump(log_reg, 'docker//data//models//lr_model_v5.pkl')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

# ----------------------
# Data preparation
# ----------------------
data = df   # combined frame; must already exist in the kernel
target_col = "species"   # column holding the label

# Six per-period series with 14 periods each (P1..P14), then two static columns
required_variables = [
    f"{prefix}_P{period}"
    for prefix in ('tmin', 'tmax', 'temp', 'rel_humidity', 'precipitation', 'wind_speed')
    for period in range(1, 15)
] + ['elevation', 'aspect']


def _presence_flag(value):
    """Return 0 for a missing value or the literal string "None", otherwise 1."""
    return 0 if pd.isnull(value) or value == "None" else 1


# Independent copy of the feature columns; object-dtype columns are reduced
# to a 0/1 presence flag
X = data[required_variables].copy()
categorical_columns = X.select_dtypes(include=["object"]).columns
for col in categorical_columns:
    X[col] = X[col].map(_presence_flag)

# Binary target: the label "None" becomes 0, every other label becomes 1
y = data[target_col].map(lambda label: 1 if label != "None" else 0)

# Stratified split with 10% of the rows held out
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=45, stratify=y
)


def _print_report(header, predicted_labels, predicted_probabilities):
    """Print accuracy, classification report and ROC-AUC against `y_test`."""
    print(header)
    print("Accuracy:", accuracy_score(y_test, predicted_labels))
    print("Classification Report:\n", classification_report(y_test, predicted_labels, target_names=["None","Other"]))
    print("ROC-AUC:", roc_auc_score(y_test, predicted_probabilities))


# ----------------------
# Option 1: class-weighted logistic regression
# ----------------------
log_reg_balanced = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
log_reg_balanced.fit(X_train, y_train)

y_pred1 = log_reg_balanced.predict(X_test)
y_prob1 = log_reg_balanced.predict_proba(X_test)[:, 1]

_print_report("=== Option 1: Logistic Regression (class_weight balanced) ===", y_pred1, y_prob1)


# ----------------------
# Option 3: same base model wrapped in isotonic probability calibration (5-fold)
# ----------------------
log_reg_base = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
calibrated_clf = CalibratedClassifierCV(log_reg_base, method='isotonic', cv=5)
calibrated_clf.fit(X_train, y_train)

y_pred3 = calibrated_clf.predict(X_test)
y_prob3 = calibrated_clf.predict_proba(X_test)[:, 1]

_print_report("\n=== Option 3: Logistic Regression (Calibrated) ===", y_pred3, y_prob3)


In [None]:
import geopandas as gpd
import pandas as pd
import joblib

# The classifier is expected to already be in memory as `gb_clf`
# (fitted in an earlier cell); nothing is loaded from disk here.

# Load the GeoJSON file into a GeoDataFrame and reproject to EPSG:4326
spain = gpd.read_file('updated_spain.geojson')
spain = spain.to_crs('EPSG:4326')
# Rename to the column names used in the feature list below
spain = spain.rename(columns={'mode_value': 'LC', 'mean_elevation': 'elevation'})

# Define the directory containing the data files (not read by this cell)
data_dir = "MSWX_V100/Past"

# Feature columns expected in the GeoJSON: seven series with 14 days each
# (<name>_1 .. <name>_14) followed by 'LC' and 'elevation'.
# NOTE(review): these names differ from the tmin_P*/tmax_P*/... columns used
# to fit `gb_clf` elsewhere in this notebook; confirm that the model in memory
# was trained on this feature set before trusting the output.
required_variables = [
    f"{prefix}_{day}"
    for prefix in ('Pres', 'P', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin')
    for day in range(1, 15)
] + ['LC', 'elevation']

# Score every polygon in a single call instead of one predict_proba call and
# one DataFrame concat per row (which is quadratic in the number of polygons).
class_probabilities = gb_clf.predict_proba(spain[required_variables])

# Column 0 of predict_proba is the probability of the first entry in gb_clf.classes_
species_prediction = class_probabilities[:, 0]

# Plain table of geometry + prediction, one row per polygon
predictions_df = pd.DataFrame({
    'geometry': list(spain['geometry']),
    'species_prediction': species_prediction,
})

# Attach the predictions to the GeoDataFrame (positional, same row order)
spain_with_predictions = spain.assign(species_prediction=species_prediction)

# Define the output GeoJSON file path
output_file = "spain_with_species_predictions.geojson"

# Write the GeoDataFrame with predictions to a new GeoJSON file
spain_with_predictions.to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)


In [None]:
import shap
import matplotlib.pyplot as plt

# Explain the model using exactly the columns it was fitted on. Estimators
# fitted on a DataFrame expose them as `feature_names_in_`; the notebook-level
# `required_variables` is only a fallback, because several cells reassign that
# name to different column lists.
if hasattr(gb_clf, "feature_names_in_"):
    shap_feature_columns = list(gb_clf.feature_names_in_)
else:
    shap_feature_columns = required_variables

# We'll use the first 100 rows for speed, but you can adjust as needed
X_sample = df[shap_feature_columns].iloc[:100]

# Create a SHAP explainer for the trained model, with the sample as background data
explainer = shap.Explainer(gb_clf, X_sample)

# Calculate SHAP values for the sample
shap_values = explainer(X_sample)

# Plot a summary SHAP graph (beeswarm plot)
shap.summary_plot(shap_values, X_sample, plot_type="dot", show=True)

# Optionally, plot a bar chart of mean absolute SHAP values (feature importance)
shap.summary_plot(shap_values, X_sample, plot_type="bar", show=True)

In [None]:
# Feature importance of the fitted gradient-boosting model
feature_importance = gb_clf.feature_importances_

# Label each importance with the column name the model was fitted on. The
# notebook-level `X` is only a fallback: other cells rebind `X` (in one case to
# a NumPy array), so it is not guaranteed to describe this model.
if hasattr(gb_clf, "feature_names_in_"):
    feature_names = gb_clf.feature_names_in_
else:
    feature_names = X.columns

# Create a DataFrame of (feature, importance), most important first
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the 15 most important features
print(feature_importance_df.head(15))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Experimental LSTM cell: builds a per-day feature matrix from `df`, then fits
# a small Keras LSTM as a binary classifier.
# NOTE(review): several shape assumptions below look inconsistent with each
# other (see inline notes); confirm this cell actually runs before relying on it.

# Load your data
data = df

# Extract static features
X_static = data[['LC', 'elevation']]  # Add other static variables here

# Reshape the data: for each variable, take columns <var>_1 .. <var>_14, stack
# them into one long series, and place the seven series side by side.
# NOTE(review): these column names (P_1, Pres_1, ...) differ from the
# tmin_P1/tmax_P1/... names used by the training cells; confirm `df` has them.
temporal_vars = ['P', 'Pres', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']
ordered_data = pd.concat([pd.concat([data[[f"{var}_{i+1}"]] for i in range(14)], axis=1).stack().reset_index(level=1, drop=True) for var in temporal_vars], axis=1)

# Merge temporal and static features: each static row is repeated 14 times so
# it lines up with the stacked temporal rows
X_temporal = ordered_data.values
X_static_repeated = np.repeat(X_static.values, 14, axis=0)
X = np.concatenate([X_temporal, X_static_repeated], axis=1)

# Keep only the first 525 rows.
# NOTE(review): 525 is hard-coded, presumably the number of samples in `y` at
# the time of writing; this keeps the first 525 stacked rows, not one row per
# sample. TODO confirm intent.
X = X[:525]

# Encode the target variable as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['species'])

# Split the data into train and test sets (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train[:, :-2].shape)

# Reshape input data for LSTM (samples, timesteps, features)
# NOTE(review): X is built above from 7 temporal + 2 static columns, so
# X_train[:, :-2] appears to have 7 columns per row, while this reshape needs
# 14 * 7 values per row. TODO confirm.
X_train_temporal = np.reshape(X_train[:, :-2], (X_train.shape[0], 14, 7))
# np.tile with reps (1, 14, 1) on a 2-D array yields a 3-D array whose first
# axis has length 1. NOTE(review): likely not the intended (samples, 14, 2).
X_train_static = np.tile(X_train[:, -2:], (1, 14, 1))

# Similarly, reshape the test data
X_test_temporal = np.reshape(X_test[:, :-2], (X_test.shape[0], 14, -1))
X_test_static = np.tile(X_test[:, -2:], (1, 14, 1))

# Define the LSTM model: one 64-unit LSTM layer followed by a single sigmoid output
model = Sequential([
    LSTM(64, input_shape=(14, X_train.shape[1] // 14)),
    Dense(1, activation='sigmoid')
])

# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): a list of two arrays is passed to a Sequential model that
# declares a single input; verify this against the Keras version in use.
model.fit([X_train_temporal, X_train_static], y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model: threshold the sigmoid output at 0.5 and compute accuracy
y_pred = model.predict([X_test_temporal, X_test_static])
y_pred_binary = (y_pred > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_binary)
print("Test Accuracy:", accuracy)


In [None]:
import os
import geopandas as gpd
import numpy as np
from datetime import datetime, timedelta
import rasterio
from shapely.geometry import Point
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import joblib


def get_environmental_data(polygon, date, data_dir, variables, num_days):
    """Sample daily raster values at the centre of a polygon's bounding box.

    For every name in ``variables`` and every one of the ``num_days`` days
    ending on ``date`` (inclusive), the file
    ``<data_dir>/<variable>/Daily/<YYYY><DDD>.nc`` (``DDD`` = zero-padded day of
    year) is opened with rasterio and band 1 is read at the single pixel that
    contains the bounding-box centre.

    Args:
        polygon: Geometry exposing ``.bounds`` as ``(min_x, min_y, max_x, max_y)``
            in the raster's coordinate system.
        date: Last (most recent) day to sample; a ``datetime``.
        data_dir: Root directory holding one sub-directory per variable.
        variables: Iterable of variable names (sub-directory names).
        num_days: Number of consecutive days to sample, counting back from ``date``.

    Returns:
        dict mapping each variable name to a list of ``num_days`` values ordered
        from the oldest day to ``date`` itself. A day whose file is missing
        contributes ``np.nan`` (and a message is printed).
    """
    data = {}
    for variable in variables:
        variable_values = []
        for i in range(num_days):
            current_date = date - timedelta(days=i)
            file_date_str = current_date.strftime('%Y') + str(current_date.timetuple().tm_yday).zfill(3)
            data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")
            if not os.path.isfile(data_file):
                print(f"File not found for {variable} on {file_date_str}")
                variable_values.append(np.nan)
                continue
            with rasterio.open(data_file, mode="r") as src:
                min_x, min_y, max_x, max_y = polygon.bounds
                centre_x = (min_x + max_x) / 2
                centre_y = (min_y + max_y) / 2
                # index(x, y) returns (row, col), and a window is given as
                # ((row_start, row_stop), (col_start, col_stop)). Keep the two
                # in that order so the pixel under the point is the one read.
                pixel_row, pixel_col = src.index(centre_x, centre_y)
                value = src.read(
                    1,
                    window=((pixel_row, pixel_row + 1), (pixel_col, pixel_col + 1)),
                )
                variable_values.append(value[0, 0])
        # Values were collected newest-first; reverse so the list runs oldest -> newest
        data[variable] = variable_values[::-1]

    return data

def process_row(row_tuple):
    """Build one feature row for a polygon and score it with ``gb_clf``.

    Args:
        row_tuple: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``;
            ``row`` must provide 'geometry', 'mode_value' and 'mean_elevation'.

    Returns:
        The value in the first column of ``gb_clf.predict_proba`` for this row.

    Reads the notebook-level names ``test_date``, ``data_dir``, ``variables``,
    ``num_days`` and ``gb_clf``.
    """
    _, row = row_tuple
    daily_values = get_environmental_data(row['geometry'], test_date, data_dir, variables, num_days)

    # One column per (variable, day): "<variable>_<day>", days numbered from 1
    features = {
        f"{variable}_{day_number}": value
        for variable in variables
        for day_number, value in enumerate(daily_values[variable], start=1)
    }
    features['LC'] = row['mode_value']
    features['elevation'] = row['mean_elevation']

    feature_frame = pd.DataFrame(features, index=[0])
    return gb_clf.predict_proba(feature_frame)[0][0]


# Read the polygon grid and reproject it to EPSG:4326
spain = gpd.read_file('spain_3km_ready.geojson')
spain = spain.to_crs('EPSG:4326')

# Settings read by process_row / get_environmental_data
data_dir = "new_data/NRT"
variables = ['Pres', 'P', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']
num_days = 14
test_date = datetime(2024, 5, 3)

# Score every polygon through an executor with a single worker thread,
# showing progress with tqdm
with ThreadPoolExecutor(max_workers=1) as executor:
    row_results = executor.map(process_row, spain.iterrows())
    predictions = list(tqdm(row_results, total=len(spain), desc="Making predictions"))

# Store the scores next to the geometries
spain['species_prediction'] = predictions

# Write only geometry + prediction to a new GeoJSON file
output_file = "spain_with_species_predictions.geojson"
spain[['geometry', 'species_prediction']].to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)


In [None]:
import os
import geopandas as gpd
import numpy as np
from datetime import datetime, timedelta
import rasterio
from shapely.geometry import Point
from tqdm import tqdm
import pandas as pd

def get_environmental_data(polygon, date, data_dir, variables, num_days):
    """Sample daily raster values at the centre of a polygon's bounding box.

    For every name in ``variables`` and every one of the ``num_days`` days
    ending on ``date`` (inclusive), the file
    ``<data_dir>/<variable>/Daily/<YYYY><DDD>.nc`` (``DDD`` = zero-padded day of
    year) is opened with rasterio and band 1 is read at the single pixel that
    contains the bounding-box centre.

    Args:
        polygon: Geometry exposing ``.bounds`` as ``(min_x, min_y, max_x, max_y)``
            in the raster's coordinate system.
        date: Last (most recent) day to sample; a ``datetime``.
        data_dir: Root directory holding one sub-directory per variable.
        variables: Iterable of variable names (sub-directory names).
        num_days: Number of consecutive days to sample, counting back from ``date``.

    Returns:
        dict mapping each variable name to a list of ``num_days`` values ordered
        from the oldest day to ``date`` itself. A day whose file is missing
        contributes ``np.nan`` (and a message is printed).
    """
    data = {}
    for variable in variables:
        variable_values = []
        for i in range(num_days):
            current_date = date - timedelta(days=i)
            file_date_str = current_date.strftime('%Y') + str(current_date.timetuple().tm_yday).zfill(3)
            data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")
            if not os.path.isfile(data_file):
                print(f"File not found for {variable} on {file_date_str}")
                variable_values.append(np.nan)
                continue
            with rasterio.open(data_file, mode="r") as src:
                min_x, min_y, max_x, max_y = polygon.bounds
                centre_x = (min_x + max_x) / 2
                centre_y = (min_y + max_y) / 2
                # index(x, y) returns (row, col), and a window is given as
                # ((row_start, row_stop), (col_start, col_stop)). Keep the two
                # in that order so the pixel under the point is the one read.
                pixel_row, pixel_col = src.index(centre_x, centre_y)
                value = src.read(
                    1,
                    window=((pixel_row, pixel_row + 1), (pixel_col, pixel_col + 1)),
                )
                variable_values.append(value[0, 0])
        # Values were collected newest-first; reverse so the list runs oldest -> newest
        data[variable] = variable_values[::-1]

    return data

def process_row(index, row):
    """Build one feature row for a polygon and score it with ``gb_clf``.

    Args:
        index: Row label from ``iterrows()``; accepted but not used.
        row: Row providing 'geometry', 'mode_value' and 'mean_elevation'.

    Returns:
        The value in the first column of ``gb_clf.predict_proba`` for this row.

    Reads the notebook-level names ``test_date``, ``data_dir``, ``variables``,
    ``num_days`` and ``gb_clf``.
    """
    daily_values = get_environmental_data(row['geometry'], test_date, data_dir, variables, num_days)

    # One column per (variable, day): "<variable>_<day>", days numbered from 1
    features = {
        f"{variable}_{day_number}": value
        for variable in variables
        for day_number, value in enumerate(daily_values[variable], start=1)
    }
    features['LC'] = row['mode_value']
    features['elevation'] = row['mean_elevation']

    feature_frame = pd.DataFrame(features, index=[0])
    return gb_clf.predict_proba(feature_frame)[0][0]

# Read the polygon grid and reproject it to EPSG:4326
spain = gpd.read_file('spain_3km_ready.geojson')
spain = spain.to_crs('EPSG:4326')

# Settings read by process_row / get_environmental_data
data_dir = "new_data/NRT"
variables = ['Pres', 'P', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']
num_days = 14
test_date = datetime(2024, 5, 3)

# Score the polygons one after another, with a progress bar
predictions = []
row_iterator = tqdm(spain.iterrows(), total=len(spain), desc="Making predictions")
for index, row in row_iterator:
    predictions.append(process_row(index, row))

# Store the scores next to the geometries
spain['species_prediction'] = predictions

# Write only geometry + prediction to a new GeoJSON file
output_file = "spain_with_species_predictions.geojson"
spain[['geometry', 'species_prediction']].to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)


In [None]:
import os
import geopandas as gpd
import numpy as np
from datetime import datetime, timedelta
import rasterio
from shapely.geometry import Point
from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

def extract_data_from_raster(data_file, polygons):
    """Read one raster value per polygon from a single file.

    The file is opened once with rasterio; for each polygon, band 1 is read at
    the single pixel containing the centre of the polygon's bounding box.

    Args:
        data_file: Path of the raster file to sample.
        polygons: Sized iterable of geometries exposing ``.bounds`` as
            ``(min_x, min_y, max_x, max_y)`` in the raster's coordinate system.

    Returns:
        list with one entry per polygon, in input order. An entry is ``np.nan``
        when the pixel could not be read; if the file does not exist, a message
        is printed and every entry is ``np.nan``.
    """
    values = []
    if not os.path.isfile(data_file):
        print(f"File not found: {data_file}")
        return [np.nan] * len(polygons)

    with rasterio.open(data_file, mode="r") as src:
        for polygon in tqdm(polygons, desc=data_file):
            min_x, min_y, max_x, max_y = polygon.bounds
            centre_x = (min_x + max_x) / 2
            centre_y = (min_y + max_y) / 2
            # index(x, y) returns (row, col), and a window is given as
            # ((row_start, row_stop), (col_start, col_stop)). Keep the two in
            # that order so the pixel under the point is the one read.
            pixel_row, pixel_col = src.index(centre_x, centre_y)
            try:
                value = src.read(
                    1,
                    window=((pixel_row, pixel_row + 1), (pixel_col, pixel_col + 1)),
                )
                values.append(value[0, 0])
            except Exception:
                # Best effort: a failed read (e.g. a point outside the raster)
                # becomes NaN. `Exception` rather than a bare `except` so that
                # KeyboardInterrupt / SystemExit still stop the run.
                values.append(np.nan)

    return values

def get_environmental_data(polygons, date, data_dir, variables, num_days):
    """Collect per-polygon daily values for every variable.

    For each variable and each of the ``num_days`` days ending on ``date``
    (inclusive), ``extract_data_from_raster`` is called once on
    ``<data_dir>/<variable>/Daily/<YYYY><DDD>.nc`` for all polygons.

    Returns:
        dict mapping variable name -> list (one entry per polygon) of value
        lists. Each value list is reversed after collection, so it runs from
        the oldest day to ``date`` itself.
    """
    polygon_count = len(polygons)
    data = {name: [[] for _ in range(polygon_count)] for name in variables}

    # Newest day first: date, date - 1 day, ...
    dates = [date - timedelta(days=offset) for offset in range(num_days)]

    for variable in tqdm(variables, desc="Variable"):
        print(f"Processing variable: {variable}")
        per_polygon_series = data[variable]
        for current_date in dates:
            # %j is the zero-padded three-digit day of the year
            file_date_str = current_date.strftime('%Y%j')
            data_file = os.path.join(data_dir, variable, "Daily", f"{file_date_str}.nc")
            day_values = extract_data_from_raster(data_file, polygons)
            for polygon_position, value in enumerate(day_values):
                per_polygon_series[polygon_position].append(value)

    # Flip every series so that it is ordered oldest -> newest
    for variable in variables:
        per_polygon_series = data[variable]
        for polygon_position in range(polygon_count):
            per_polygon_series[polygon_position] = per_polygon_series[polygon_position][::-1]

    return data

def prepare_data_for_prediction(polygon_index, all_data, variables, num_days, row):
    """Flatten one polygon's values into a single feature dict.

    Args:
        polygon_index: Position of the polygon inside each ``all_data[variable]`` list.
        all_data: dict mapping variable name -> list (per polygon) of value lists.
        variables: Variable names, in the order their columns should appear.
        num_days: Number of leading values to take from each value list.
        row: Mapping providing 'mode_value' and 'mean_elevation'.

    Returns:
        dict with keys "<variable>_<day>" (day counted from 1) for every
        variable and day, followed by 'LC' and 'elevation'.
    """
    features = {
        f"{variable}_{day + 1}": all_data[variable][polygon_index][day]
        for variable in variables
        for day in range(num_days)
    }
    features.update(LC=row['mode_value'], elevation=row['mean_elevation'])
    return features

# Load GeoJSON file into a GeoDataFrame and reproject to EPSG:4326
spain = gpd.read_file('spain_3km_ready.geojson')
spain = spain.to_crs('EPSG:4326')

# Settings for the raster extraction
data_dir = "new_data/NRT"
variables = ['Pres', 'P', 'RelHum', 'SpecHum', 'Temp', 'Tmax', 'Tmin']
num_days = 14
test_date = datetime(2024, 5, 3)

# Prepare polygon geometries
polygon_geometries = spain['geometry']

# Get environmental data for all polygons (each raster file is opened once)
all_data = get_environmental_data(polygon_geometries, test_date, data_dir, variables, num_days)

# Build one feature dict per polygon. The rows are gathered under their own
# name rather than `df`, so the notebook-wide training frame `df` is not
# overwritten by running this cell.
feature_rows = [
    prepare_data_for_prediction(index, all_data, variables, num_days, row)
    for index, row in tqdm(spain.iterrows(), total=len(spain), desc="Making predictions")
]

# Score all polygons with a single predict_proba call; column 0 is the
# probability of the first entry in gb_clf.classes_
if feature_rows:
    prediction_features = pd.DataFrame(feature_rows)
    predictions = gb_clf.predict_proba(prediction_features)[:, 0].tolist()
else:
    predictions = []

# Add predictions to the GeoDataFrame
spain['species_prediction'] = predictions

# Define the output GeoJSON file path
output_file = "spain_with_species_predictions.geojson"

# Write the GeoDataFrame with predictions to a new GeoJSON file
spain[['geometry', 'species_prediction']].to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)


In [None]:
# Prepare one feature dict per polygon from the already-extracted `all_data`
prepared_data = []
for index, row in tqdm(spain.iterrows(), total=len(spain), desc="Preparing data"):
    data_dict = prepare_data_for_prediction(index, all_data, variables, num_days, row)
    prepared_data.append(data_dict)

# Concatenate all prepared data into a single DataFrame
all_prepared_data = pd.DataFrame(prepared_data)

# Fill NaN values with column means (a column that is entirely NaN stays NaN)
all_prepared_data = all_prepared_data.fillna(all_prepared_data.mean())

# Score all polygons with a single predict_proba call instead of one call per
# row; this also avoids rebinding the notebook-wide training frame `df`.
# Column 0 is the probability of the first entry in gb_clf.classes_.
if len(all_prepared_data):
    predictions = gb_clf.predict_proba(all_prepared_data)[:, 0].tolist()
else:
    predictions = []

# Add predictions to the GeoDataFrame
spain['species_prediction'] = predictions


# Keep only 'geometry' and 'species_prediction' columns
spain_smaller = spain[['geometry', 'species_prediction']]

# Define the output GeoJSON file path
output_file = "spain_with_species_predictions_whole.geojson"

# Write the GeoDataFrame with predictions to a new GeoJSON file
spain_smaller.to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)

In [None]:
# Remove polygons whose species_prediction value is lower than `threshold`
# geopandas is imported under its usual alias; binding it to `pd` would shadow
# pandas for every cell run afterwards.
import geopandas as gpd

spain_smaller = gpd.read_file("spain_with_species_predictions_whole.geojson")
threshold = 0.005  # polygons strictly below this value are dropped
initial_count = len(spain_smaller)
spain_filtered = spain_smaller[spain_smaller['species_prediction'] >= threshold]
removed_count = initial_count - len(spain_filtered)

# Print the number of polygons removed
print(f"Number of polygons removed: {removed_count}")
print(f"Number of polygons left: {len(spain_filtered)}")

# Define the output GeoJSON file path
output_file = "spain_with_species_predictions_smaller.geojson"

# Write the filtered GeoDataFrame to a new GeoJSON file
spain_filtered.to_file(output_file, driver='GeoJSON')

print("Predictions have been made and saved to", output_file)