In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits made to
# imported modules (e.g. bramm_data_analysis) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
from bramm_data_analysis import loading
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np

## Load Moss Data

In [None]:
# Column names grouped by theme. Each key maps to the list of column names
# belonging to that group; the insertion order of the groups is meaningful
# only insofar as later code iterates over the dict.
columns = {
    "metadata": [
        "site_code",
        "site_insee_code",
        "department_code",
        "latitude",
        "longitude",
        "x_lambert",
        "y_lambert",
        "altitude",
        "date",
        "sample_code",
        "sample_send_europe",
    ],
    "sample": [
        "strand_size",
        "visible_dust_particles",
        "visible_pollen_particles",
        "mineral_type",
    ],
    "site": [
        "samples_nb",
        "fern_samples_nb",
        "tree_samples_nb",
        "herb_samples_nb",
        "litter_samples_nb",
        "humus_samples_nb",
        "soi_samples_nb",
        "sand_samples_nb",
        "hard_strain_coniferous_samples_nb",
        "hard_strain_hardwood_samples_nb",
        "hard_strain_unknown_samples_nb",
        "decomposed_strain_coniferous_samples_nb",
        "decomposed_strain_hardwood_samples_nb",
        "decomposed_strain_unknown_samples_nb",
        "bark_coniferous_nb",
        "bark_hardwood_samples_nb",
        "bark_unknown_samples_nb",
        "hard_coniferous_samples_nb",
        "hard_hardwood_samples_nb",
        "hard_unknown_samples_nb",
        "decomposed_coniferous_samples_nb",
        "decomposed_hardwood_samples_nb",
        "decomposed_unknown_samples_nb",
    ],
    "studies": [
        "sample_outside_complementary_study",
        "cs_3_species_comparison",
        "cs_2_species_comparison",
        "cs_repeated_sampling",
        "cs_repeated_analysis",
    ],
    # One "<element>_incertitude" column per element, in the listed order.
    "incertitudes": [
        f"{element}_incertitude"
        for element in (
            "aluminium",
            "arsenic",
            "calcium",
            "cadmium",
            "cobalt",
            "chromium",
            "copper",
            "iron",
            "mercury",
            "nitrogen",
            "sodium",
            "nickel",
            "lead",
            "sulfur",
            "zinc",
        )
    ],
    "categorical": [
        "weather",
        "tree_layer",
        "tree_layer_complement",
        "tree_cover",
        "species",
    ],
}
# The three loaders all read from the same file, so build its path once.
moss_data_path = Path("../data/Mines_2024.xlsx")
df_sites = loading.load_sites(moss_data_path)
df_samples = loading.load_samples(moss_data_path)
df_values = loading.load_values(moss_data_path)

# Two-step join: sites to samples on "site_code", then the result to the
# values table on "sample_code" (default inner joins).
sites_with_samples = pd.merge(df_sites, df_samples, on="site_code")
df_moss = pd.merge(sites_with_samples, df_values, on="sample_code")


# Correct data: the columns below are cleaned as text before being converted
# to numbers. The cleaning swaps "," for "." and strips a leading "< "
# marker (presumably a "below detection limit" flag - TODO confirm with the
# data provider).
for col in [
    "sodium",
    "platinium",
    "rhodium",
    "antimony",
    "strontium",
    "vanadium",
    "zinc",
]:
    cleaned = (
        df_moss[col]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .str.replace("< ", "", regex=False)
    )
    # `astype` returns a new Series: the result has to be assigned back,
    # otherwise the column silently stays as strings.
    df_moss[col] = cleaned.astype("float64")

In [None]:
# Read the RMQS table, parse "date_complete" as datetimes and make sure the
# frame carries a fresh 0..n-1 index.
df_rmqs = pd.read_csv("../data/RMQS.csv")
df_rmqs = df_rmqs.assign(
    date_complete=pd.to_datetime(df_rmqs["date_complete"])
).reset_index(drop=True)

In [None]:
# Histogram of the year component of "date_complete" for the RMQS rows.
rmqs_years = df_rmqs["date_complete"].dt.year
rmqs_years.hist()

In [None]:
# Keep only the RMQS rows whose "date_complete" year is at or after the
# threshold; rows with a missing date compare False and are dropped.
year_threshold = 2000
is_recent_enough = df_rmqs["date_complete"].dt.year >= year_threshold
df_rmqs = df_rmqs.loc[is_recent_enough]

## Closest Points Matching

In [None]:
# For every moss sample, find the closest RMQS site on the sphere and keep
# the pairs that are close enough.
#
# scikit-learn's "haversine" metric expects each point as
# [latitude, longitude] expressed in RADIANS and returns great-circle
# distances in radians (unit sphere). Feeding [longitude, latitude] in
# degrees yields meaningless distances, so the coordinates are reordered and
# converted before fitting.
EARTH_RADIUS_KM = 6371

estimator = NearestNeighbors(n_neighbors=1, metric="haversine")
# Kept as-is (degrees, [longitude, latitude]) in case other cells read them.
moss_xy = df_moss[["longitude", "latitude"]]
rmqs_xy = df_rmqs[["longitude", "latitude"]]

moss_latlon_rad = np.radians(
    df_moss[["latitude", "longitude"]].to_numpy(dtype="float64")
)
rmqs_latlon_rad = np.radians(
    df_rmqs[["latitude", "longitude"]].to_numpy(dtype="float64")
)
estimator.fit(rmqs_latlon_rad)
dist, indexes = estimator.kneighbors(moss_latlon_rad)

# `indexes` holds row POSITIONS in the fitted array, hence the positional
# lookup. The result is re-indexed like df_moss so both frames align row by
# row in the merge below.
nearests = df_rmqs.iloc[indexes.flatten()]
nearests.index = df_moss.index

# Columns present in both tables get the "_moss" / "_rmqs" suffixes.
raw_match = pd.merge(
    left=df_moss,
    right=nearests,
    left_index=True,
    right_index=True,
    suffixes=("_moss", "_rmqs"),
).merge(
    right=pd.DataFrame({"distance": dist.flatten()}, index=nearests.index),
    left_index=True,
    right_index=True,
)


# Maximum accepted moss-to-RMQS distance, in radians on the unit sphere
# (multiply by the Earth radius to read it in kilometres).
threshold = 0.01
final_match = raw_match[raw_match["distance"] <= threshold].reset_index()
print(
    f"Threshold : {threshold * EARTH_RADIUS_KM} km \nConserved : {final_match.shape[0]} / {raw_match.shape[0]}"
)

In [None]:
# NOTE(review): this whole cell is disabled. Besides df_moss and df_rmqs it
# refers to df_moss_match, rmqs_match and rmqs_not_match, none of which is
# defined in the cells visible here - they must be created before this code
# can be re-enabled.
# gdf_moss = gpd.GeoDataFrame(
#     data=df_moss,
#     geometry=gpd.points_from_xy(
#         x=df_moss["longitude"], y=df_moss["latitude"], crs="epsg:4326"
#     ),
# )
# gdf_moss_match = gpd.GeoDataFrame(
#     data=df_moss_match,
#     geometry=gpd.points_from_xy(
#         x=df_moss_match["longitude"], y=df_moss_match["latitude"], crs="epsg:4326"
#     ),
# )
# gdf_rmqs_match = gpd.GeoDataFrame(
#     data=rmqs_match,
#     geometry=gpd.points_from_xy(
#         x=rmqs_match["longitude"], y=rmqs_match["latitude"], crs="epsg:4326"
#     ),
# )
# gdf_rmqs_not_match = gpd.GeoDataFrame(
#     data=rmqs_not_match,
#     geometry=gpd.points_from_xy(
#         x=rmqs_not_match["longitude"], y=rmqs_not_match["latitude"], crs="epsg:4326"
#     ),
# )
# gdf_rmqs = gpd.GeoDataFrame(
#     data=df_rmqs,
#     geometry=gpd.points_from_xy(
#         x=df_rmqs["longitude"], y=df_rmqs["latitude"], crs="epsg:4326"
#     ),
# )

In [None]:
# Map of the matching result: every moss sample in the background, then the
# moss samples that found an RMQS neighbour and those RMQS neighbours.
match_fig, match_ax = plt.subplots()
match_ax.scatter(
    df_moss["longitude"], df_moss["latitude"], label="all moss", alpha=0.4
)
match_ax.scatter(
    final_match["longitude_moss"],
    final_match["latitude_moss"],
    label="Moss match",
    alpha=0.5,
    color="green",
)
match_ax.scatter(
    final_match["longitude_rmqs"],
    final_match["latitude_rmqs"],
    label="RMQS match",
    alpha=0.5,
    color="red",
)
match_ax.set_xlabel("longitude")
match_ax.set_ylabel("latitude")
match_ax.legend()
# Render the figure; the previous bare `plt.plot()` drew nothing and only
# left an empty list as cell output.
plt.show()

## Multivariate Analysis

In [None]:
import gstlearn as gl
import gstlearn.plot as gp
import gstlearn.document as gdoc
import gstools as gs

# Notebook-level switches; no visible cell reads them yet.
verbose, graphics = True, True

# gstlearn global settings: set the NTCOL constant to 6 (presumably the
# number of columns used when printing tables - TODO confirm) and apply the
# documentation helper's no-scroll setting.
gl.OptCst.define(gl.ECst.NTCOL, 6)
gdoc.setNoScroll()

In [None]:
# Show which columns the merged moss table exposes.
moss_columns = df_moss.columns
print(moss_columns)

In [None]:
# Show the RMQS columns whose name contains "_tot_hf".
rmqs_hf_columns = df_rmqs.filter(like="_tot_hf").columns
print(rmqs_hf_columns)

In [None]:
# NOTE(review): prints the moss column index a second time; the same call
# already appears two cells earlier, so this cell looks redundant.
print(df_moss.columns)

In [None]:
# Columns used for the bivariate analysis.
# "longitude" and "latitude" exist in both source tables, so in `final_match`
# they only exist with the "_moss" / "_rmqs" suffixes. The moss coordinates
# are used here as the location of each matched pair.
x1 = "longitude_moss"
x2 = "latitude_moss"
z1 = "aluminium"
z2 = "al_tot_hf"
# Strict selection: a missing column raises a KeyError instead of being
# silently dropped (which is what `DataFrame.filter(items=...)` does).
raw_df = final_match[[x1, x2, z1, z2]].astype("float")
# No outlier filtering for now; the previously drafted filter is kept below
# for reference.
final_df = raw_df
# final_df = raw_df[
#     (raw_df[z1] <= 15000)
#     & (raw_df[z2] <= 1)
# ]

In [None]:
# Build a gstlearn Db from the selected pandas columns, then tag columns
# with locators.
mydb = gl.Db_fromPanda(final_df)
# x1 and x2 receive the ELoc.X locator (presumably the coordinate role -
# TODO confirm against the gstlearn documentation).
mydb.setLocators([x1, x2], gl.ELoc.X)
# Only z1 receives the ELoc.Z locator; z2 is left without one here.
mydb.setLocator(z1, gl.ELoc.Z)

In [None]:
# Bare expression: displays the Db's representation as the cell output.
mydb

In [None]:
# Print the mean, minimum and maximum of both variables of interest.
# The title used to read "Number of observations", which does not match the
# requested statistics (no count is computed here).
gl.dbStatisticsPrint(
    db=mydb,
    names=[z1, z2],
    opers=gl.EStatOption.fromKeys(["MEAN", "MINI", "MAXI"]),
    flagIso=False,
    title="Mean, minimum and maximum of the matched variables",
)

In [None]:
# Scatter of z1 against z2 drawn as points with a regression line, titled
# with the two variable names.
correlation_title = f"Correlation between {z1} and {z2}"
ax = gp.correlation(
    mydb,
    namex=z1,
    namey=z2,
    asPoint=True,
    regrLine=True,
)
ax.decoration(title=correlation_title)