In [None]:
# Reload imported modules before each cell runs, so edits to project code
# (e.g. bramm_data_analysis) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
from bramm_data_analysis import loaders
from sklearn import decomposition, preprocessing
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Column names of the moss dataset, grouped by role. Each key maps to the
# list of dataframe columns belonging to that group.
columns = {
    # Site / sample identification and location.
    "metadata": [
        "site_code",
        "site_insee_code",
        "department_code",
        "latitude",
        "longitude",
        "x_lambert",
        "y_lambert",
        "altitude",
        "date",
        "sample_code",
        "sample_send_europe",
    ],
    # Per-sample descriptors.
    "sample": [
        "strand_size",
        "visible_dust_particles",
        "visible_pollen_particles",
        "mineral_type",
    ],
    # Per-site sample counts.
    "site": [
        "samples_nb",
        "fern_samples_nb",
        "tree_samples_nb",
        "herb_samples_nb",
        "litter_samples_nb",
        "humus_samples_nb",
        "soi_samples_nb",
        "sand_samples_nb",
        "hard_strain_coniferous_samples_nb",
        "hard_strain_hardwood_samples_nb",
        "hard_strain_unknown_samples_nb",
        "decomposed_strain_coniferous_samples_nb",
        "decomposed_strain_hardwood_samples_nb",
        "decomposed_strain_unknown_samples_nb",
        "bark_coniferous_nb",
        "bark_hardwood_samples_nb",
        "bark_unknown_samples_nb",
        "hard_coniferous_samples_nb",
        "hard_hardwood_samples_nb",
        "hard_unknown_samples_nb",
        "decomposed_coniferous_samples_nb",
        "decomposed_hardwood_samples_nb",
        "decomposed_unknown_samples_nb",
    ],
    # Complementary-study flags.
    "studies": [
        "sample_outside_complementary_study",
        "cs_3_species_comparison",
        "cs_2_species_comparison",
        "cs_repeated_sampling",
        "cs_repeated_analysis",
    ],
    # Per-element uncertainty columns.
    "incertitudes": [
        "aluminium_incertitude",
        "arsenic_incertitude",
        "calcium_incertitude",
        "cadmium_incertitude",
        "cobalt_incertitude",
        "chromium_incertitude",
        "copper_incertitude",
        "iron_incertitude",
        "mercury_incertitude",
        "nitrogen_incertitude",
        "sodium_incertitude",
        "nickel_incertitude",
        "lead_incertitude",
        "sulfur_incertitude",
        "zinc_incertitude",
    ],
    # Columns that get one-hot encoded before the PCA.
    "categorical": [
        "weather",
        "tree_layer",
        "tree_layer_complement",
        "tree_cover",
        "species",
    ],
}

In [None]:
# Read the moss measurements into a dataframe.
moss_data_path = Path("../data/Mines_2024.xlsx")
moss_loader = loaders.from_moss_csv(moss_data_path)
# NOTE(review): duplicates_handling_strategy=None presumably leaves duplicated
# rows untouched -- confirm against the loader implementation.
df = moss_loader.retrieve_df(duplicates_handling_strategy=None)

In [None]:
# Correct data: give rows without a tree-layer complement an explicit label so
# the missing value becomes a regular category instead of NaN.
complement_missing = df["tree_layer_complement"].isna()
df.loc[complement_missing, "tree_layer_complement"] = "unspecified"

In [None]:
# Set the non-feature column groups aside as their own frames ...
metadata, site_info, samples_info, studies, incertitudes = (
    df[columns[group]]
    for group in ("metadata", "site", "sample", "studies", "incertitudes")
)

# ... and keep everything else as the feature table.
non_feature_columns = [
    name
    for group in ("metadata", "sample", "studies", "site", "incertitudes")
    for name in columns[group]
]
data = df.drop(columns=non_feature_columns)

# One-hot encode the categorical columns and append the result to the features.
encoder = preprocessing.OneHotEncoder(
    sparse_output=False, handle_unknown="ignore"
)
# pop() removes each raw categorical column from `data` while collecting it.
categorical = pd.concat(
    [data.pop(col) for col in columns["categorical"]], axis=1
)
data_one_hot = encoder.fit_transform(categorical)
categories = encoder.categories_
# One list of names per source column, each "<source column>_<category>".
unwrap_categories = [
    [f"{cat}_{c}" for c in cats]
    for cat, cats in zip(columns["categorical"], categories)
]
categories_encoded = pd.DataFrame(
    data=data_one_hot,
    columns=np.concatenate(unwrap_categories),
    # Reuse the source row labels: pd.concat(axis=1) aligns on the index, so
    # a default RangeIndex would misalign rows (and inject NaN) whenever the
    # loaded frame does not carry a plain 0..n-1 index.
    index=categorical.index,
)
data = pd.concat([data, categories_encoded], axis=1)

# Remove columns with NaNs: PCA cannot be fitted on missing values, so every
# column holding at least one NaN is reported and then discarded.
nan_columns = data.columns[data.isna().any(axis=0)]
print("The following columns contains NaN and will be removed :")
print(nan_columns)
data = data.drop(columns=nan_columns)

In [None]:
# Show which columns remain in `data` at this point in the notebook.
print(data.columns)

In [None]:
# Fit a 32-component PCA on the feature table and project every row onto it.
# NOTE(review): no visible cell standardizes the features before this step, so
# columns with large numeric ranges may dominate the components -- confirm
# this is intended.
# NOTE(review): 32 is hard-coded; presumably `data` has at least 32 rows and
# columns -- verify, as scikit-learn rejects a larger n_components.
pca = decomposition.PCA(n_components=32)
data_pca = pca.fit_transform(data)

In [None]:
# Display the variance explained by each of the fitted components.
pca.explained_variance_

In [None]:
# Tail-variance view: reverse the per-component variances (scikit-learn
# reports them in decreasing order), accumulate them, and plot the log of the
# running sum. Point k is the summed variance of the k + 1 weakest components.
tail_variance = pca.explained_variance_[::-1].cumsum()
plt.scatter(np.arange(len(tail_variance)), np.log(tail_variance))
plt.xlabel("k (sum over the k + 1 smallest retained components)")
plt.ylabel("log cumulative explained variance")
plt.show()

In [None]:
# Display the shape of the species column, i.e. the number of rows in `df`.
df["species"].shape

In [None]:
# Rows of the dataset projected onto the first two principal components.
plt.scatter(data_pca[:, 0], data_pca[:, 1])
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()