# Precompute the embeddings for the training, validation, and test sets

This notebook precomputes the embeddings of the data.

This process is lengthy but only needs to be run once: it takes about 3 hours on a single GPU.

## Setup and imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
from pathlib import Path
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import torch
import umap
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm.autonotebook import tqdm
from transformers import AutoModel, AutoTokenizer

from nlp_assemblee.datasets import build_dataset_and_dataloader_from_config
from nlp_assemblee.models import build_classifier_from_config
from nlp_assemblee.simple_precompute import (
    get_embeddings_dict,
    get_embeddings_dict_from_hugging,
    get_embeddings_list,
    get_embeddings_list_from_hugging,
    get_embeddings_list_unbatched,
    get_embeddings_matrix,
    plot_proj_from_emb_dict,
    save_embedding_matrix,
    save_embedding_matrix_from_list,
    train_test_val_split,
)

filterwarnings("ignore")

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

### Load the data

In [None]:
# Load the processed dataset and keep only the columns used in this notebook.
# `.copy()` makes the selection an independent frame, so the column
# assignments below never act on a slice of the frame that was read.
df = pd.read_pickle("../data/processed/14th_merged_data_short.pkl")
df = df[
    [
        "nom",
        "groupe",
        "date_seance",
        "nb_mots_approx",
        "profession",
        "titre",
        "titre_complet",
        "intervention",
        "sexe",
        "n_y_naissance",
        "label",
    ]
].copy()
# Raw string: `\s`, `\d` and `\w` are regex escapes, not Python string escapes
# (in a plain literal they are invalid escape sequences).
reg = r"(article|l'article)\s*(\d+[^\w\s]*|premier|deuxième|troisième|[^\w\s]*\d+[^\w\s]*)"
# Replace every article reference matched by `reg` with the placeholder "Article X".
df["titre_regexed"] = df["titre"].str.replace(reg, "Article X", regex=True)
# "contexte" is the first " > "-separated segment of the full title, or
# "Sans contexte" when the title contains no separator.
df["contexte"] = (
    df["titre_complet"].str.split(" > ").apply(lambda x: x[0] if len(x) > 1 else "Sans contexte")
)

In [None]:
# Stratified split on the label with a fixed seed: 50% train, 20% validation
# (the remaining share presumably forms the test split; confirm in
# nlp_assemblee.simple_precompute).
idx_train, idx_val, idx_test = train_test_val_split(
    df["label"],
    train_pc=0.5,
    val_pc=0.2,
    stratify=True,
    random_state=42,
)

In [None]:
df.head()

## Sentence transformer

### MiniLM-L12

In [None]:
# Load the sentence-transformer on the selected device and switch it to
# inference mode; `eval()` returns the module itself.
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name, device=device).eval()
model

In [None]:
# Output directory for everything precomputed with this model; created on demand.
path = Path(f"../data/precomputed/{model_name}")
path.mkdir(parents=True, exist_ok=True)

#### Profession

In [None]:
# Embed the "profession" column. `profession_to_int` is later used to map the
# column values to indices when the records are saved.
profession_dict, profession_to_int = get_embeddings_dict(
    model,
    df,
    "profession",
    batch_size=64,
)

In [None]:
# Save the profession embeddings, then draw the three 2-D projections
# (same order as before: UMAP, t-SNE, PCA).
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, profession_dict, profession_to_int, df, "profession", path)
    for method in ("umap", "tsne", "pca")
)

#### Titre

In [None]:
# Embed the normalised titles (article references replaced by "Article X").
titre_dict, titre_to_int = get_embeddings_dict(
    model,
    df,
    "titre_regexed",
    batch_size=64,
)

In [None]:
# Save the title embeddings, then draw the UMAP, t-SNE and PCA projections.
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, titre_dict, titre_to_int, df, "titre_regexed", path)
    for method in ("umap", "tsne", "pca")
)

#### Contexte

In [None]:
# Embed the "contexte" column (first segment of the full title).
contexte_dict, contexte_to_int = get_embeddings_dict(
    model,
    df,
    "contexte",
    batch_size=64,
)

In [None]:
# Save the context embeddings, then draw the UMAP, t-SNE and PCA projections.
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, contexte_dict, contexte_to_int, df, "contexte", path)
    for method in ("umap", "tsne", "pca")
)

#### Titre complet

In [None]:
# Embed the full, unsplit titles.
embedded_titles = get_embeddings_dict(model, df, "titre_complet", batch_size=64)
titre_complet_dict, titre_complet_to_int = embedded_titles

In [None]:
# Save the full-title embeddings, then draw the UMAP, t-SNE and PCA projections.
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(
        method, titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
    )
    for method in ("umap", "tsne", "pca")
)

#### Interventions

In [None]:
# Embed the intervention texts, one embedding per row of `df`
# (assumed to be row-aligned with `df`; it is indexed with the split indices below).
intervention_list = get_embeddings_list(
    model,
    df,
    "intervention",
    batch_size=256,
)

#### Saving the computed data

In [None]:
# Recompute the split with the same seed as at the top of the notebook.
idx_train, idx_val, idx_test = train_test_val_split(
    df["label"],
    train_pc=0.5,
    val_pc=0.2,
    stratify=True,
    random_state=42,
)

# Text columns are stored as indices obtained through their *_to_int mappings;
# the remaining features are stored as raw values.
records = {
    column: df[column].map(mapping).values
    for column, mapping in (
        ("profession", profession_to_int),
        ("titre_regexed", titre_to_int),
        ("titre_complet", titre_complet_to_int),
        ("contexte", contexte_to_int),
    )
}
records["intervention"] = intervention_list
records["sexe"] = df["sexe"].map({"H": 0.0, "F": 1.0}).values
records["n_y_naissance"] = df["n_y_naissance"].values
records["label"] = df["label"].values

# Slice every feature with the indices of each split.
train_records, val_records, test_records = (
    {name: values[split_idx] for name, values in records.items()}
    for split_idx in (idx_train, idx_val, idx_test)
)

# One pickle per split, written next to the embedding matrices.
for filename, split_records in (
    ("precomputed_train.pkl", train_records),
    ("precomputed_val.pkl", val_records),
    ("precomputed_test.pkl", test_records),
):
    with open(path / filename, "wb") as f:
        pickle.dump(split_records, f)

### DistilUse-base-v2

In [None]:
model_name = "distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(model_name, device=device)
model.eval()

In [None]:
path = Path(f"../data/precomputed/{model_name}")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict(model, df, "profession", batch_size=64)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)

##### Images

In [None]:
fig = plot_proj_from_emb_dict("umap", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict(model, df, "titre_regexed", batch_size=64)

In [None]:
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict(model, df, "contexte", batch_size=64)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict(
    model, df, "titre_complet", batch_size=64
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
intervention_list = get_embeddings_list(model, df, "intervention", batch_size=256)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

### MPNET-base

In [None]:
model_name = "paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name, device=device)
model.eval()

In [None]:
path = Path(f"../data/precomputed/{model_name}")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict(model, df, "profession", batch_size=64)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_dict, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_dict, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict("pca", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict(model, df, "titre_regexed", batch_size=64)

In [None]:
# Save the title embeddings, then draw the UMAP, t-SNE and PCA projections.
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
# The method key is "tsne", as in every other projection cell of this notebook
# (it was previously misspelled "tnse").
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict(model, df, "contexte", batch_size=64)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict(
    model, df, "titre_complet", batch_size=64
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
intervention_list = get_embeddings_list(model, df, "intervention", batch_size=256)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

### Camembert-base

In [None]:
model_name = "dangvantuan/sentence-camembert-base"
model = SentenceTransformer(model_name, device=device)
model.eval()

In [None]:
path = Path("../data/precomputed/camembert-base")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_matrix, profession_to_int = get_embeddings_matrix(model, df, "profession", batch_size=64)

In [None]:
save_embedding_matrix_from_list(profession_matrix, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_matrix.T, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_matrix.T, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", profession_matrix.T, profession_to_int, df, "profession", path
)

#### Titre

In [None]:
titre_matrix, titre_to_int = get_embeddings_matrix(model, df, "titre_regexed", batch_size=64)

In [None]:
save_embedding_matrix_from_list(titre_matrix, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_matrix.T, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_matrix.T, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_matrix.T, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_matrix, contexte_to_int = get_embeddings_matrix(model, df, "contexte", batch_size=64)

In [None]:
save_embedding_matrix_from_list(contexte_matrix, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_matrix.T, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_matrix.T, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_matrix.T, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_matrix, titre_complet_to_int = get_embeddings_matrix(
    model, df, "titre_complet", batch_size=64
)

In [None]:
save_embedding_matrix_from_list(titre_complet_matrix, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_matrix.T, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_matrix.T, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_matrix.T, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
intervention_list = model.encode(
    df["intervention"].to_list(), batch_size=128, show_progress_bar=True
)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

In [None]:
# Reload only the intervention embeddings of the training split from disk.
with open(path / "precomputed_train.pkl", "rb") as f:
    saved_train = pickle.load(f)
intervention_list = saved_train["intervention"]

In [None]:
# Project the training intervention embeddings onto two UMAP components.
fit = umap.UMAP(n_components=2)
proj = fit.fit_transform(intervention_list)
proj_df = pd.DataFrame({"x": proj[:, 0], "y": proj[:, 1]})

In [None]:
# Scatter plot of the UMAP projection, coloured by the label of each training row.
train_labels = df["label"].values[idx_train]
fig = px.scatter(
    proj_df,
    x="x",
    y="y",
    color=train_labels,
    color_continuous_scale=px.colors.diverging.Temps,
)

# Export both a static image and an interactive HTML page.
fig_path = Path(path).joinpath("intervention/images")
fig_path.mkdir(parents=True, exist_ok=True)
for export, filename in ((fig.write_image, "umap.png"), (fig.write_html, "umap.html")):
    export(fig_path / filename)

fig.show()

In [None]:
# Project the training intervention embeddings onto their first two principal components.
fit = PCA(n_components=2)
proj = fit.fit_transform(intervention_list)
proj_df = pd.DataFrame({"x": proj[:, 0], "y": proj[:, 1]})

In [None]:
# Scatter plot of the PCA projection, coloured by the label of each training row.
train_labels = df["label"].values[idx_train]
fig = px.scatter(
    proj_df,
    x="x",
    y="y",
    color=train_labels,
    color_continuous_scale=px.colors.diverging.Temps,
)

# Export both a static image and an interactive HTML page.
fig_path = Path(path).joinpath("intervention/images")
fig_path.mkdir(parents=True, exist_ok=True)
for export, filename in ((fig.write_image, "pca.png"), (fig.write_html, "pca.html")):
    export(fig_path / filename)

fig.show()

## With transformers

### distilbert-base-multilingual-cased

In [None]:
model_name = "distilbert-base-multilingual-cased"

In [None]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

In [None]:
path = Path(f"../data/precomputed/{model_name}")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "profession", batch_size=8
)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_dict, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_dict, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict("pca", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_regexed", batch_size=8
)

In [None]:
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "contexte", batch_size=8
)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_complet", batch_size=16
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
torch.cuda.empty_cache()

In [None]:
intervention_list = get_embeddings_list_from_hugging(
    model, tokenizer, df, "intervention", batch_size=128
)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

### distilcamembert-base

In [None]:
model_name = "cmarkea/distilcamembert-base"

In [None]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
pooled_output = "mean"
model.eval()

In [None]:
path = Path("../data/precomputed/distilcamembert-base")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "profession", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_dict, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_dict, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict("pca", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_regexed", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "contexte", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_complet", batch_size=16, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
torch.cuda.empty_cache()

In [None]:
intervention_list = get_embeddings_list_from_hugging(
    model, tokenizer, df, "intervention", batch_size=128, pooled_output=pooled_output
)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

### bert-base-multilingual-cased

In [None]:
model_name = "bert-base-multilingual-cased"

In [None]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
pooled_output = "pooled"
model.eval()

In [None]:
path = Path(f"../data/precomputed/{model_name}")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "profession", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_dict, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_dict, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict("pca", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_regexed", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "contexte", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_complet", batch_size=16, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
torch.cuda.empty_cache()

In [None]:
intervention_list = get_embeddings_list_from_hugging(
    model, tokenizer, df, "intervention", batch_size=128, pooled_output=pooled_output
)

#### Saving the computed data

In [None]:
records = {
    "profession": df["profession"].map(profession_to_int).values,
    "titre_regexed": df["titre_regexed"].map(titre_to_int).values,
    "titre_complet": df["titre_complet"].map(titre_complet_to_int).values,
    "contexte": df["contexte"].map(contexte_to_int).values,
    "intervention": intervention_list,
    "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
    "n_y_naissance": df["n_y_naissance"].values,
    "label": df["label"].values,
}

train_records = {k: v[idx_train] for k, v in records.items()}
val_records = {k: v[idx_val] for k, v in records.items()}
test_records = {k: v[idx_test] for k, v in records.items()}

with open(path / "precomputed_train.pkl", "wb") as f:
    pickle.dump(train_records, f)

with open(path / "precomputed_val.pkl", "wb") as f:
    pickle.dump(val_records, f)

with open(path / "precomputed_test.pkl", "wb") as f:
    pickle.dump(test_records, f)

### xlm-roberta-base

In [None]:
model_name = "xlm-roberta-base"

In [None]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
pooled_output = "mean"
model.eval()

In [None]:
path = Path("../data/precomputed/xlm-roberta-base")
path.mkdir(exist_ok=True, parents=True)

#### Profession

In [None]:
profession_dict, profession_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "profession", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", profession_dict, profession_to_int, df, "profession", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", profession_dict, profession_to_int, df, "profession", path
)
pca_fig = plot_proj_from_emb_dict("pca", profession_dict, profession_to_int, df, "profession", path)

#### Titre

In [None]:
titre_dict, titre_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_regexed", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig = plot_proj_from_emb_dict("umap", titre_dict, titre_to_int, df, "titre_regexed", path)
tsne_fig = plot_proj_from_emb_dict("tsne", titre_dict, titre_to_int, df, "titre_regexed", path)
pca_fig = plot_proj_from_emb_dict("pca", titre_dict, titre_to_int, df, "titre_regexed", path)

#### Contexte

In [None]:
contexte_dict, contexte_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "contexte", batch_size=8, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig = plot_proj_from_emb_dict("umap", contexte_dict, contexte_to_int, df, "contexte", path)
tsne_fig = plot_proj_from_emb_dict("tsne", contexte_dict, contexte_to_int, df, "contexte", path)
pca_fig = plot_proj_from_emb_dict("pca", contexte_dict, contexte_to_int, df, "contexte", path)

#### Titre complet

In [None]:
titre_complet_dict, titre_complet_to_int = get_embeddings_dict_from_hugging(
    model, tokenizer, df, "titre_complet", batch_size=16, pooled_output=pooled_output
)

In [None]:
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig = plot_proj_from_emb_dict(
    "umap", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
tsne_fig = plot_proj_from_emb_dict(
    "tsne", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)
pca_fig = plot_proj_from_emb_dict(
    "pca", titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
)

#### Interventions

In [None]:
torch.cuda.empty_cache()

In [None]:
intervention_list = get_embeddings_list_from_hugging(
    model, tokenizer, df, "intervention", batch_size=128, pooled_output=pooled_output
)

#### Saving the computed data

In [None]:
# Columns whose values are mapped through the matching `*_to_int` table.
# Key order is kept identical to the pickled layout.
records = {
    column: df[column].map(table).values
    for column, table in (
        ("profession", profession_to_int),
        ("titre_regexed", titre_to_int),
        ("titre_complet", titre_complet_to_int),
        ("contexte", contexte_to_int),
    )
}
# Remaining fields: the intervention embeddings, the speaker's sex encoded as
# H -> 0.0 / F -> 1.0, the raw "n_y_naissance" column and the label.
records.update(
    {
        "intervention": intervention_list,
        "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
        "n_y_naissance": df["n_y_naissance"].values,
        "label": df["label"].values,
    }
)

# Slice every field with the train / validation / test indices.
train_records, val_records, test_records = (
    {field: values[split_idx] for field, values in records.items()}
    for split_idx in (idx_train, idx_val, idx_test)
)

# Write one pickle per split under `path`.
for filename, split_records in (
    ("precomputed_train.pkl", train_records),
    ("precomputed_val.pkl", val_records),
    ("precomputed_test.pkl", test_records),
):
    with open(path / filename, "wb") as f:
        pickle.dump(split_records, f)

### bert-tiny

In [None]:
# Identifier of the checkpoint passed to `from_pretrained` in the next cell.
model_name = "prajjwal1/bert-tiny"

In [None]:
# Pooling strategy forwarded to the embedding helpers in the cells below.
pooled_output = "mean"

# Fetch the tokenizer and the matching model from the HuggingFace Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Put the model in evaluation mode; kept as the cell's last expression so the
# notebook still echoes the model.
model.eval()

In [None]:
# Directory passed as `path` to the save / plot / pickle cells for this model.
path = Path("../data/precomputed/bert-tiny")
# Create it together with any missing parents; an existing directory is fine.
path.mkdir(parents=True, exist_ok=True)

#### Profession

In [None]:
# Embed the "profession" column with the currently loaded model.
# NOTE(review): the helper presumably returns (embeddings, value -> int id);
# confirm in nlp_assemblee.simple_precompute.
profession_dict, profession_to_int = get_embeddings_dict_from_hugging(
    model,
    tokenizer,
    df,
    "profession",
    batch_size=8,
    pooled_output=pooled_output,
)

In [None]:
# Persist the "profession" embeddings, then build one projection per method.
# NOTE(review): both helpers receive `path`, presumably as their output
# directory — confirm in nlp_assemblee.simple_precompute.
save_embedding_matrix(profession_dict, profession_to_int, "profession", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, profession_dict, profession_to_int, df, "profession", path)
    for method in ("umap", "tsne", "pca")
)

#### Titre

In [None]:
# Embed the "titre_regexed" column with the currently loaded model.
# NOTE(review): the helper presumably returns (embeddings, value -> int id);
# confirm in nlp_assemblee.simple_precompute.
titre_dict, titre_to_int = get_embeddings_dict_from_hugging(
    model,
    tokenizer,
    df,
    "titre_regexed",
    batch_size=8,
    pooled_output=pooled_output,
)

In [None]:
# Persist the "titre_regexed" embeddings, then build one projection per method.
# NOTE(review): both helpers receive `path`, presumably as their output
# directory — confirm in nlp_assemblee.simple_precompute.
save_embedding_matrix(titre_dict, titre_to_int, "titre_regexed", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, titre_dict, titre_to_int, df, "titre_regexed", path)
    for method in ("umap", "tsne", "pca")
)

#### Contexte

In [None]:
# Embed the "contexte" column with the currently loaded model.
# NOTE(review): the helper presumably returns (embeddings, value -> int id);
# confirm in nlp_assemblee.simple_precompute.
contexte_dict, contexte_to_int = get_embeddings_dict_from_hugging(
    model,
    tokenizer,
    df,
    "contexte",
    batch_size=8,
    pooled_output=pooled_output,
)

In [None]:
# Persist the "contexte" embeddings, then build one projection per method.
# NOTE(review): both helpers receive `path`, presumably as their output
# directory — confirm in nlp_assemblee.simple_precompute.
save_embedding_matrix(contexte_dict, contexte_to_int, "contexte", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(method, contexte_dict, contexte_to_int, df, "contexte", path)
    for method in ("umap", "tsne", "pca")
)

#### Titre complet

In [None]:
# Embed the "titre_complet" column with the currently loaded model.
# NOTE(review): the helper presumably returns (embeddings, value -> int id);
# confirm in nlp_assemblee.simple_precompute.
titre_complet_dict, titre_complet_to_int = get_embeddings_dict_from_hugging(
    model,
    tokenizer,
    df,
    "titre_complet",
    batch_size=16,
    pooled_output=pooled_output,
)

In [None]:
# Persist the "titre_complet" embeddings, then build one projection per method.
# NOTE(review): both helpers receive `path`, presumably as their output
# directory — confirm in nlp_assemblee.simple_precompute.
save_embedding_matrix(titre_complet_dict, titre_complet_to_int, "titre_complet", path)
umap_fig, tsne_fig, pca_fig = (
    plot_proj_from_emb_dict(
        method, titre_complet_dict, titre_complet_to_int, df, "titre_complet", path
    )
    for method in ("umap", "tsne", "pca")
)

#### Interventions

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before the
# interventions are embedded in the next cell.
torch.cuda.empty_cache()

In [None]:
# Embed the "intervention" column with the currently loaded model.
# NOTE(review): the result is later indexed with idx_train / idx_val / idx_test,
# so it presumably holds one entry per row of `df` — confirm in the helper.
intervention_list = get_embeddings_list_from_hugging(
    model,
    tokenizer,
    df,
    "intervention",
    batch_size=128,
    pooled_output=pooled_output,
)

#### Saving the computed data

In [None]:
# Columns whose values are mapped through the matching `*_to_int` table.
# Key order is kept identical to the pickled layout.
records = {
    column: df[column].map(table).values
    for column, table in (
        ("profession", profession_to_int),
        ("titre_regexed", titre_to_int),
        ("titre_complet", titre_complet_to_int),
        ("contexte", contexte_to_int),
    )
}
# Remaining fields: the intervention embeddings, the speaker's sex encoded as
# H -> 0.0 / F -> 1.0, the raw "n_y_naissance" column and the label.
records.update(
    {
        "intervention": intervention_list,
        "sexe": df["sexe"].map({"H": 0.0, "F": 1.0}).values,
        "n_y_naissance": df["n_y_naissance"].values,
        "label": df["label"].values,
    }
)

# Slice every field with the train / validation / test indices.
train_records, val_records, test_records = (
    {field: values[split_idx] for field, values in records.items()}
    for split_idx in (idx_train, idx_val, idx_test)
)

# Write one pickle per split under `path`.
for filename, split_records in (
    ("precomputed_train.pkl", train_records),
    ("precomputed_val.pkl", val_records),
    ("precomputed_test.pkl", test_records),
):
    with open(path / filename, "wb") as f:
        pickle.dump(split_records, f)