In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to library code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from nlp_assemblee.datasets import build_dataset_and_dataloader_from_config
from nlp_assemblee.models import build_classifier_from_config

In [3]:
import pickle

import numpy as np
import torch
from tqdm.autonotebook import tqdm

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [5]:
# Extract frozen BERT embeddings for every split and cache them on disk.
bert_type = "bert"

config_file = f"../../configs/{bert_type}.json"

classifier = build_classifier_from_config(config_file)

datasets, loaders = build_dataset_and_dataloader_from_config(config_file, "../../")

# Only the BERT sub-modules of the classifier are needed here. eval() puts
# them in inference mode so the extracted embeddings do not depend on
# training-mode layers such as dropout.
embedder = classifier.bert_linears.bert_layers.to(device).eval()

for phase in tqdm(["train", "val", "test"]):
    # Fresh per-phase accumulators: one list of per-batch arrays per field.
    embeddings = {
        "intervention": [],
        "titre_complet": [],
        "profession": [],
        "features": [],
        "label": [],
    }
    for x, y in tqdm(loaders[phase]):
        x = {k: v.to(device) for k, v in x.items()}
        with torch.no_grad():  # pure inference, no autograd graph needed
            for k, v in x.items():
                if k != "features":
                    # Text inputs: keep the pooled BERT output of that field's encoder.
                    embeddings[k].append(embedder[k].bert(v)["pooler_output"].cpu().numpy())
                else:
                    # "features" is stored as-is, without going through BERT.
                    embeddings["features"].append(v.cpu().numpy())
        embeddings["label"].append(y.cpu().numpy())

    # Concatenate the per-batch arrays: labels with hstack, everything else with vstack.
    embs = {
        key: (np.hstack(batches) if key == "label" else np.vstack(batches))
        for key, batches in embeddings.items()
    }

    with open(f"../../data/{bert_type}_embeddings_{phase}.pkl", "wb") as f:
        pickle.dump(embs, f)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'c

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1855 [00:00<?, ?it/s]

  0%|          | 0/796 [00:00<?, ?it/s]

  0%|          | 0/1137 [00:00<?, ?it/s]

In [None]:
# Extract frozen CamemBERT embeddings for every split.
bert_type = "camembert"

config_file = f"../../configs/{bert_type}.json"

classifier = build_classifier_from_config(config_file)

datasets, loaders = build_dataset_and_dataloader_from_config(config_file, "../../")

# Only the BERT sub-modules of the classifier are needed here. eval() puts
# them in inference mode so the extracted embeddings do not depend on
# training-mode layers such as dropout.
embedder = classifier.bert_linears.bert_layers.to(device).eval()

# phase -> field -> list of per-batch arrays. The per-batch dict used to be
# bound to this same name, which left `embeddings` without the per-phase
# layout that the pooling step indexes (embeddings["val"]["intervention"], ...).
embeddings = {
    phase: {
        "intervention": [],
        "titre_complet": [],
        "profession": [],
        "features": [],
        "label": [],
    }
    for phase in ["train", "val", "test"]
}

for phase in tqdm(["train", "val", "test"]):
    phase_list = []
    for x, y in tqdm(loaders[phase]):
        batch_embeddings = {}
        x = {k: v.to(device) for k, v in x.items()}
        with torch.no_grad():  # pure inference, no autograd graph needed
            for k, v in x.items():
                if k != "features":
                    # Text inputs: keep the pooled BERT output of that field's encoder.
                    batch_embeddings[k] = embedder[k].bert(v)["pooler_output"].cpu().numpy()
                else:
                    # "features" is stored as-is, without going through BERT.
                    batch_embeddings["features"] = v.cpu().numpy()
        batch_embeddings["label"] = y.cpu().numpy()
        phase_list.append(batch_embeddings)

        # Also file the batch under its phase; the arrays are shared with
        # `phase_list`, not copied.
        for k, batch_array in batch_embeddings.items():
            embeddings[phase].setdefault(k, []).append(batch_array)

    # The on-disk format is unchanged: a list of per-batch dicts.
    with open(f"../../data/{bert_type}_embeddings_{phase}.pkl", "wb") as f:
        pickle.dump(phase_list, f)

In [None]:
# Pool the three phases (in val, train, test order) into one array per field.
# `embeddings` is expected to map phase -> field -> list of per-batch arrays:
# the lists are concatenated with `+` and then stacked. Labels are stacked
# with hstack, every other field with vstack.
embs = {
    key: (np.hstack if key == "label" else np.vstack)(
        embeddings["val"][key] + embeddings["train"][key] + embeddings["test"][key]
    )
    for key in ("intervention", "titre_complet", "profession", "features", "label")
}

# Cache the pooled arrays so later cells can reload them without recomputing.
with open("../../data/camembert_embeddings_array.pkl", "wb") as pkl_file:
    pickle.dump(embs, pkl_file)

In [5]:
# Reload the pooled CamemBERT embeddings cached on disk.
# NOTE: pickle.load executes arbitrary code; only use it on files produced locally.
with open("../../data/camembert_embeddings_array.pkl", "rb") as pkl_file:
    embs = pickle.load(pkl_file)

In [9]:
# Label entry of the loaded embeddings dict.
labs = embs["label"]

In [12]:
# Empirical frequency of each class; class ids are taken to be 0, 1 and 2.
n = len(labs)
class_ids = range(3)
probs = [np.sum(labs == class_id) / n for class_id in class_ids]

In [11]:
# Display the class frequencies.
# NOTE(review): the saved output lists four values, whereas `probs` is built
# over range(3) and has three entries; the output looks stale, re-run to refresh.
probs

[0.5000838942327529, 0.1402051420288101, 0.35971096373843703, 0.0]

In [14]:
# Shannon entropy (natural log) of the empirical class distribution.
# Classes with zero probability are dropped first: 0 * log(0) evaluates to nan
# in NumPy and would turn the whole sum into nan, whereas such classes
# contribute nothing to the entropy.
nonzero_probs = np.asarray(probs, dtype=float)
nonzero_probs = nonzero_probs[nonzero_probs > 0]
-np.sum(nonzero_probs * np.log(nonzero_probs))

0.9897897551342119

In [15]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
# Chance-level baseline: predictions are drawn at random following the class
# distribution seen in fit(). The draw is seeded so the baseline score is
# reproducible across runs.
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy.fit(embs["features"], embs["label"])

In [21]:
# Score the random baseline on the same samples it was fitted on.
y_pred = dummy.predict(X=embs["features"])

balanced_accuracy_score(y_true=embs["label"], y_pred=y_pred)

0.33242072528356476

In [23]:
# List the keys available in the loaded embeddings dict.
embs.keys()

dict_keys(['intervention', 'titre_complet', 'profession', 'features', 'label'])

In [24]:
# Re-split the pooled embeddings into stratified train / val / test sets.
test_pc = 0.3
val_pc = 0.3

# Split row indices rather than the arrays themselves, so the same partition
# can be applied to every entry of `embs`.
X = np.arange(len(labs))
y = labs

# First hold out the test set, stratified on the labels.
idx_train, idx_test, y_train, y_test = train_test_split(
    X, y, test_size=test_pc, stratify=y, random_state=42
)
# Then carve the validation set out of what is left: val_pc is a fraction of
# the remaining samples, not of the full dataset.
idx_train, idx_val, y_train, y_val = train_test_split(
    idx_train, y_train, test_size=val_pc, stratify=y_train, random_state=42
)

# Select the rows of each split from every stored array.
record_keys = ("intervention", "titre_complet", "profession", "features", "label")
train_records = {key: embs[key][idx_train] for key in record_keys}
test_records = {key: embs[key][idx_test] for key in record_keys}
val_records = {key: embs[key][idx_val] for key in record_keys}

In [25]:
# Write each re-split set to its per-phase pickle file. Each value is a dict
# of arrays, despite the `phase_list` name.
bert_type = "camembert"
for phase, phase_list in {
    "train": train_records,
    "val": val_records,
    "test": test_records,
}.items():
    with open(f"../../data/{bert_type}_embeddings_{phase}.pkl", "wb") as pkl_file:
        pickle.dump(phase_list, pkl_file)