# Plateforme Agnostique de Traitement et d'Analyse des Textes
### Carnet d'expérimentation
---

## Sujet : Camembert Embeddings

---

# Observations et environnement
---

## Environnement

In [1]:
_rs = 42

In [2]:
cd ../..

/Volumes/Geek/Work/Patat


In [3]:
import ast
import importlib
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
tqdm.pandas()

## Données

In [4]:
import patat.util.file

filename = 'data/prod/230517-OIDS-Label.pickle'

df_label = patat.util.file.pickle_load(filename)

In [5]:
labels = ['infox', 'entites_nommees', 'ouverture_esprit', 'faits', 'opinions',
       'propos_raportes', 'sources_citees', 'fausse_nouvelle', 'insinuations',
       'exageration', ]

In [6]:
df_label[labels].describe()

Unnamed: 0,infox,entites_nommees,ouverture_esprit,faits,opinions,propos_raportes,sources_citees,fausse_nouvelle,insinuations,exageration
count,900.0,804.0,803.0,804.0,804.0,803.0,803.0,802.0,802.0,552.0
mean,0.414444,0.618159,0.063512,0.717662,0.547264,0.244085,0.400996,0.15212,0.331671,0.317029
std,0.4929,0.48614,0.244033,0.450417,0.498071,0.429811,0.490406,0.359361,0.471107,0.465741
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Urls duppliquées

In [7]:
df_label.duplicated(subset='url').sum()

0

### Sites

In [8]:
df_label.value_counts('site')

site
www.francesoir.fr                    169
www.francetvinfo.fr                   91
www.breizh-info.com                   66
www.wikistrike.com                    62
lezarceleurs.blogspot.com             58
lesmoutonsrebelles.com                47
lemediaen442.fr                       32
www.profession-gendarme.com           28
lesdeqodeurs.fr                       28
fr.sott.net                           26
www.dreuz.info                        25
www.lelibrepenseur.org                23
www.polemia.com                       19
reseauinternational.net               17
actu.fr                               17
www.mondialisation.ca                 16
www.nouvelordremondial.cc             14
lesakerfrancophone.fr                 13
www.lesalonbeige.fr                   13
www.voltairenet.org                   12
lesobservateurs.ch                     9
www.anguillesousroche.com              9
lecourrier-du-soir.com                 9
www.cnews.fr                           9
www.preuves

# Experience
---

In [14]:
import torch

In [11]:
# Camembert version
from transformers import CamembertTokenizer, CamembertModel
model_name = 'camembert-base'
tokenizer = CamembertTokenizer.from_pretrained(model_name)
model = CamembertModel.from_pretrained(model_name)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def get_outputs(sentence,tokenizer,model):
    tokens = tokenizer.tokenize(sentence)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    max_length = 512  # Taille maximale de la phrase d'entrée
    input_ids = input_ids[:max_length]
    input_ids = input_ids + [0] * (max_length - len(input_ids))  # Padding
    attention_mask = [1] * len(input_ids)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Ajoute une dimension de lot
    attention_mask = torch.tensor(attention_mask).unsqueeze(0)  # Ajoute une dimension de lot
    outputs = model(input_ids, attention_mask=attention_mask)
    embeddings = outputs[0]  # Récupère les embeddings de la dernière couche cachée
    return outputs

In [16]:
sentence = 'Ceci est un test avec une phrase un peu plus longue. On verra ce que ca donne...'
outputs = get_outputs(sentence,tokenizer,model)

In [17]:
len(outputs)

2

In [22]:
outputs[0]

tensor([[[-0.0051,  0.0997,  0.1328,  ..., -0.0749, -0.0180, -0.0609],
         [-0.0768,  0.1755,  0.1299,  ..., -0.0244, -0.0549, -0.1113],
         [ 0.0987,  0.0403, -0.2107,  ..., -0.0356,  0.1181, -0.3143],
         ...,
         [-0.0051,  0.0997,  0.1328,  ..., -0.0749, -0.0180, -0.0609],
         [-0.0051,  0.0997,  0.1328,  ..., -0.0749, -0.0180, -0.0609],
         [-0.0832,  0.0825,  0.1666,  ..., -0.0583, -0.0570,  0.0176]]],
       grad_fn=<NativeLayerNormBackward0>)

In [23]:
NativeLayerNormBackward0

## Calcul des embeddings des textes

In [14]:
df_label['embeddings']=df_label['text'].progress_apply(lambda text: get_embeddings(text,tokenizer,model))

  0%|          | 0/904 [00:00<?, ?it/s]

## Prédiction infox

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split

In [17]:
def get_balanced_df_ml(label,df_label):
    df_0 = df_label[df_label[label] == 0]
    df_1 = df_label[df_label[label] == 1]
    min_sample = min(len(df_0),len(df_1))
    df_0=df_0.sample(min_sample,random_state=_rs)
    df_1=df_1.sample(min_sample,random_state=_rs)
    df_ml = pd.concat([df_0,df_1])
    df_ml = df_ml.sample(frac=1,random_state=_rs)
    return df_ml

In [18]:
df_ml = get_balanced_df_ml('infox',df_label)

In [20]:
import numpy as np

In [24]:
matrix = np.array([r['embeddings'] for i,r in df_ml.iterrows()])

In [25]:
X = pd.DataFrame(matrix)

In [26]:
y = df_ml['infox']

In [27]:
logreg = LogisticRegression(random_state=_rs, solver='lbfgs', multi_class='ovr', max_iter=1000)

In [28]:
scores = cross_validate(logreg, X, y, cv=4,scoring=('roc_auc','f1','accuracy','precision','recall'))

In [30]:
pd.DataFrame(scores).mean()

fit_time          0.044024
score_time        0.008357
test_roc_auc      0.838486
test_f1           0.748893
test_accuracy     0.760106
test_precision    0.784991
test_recall       0.718628
dtype: float64

## Prédiction liste de labels

In [31]:
labels = ['infox', 'entites_nommees', 'ouverture_esprit', 'faits', 'opinions',
       'propos_raportes', 'sources_citees', 'fausse_nouvelle', 'insinuations',
       'exageration', ]

In [32]:
def get_df_ml(label,df_label):
    return df_label[df_label[label].notna()]

In [33]:
def get_balanced_df_ml(label,df_label):
    df_0 = df_label[df_label[label] == 0]
    df_1 = df_label[df_label[label] == 1]
    min_sample = min(len(df_0),len(df_1))
    df_0=df_0.sample(min_sample,random_state=_rs)
    df_1=df_1.sample(min_sample,random_state=_rs)
    df_ml = pd.concat([df_0,df_1])
    df_ml = df_ml.sample(frac=1,random_state=_rs)
    return df_ml

In [37]:
def get_scores(label,df_ml):
    logreg = LogisticRegression(random_state=_rs, solver='lbfgs', multi_class='ovr', max_iter=1000)
    matrix = np.array([r['embeddings'] for i,r in df_ml.iterrows()])
    X = pd.DataFrame(matrix)
    y = df_ml[label]
    classifier = logreg
    scores = cross_validate(classifier, X, y, cv=4,scoring=('roc_auc','f1','accuracy','precision','recall'))
    df_scores=pd.DataFrame(scores)
    score_dic = df_scores.mean().to_dict()
    score_dic['label']=label
    score_dic['n_samples']=len(df_ml)
    return score_dic

In [38]:
score_list = []
for label in labels:
    print(f'Processing {label}')
    df_ml = get_balanced_df_ml(label,df_label)
    score_list.append(get_scores(label,df_ml))

Processing infox
Processing entites_nommees
Processing ouverture_esprit
Processing faits
Processing opinions
Processing propos_raportes
Processing sources_citees
Processing fausse_nouvelle
Processing insinuations
Processing exageration


In [39]:
pd.DataFrame(score_list)

Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_accuracy,test_precision,test_recall,label,n_samples
0,0.041651,0.007901,0.838486,0.748893,0.760106,0.784991,0.718628,infox,746
1,0.027566,0.007594,0.740149,0.681552,0.671028,0.662224,0.703435,entites_nommees,614
2,0.012312,0.007437,0.553008,0.461777,0.49,0.465659,0.471154,ouverture_esprit,102
3,0.028933,0.007489,0.633944,0.603768,0.605807,0.603735,0.608631,faits,454
4,0.033426,0.007813,0.720475,0.677674,0.679945,0.680962,0.675824,opinions,728
5,0.024109,0.007415,0.616097,0.564339,0.561224,0.559631,0.576531,propos_raportes,392
6,0.038597,0.008178,0.625849,0.563011,0.585404,0.596192,0.534182,sources_citees,644
7,0.018965,0.007454,0.69422,0.665156,0.655738,0.649601,0.687903,fausse_nouvelle,244
8,0.025813,0.007601,0.787992,0.69032,0.703008,0.722594,0.661635,insinuations,532
9,0.022115,0.007457,0.73957,0.655099,0.654323,0.656791,0.657241,exageration,350


# Sauvegarde des résultats
---

# Conclusions
---

# Bricolages
---

In [None]:
import patat.model.camembert

In [None]:
importlib.reload(patat.model.camembert)

In [None]:
model = patat.model.camembert.Camembert()

In [None]:
pd.DataFrame(matrix)

In [None]:
emb2 = model.get_embeddings('Voici est un autre texte')

In [None]:
import numpy as np

In [None]:
pd.DataFrame(np.array([df_label['embeddings'][0],df_label['embeddings'][1]]))

In [None]:
np.array([[1,2,3],[4,5,6]])