In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from transformers import BertTokenizerFast, BertForSequenceClassification
df = pd.read_csv('data_tokenized_2609_doctr_trad V2.txt', sep = '\t')

In [2]:
df.head()

Unnamed: 0,filename,type,size,width,height,mots_doctr,nb_mots,mots_concat,Langue,mots_doctr_trad
0,img_0000000.jpg,facture,59453,750,1061,"['factur', 'logo', 'joanner', 'binet', 'couber...",55,factur logo joanner binet coubertin pari factu...,fr,"['bill', 'logo', 'joanner', 'binet', 'couberti..."
1,img_0000001.jpg,facture,55157,750,1061,"['joanner', 'binet', 'coubertin', 'pari', 'fac...",53,joanner binet coubertin pari factur facturé ce...,fr,"['joanner', 'binet', 'coubertine', 'bet', 'bil..."
2,img_0000002.jpg,facture,32097,726,1024,"['factur', 'entreprise', 'nom', 'société', 'ad...",28,factur entreprise nom société adresse postal a...,fr,"['bill', 'enterprise', 'name', 'company', 'Add..."
3,img_0000003.jpg,facture,63937,750,1061,"['joanner', 'binet', 'coubertin', 'pari', 'fac...",53,joanner binet coubertin pari factur cendrillon...,fr,"['joanner', 'binet', 'coubertine', 'bet', 'bil..."
4,img_0000004.jpg,facture,79474,773,771,"['payer', 'ligne', 'factur', 'sfideli', 'date'...",63,payer ligne factur sfideli date création date ...,fr,"['pay', 'line', 'bill', 'sfideli', 'date', 'es..."


In [3]:
df.shape

(1134, 10)

On s'assure tout d'abord que le dataframe est bien lisible et utilisable.

In [4]:
import re
r2 = re.compile(r"[^a-zA-Zéèàùôâêëäïöü]")
for i in range(df.shape[0]):
    df['mots_doctr_trad'][i] = r2.sub(' ', str(df['mots_doctr_trad'][i]))
    df['mots_doctr_trad'][i] = df['mots_doctr_trad'][i].replace("   ", " ").replace("  ", " ").lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mots_doctr_trad'][i] = r2.sub(' ', str(df['mots_doctr_trad'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mots_doctr_trad'][i] = df['mots_doctr_trad'][i].replace("   ", " ").replace("  ", " ").lower()


In [5]:
df.head()
df['mots_concat'][0]

'factur logo joanner binet coubertin pari factur cendrillon ayot rue nation pari envoye cendrillon ayot rue ferréol lle france factur date commande ance prix unit total tva total montant designation grand brun escargot manger petit marinière uniforme bleu facile jouer accordéon buncht condition modalit paiemer paiemer jour caisse epargne iban swift bic abcdfrp xxx merci '

In [6]:
df['mots_doctr_trad'][0]

' bill logo joanner binet coubertine bet bill butterfly ayot street nation bet send butterfly ayot street ferreol it s all right  france bill date command benance prices unit total tva total amount designation large brown snail eat small marine uniform blue easy play accordion buncht condition modality paying paying day body saving iban swift bic abcdfrp xxx thank you '

On supprime les colonnes qui ne seront pas utiles dans cette analyse.

In [7]:
df = df.drop(['mots_doctr', 'nb_mots', 'Langue', 'mots_concat'], axis = 1)

In [20]:
df.head()

Unnamed: 0,filename,type,size,width,height,mots_doctr_trad,type_num
0,img_0000000.jpg,facture,59453,750,1061,bill logo joanner binet coubertine bet bill b...,0
1,img_0000001.jpg,facture,55157,750,1061,joanner binet coubertine bet bill billed butt...,0
2,img_0000002.jpg,facture,32097,726,1024,bill enterprise name company address postal a...,0
3,img_0000003.jpg,facture,63937,750,1061,joanner binet coubertine bet bill butterfly a...,0
4,img_0000004.jpg,facture,79474,773,771,pay line bill sfideli date establishment date...,0


On recréé une colonne mots_concat à partir de la traduction.

Enfin, en se basant sur l'interprétation des résultats de l'itération précédente, on va rassembler les catégories "rrc.cvc", "scientific_publication" et "scientific_report", très similaires et difficiles à dissocier.

In [9]:
df['type'] = df['type'].replace(['rrc.cvc', 'scientific_publication', 'scientific_report'], ['scientific_doc', 'scientific_doc', 'scientific_doc'])

In [10]:
df['type'].value_counts()

scientific_doc     351
paye               145
id_pieces           82
carte postale       70
facture             66
passeport           43
handwritten         35
news_article        32
memo                31
questionnaire       31
resume              30
letter              29
budget              25
presentation        25
specification       24
invoice             24
advertisement       23
justif_domicile     23
email               23
form                22
Name: type, dtype: int64

In [11]:
df['type_num'] = df.type.replace(df.type.unique().tolist(), [x for x in range(len(df['type'].unique()))])

On sépare le dataset en un ensemble d'entrainement et un ensemble de test.

In [12]:
from sklearn.model_selection import train_test_split
X = df.mots_doctr_trad
y = df.type_num

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

Puis on vient charger un modèle BERT pré-entrainé.

In [13]:
from transformers import BertTokenizerFast, BertForSequenceClassification

model = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model, do_lower_case=True)

Et on encode les informations afin qu'elles soient utilisables par le modèle.

In [15]:
max_length = 512

X_train_encoded = tokenizer(
    list(X_train), truncation=True, padding=True, max_length=max_length)
X_test_encoded = tokenizer(list(X_test), truncation=True,
                           padding=True, max_length=max_length)

In [16]:
#pip install torch
np.array(y_train)

array([ 0,  5,  6,  6, 15,  6,  6, 11, 11,  3,  6, 12, 15,  8,  6, 14,  4,
       18,  6,  6, 13, 14,  6,  6, 16,  4,  5,  6,  0,  0, 10, 11,  9,  4,
        4,  7, 10, 14,  4, 19,  5,  6,  6, 12,  4,  4,  0,  6,  0,  1,  0,
        5,  3,  1,  6,  4,  5,  1,  6, 19,  4, 15,  6,  6,  4,  5, 10,  6,
        8, 17,  6,  9,  6,  0, 15,  1, 16, 19, 17,  4, 13,  6, 17,  3, 10,
        4,  1,  6, 16,  6, 12, 18, 18,  4,  6,  6,  6, 17,  1,  6,  5, 17,
       19,  6,  6,  0,  2,  4,  4,  4,  6, 18,  6,  6, 11, 13,  6, 18,  6,
        6,  0,  5,  6, 14,  5,  4,  6,  4, 10,  4,  4, 10,  6,  4, 12,  6,
       14,  4,  6, 11,  1,  1,  6,  0,  7,  6,  4,  3,  6,  6, 17,  5,  1,
        1,  6,  1, 17,  6,  6, 19,  6, 19,  6,  4,  5,  0,  4, 12,  6,  1,
        1,  6, 14,  4,  6,  6,  0,  4, 11,  4,  6,  4,  6,  1, 10,  6,  6,
        1,  9,  6,  4, 19,  4,  6,  0,  6, 14,  3,  6,  5,  1,  5,  6,  6,
       11, 17,  6,  4,  6,  3, 17, 13,  1,  6,  6,  4, 19,  9, 18, 15,  3,
        9,  6,  1,  6,  6

Le modèle pré-entrainé se base sur une classe "NewsGroupsDataset", permettant de classifier les thèmes de différents articles de journaux.

In [17]:
import torch


class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = NewsGroupsDataset(X_train_encoded, np.array(y_train))
test_dataset = NewsGroupsDataset(X_test_encoded, np.array(y_test))

In [18]:
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

target = list(df['type_num'].unique())

model = BertForSequenceClassification.from_pretrained(
    model, num_labels=len(target))
training_args = TrainingArguments(
    output_dir='./results',          # chemin de stockage des résultats
    num_train_epochs=3,              # nombre d'époques pour l'entraînement
    per_device_train_batch_size=10,  # batch size pour l'entraînement
    per_device_eval_batch_size=10,   # batch size pour l'évaluation du modèle
    warmup_steps=500,                # nombre d'étapes pour le pas d'apprentissage
    weight_decay=0.01,               # paramètre décidant des poids
    logging_dir='./logs',            # chemin de stockage des logs
    # utilisation du meilleur modèle à l'issu de l'entraînement
    load_best_model_at_end=True,
    logging_steps=400,               # log & enregistrer les poids à chaque étape
    save_steps=400,
    evaluation_strategy="steps",     # évaluation à chaque `logging_steps`
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

On entraine le modèle, sur seulement 3 epochs car il est particulièrement fastidieux à exécuter.

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()
trainer.evaluate()

***** Running training *****
  Num examples = 907
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 273


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 227
  Batch size = 10


{'eval_loss': 1.355896234512329,
 'eval_runtime': 289.1585,
 'eval_samples_per_second': 0.785,
 'eval_steps_per_second': 0.08,
 'epoch': 3.0}

On définit une fonction qui nous permet de tester ce modèle : cette fonction prend en entrée un texte (par exemple l'OCR-isation d'un document) et en ressort une classe identifiée. Ici, on va utiliser des textes inventés, supposés être représentatifs de certaines classes de notre dataset.

In [31]:
def pipeline(text):
    inputs = tokenizer(text, padding=True, truncation=True,
                       max_length=max_length, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return list(df['type_num'].unique())[probs.argmax()]

D'abord une carte postale :

In [41]:
text = 'Hello mom and dad, we have arrived at Biarritz where the sun is shining and everything is cool. See you soon, kisses, bye'

In [42]:
print(pipeline(text))

5


Ici le type a bien été détecté, la correspondance index/type est décrite ci-dessous.

In [38]:
df[['type', 'type_num']].value_counts()

type             type_num
scientific_doc   6           351
paye             4           145
id_pieces        1            82
carte postale    5            70
facture          0            66
passeport        3            43
handwritten      17           35
news_article     15           32
questionnaire    19           31
memo             12           31
resume           10           30
letter           9            29
presentation     16           25
budget           14           25
specification    11           24
invoice          13           24
justif_domicile  2            23
email            18           23
advertisement    8            23
form             7            22
dtype: int64

Puis une facture :

In [60]:
text2 = 'Frank Bennett, Invoice, 29 cups, $56,  Total without tax : $152'
print(pipeline(text2))

6


Puis un email :

In [43]:
text3 = 'To : Maxwell Grant From : Jack Nicholson Subject : Payment missed last month Message : Hi Max, you have missed last month payment, please correct ASAP. Regards,'
print(pipeline(text3))

6


On voit que dans ces deux derniers cas, le modèle semble avoir le même biais que dans notre analyse principale : s'il n'est pas sûr, il classe les documents dans la classe prépondérante, à savoir le document scientifique.
Le modèle ne semble pas particulièrement robuste.