# Training the reader on SQuAD FR dataset

This notebook shows how to fine-tune a pre-trained BERT model on the SQuAD.

***Original CDQA Note:*** *To run this notebook you will need to have access to GPU. The fine-tuning of the Reader was done with an AWS EC2 p3.2xlarge machine (GPU Tesla V100 16GB). It took about 2 hours to complete (2 epochs on SQuAD 1.1 train was enough to achieve SOTA results on SQuAD 1.1 dev).*

In [2]:
import torch
import joblib
import json
import subprocess
import pandas as pd
from bertqa_sklearn_fr import BertProcessor, BertQA
import re, os

### Check SQuAD FR dataset

In [20]:
input_file = './data/SQuAD_FR/annotations-24022020.json'

In [21]:
with open(input_file) as json_file:
    d = json.load(json_file)

In [22]:
d['data']

[{'title': 'Sport',
  'categorie': 'Sport',
  'wikipedia_page_id': '2713',
  'audience': 'restricted',
  'paragraphs': [{'context': "Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 milliards d'euros en 2003 et 12 milliards d'euros en 2019), contre 7,9 milliards d'euros pour les collectivités locales, 3,2 pour l'État, et 2,2 pour les entreprises. Parmi les dépenses sportives des ménages en 2003, 3,7 milliards sont consacrés aux vêtements de sport et chaussures, 2 aux biens durables, 2,7 aux autres biens et 5,8 aux services. Le Ministère de la Jeunesse et des Sports estime à 100 000 (58 % d'hommes pour 42 % de femmes) le nombre de salariés travaillant pour le secteur sportif en France pour quelque 20 000 employeurs.",
    'qas': [{'question': 'Combien de personnes travaillent au ministère des sports',
      'id': 1015,
      'answers': [{'text': '100 000', 'answer_start': 472}]},
     {'question': "Combien d'employeurs",
      'id': 1016,
      'answers': [{'text

In [15]:
questions = []
for p in d['data']:
    for par in p['paragraphs']:
        for q in par['qas']:
            questions.append(q['question'])

In [19]:
mean([len(x) for x in questions])

NameError: name 'mean' is not defined

In [6]:
len(d['data'])

300

### Preprocess SQuAD examples

In [10]:
train_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(X=input_file)

100%|███████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 702391.59B/s]


In [11]:
train_processor

BertProcessor(bert_model='bert-base-uncased', do_lower_case=True,
              doc_stride=128, is_training=True, max_query_length=64,
              max_seq_length=384,
              tokenizer=<pytorch_pretrained_bert.tokenization.BertTokenizer object at 0x00000218D3872EB8>,
              verbose=False, version_2_with_negative=False)

### Train the model

In [7]:
reader = BertQA(train_batch_size=6,
                learning_rate=3e-5,
                num_train_epochs=12,
                do_lower_case=True,
                output_dir='models')

In [8]:
# My GPU doesn't have engough memory (total 2GB), but comment this to use GPU instead of CPU
reader.model.to('cpu')
reader.device = torch.device('cpu')

In [None]:
reader.fit(X=(train_examples, train_features))

HBox(children=(IntProgress(value=0, description='Epoch', max=12, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Iteration', max=758, style=ProgressStyle(description_width='i…

### Save model locally

In [None]:
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_qa_fr.joblib'))