# Training the reader on SQuAD FR dataset

This notebook shows how to fine-tune a pre-trained BERT model on the SQuAD FR dataset.

***Original CDQA Note:*** *To run this notebook you will need to have access to GPU. The fine-tuning of the Reader was done with an AWS EC2 p3.2xlarge machine (GPU Tesla V100 16GB). It took about 2 hours to complete (2 epochs on SQuAD 1.1 train was enough to achieve SOTA results on SQuAD 1.1 dev).*

In [1]:
import os
import torch
import joblib
import json
import subprocess
import pandas as pd
from hurry.filesize import size
from bertqa_sklearn_fr import BertProcessor, BertQA

  from tqdm.autonotebook import tqdm, trange


### Check SQuAD FR dataset

In [2]:
# Location of the French SQuAD-style annotation file, relative to the notebook.
input_file = (
    './data/SQuAD_FR/'
    'annotation-04112019.json'
)

In [3]:
# Load the raw annotations for inspection.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding) so
# that accented French characters decode the same way on every platform instead
# of depending on the locale's default encoding.
with open(input_file, encoding='utf-8') as json_file:
    d = json.load(json_file)

In [4]:
# Show the questions attached to the first paragraph of the first entry.
first_paragraph = d[0]['paragraphs'][0]
first_paragraph['questions']

[{'text': 'Qui dirige le comté d’Armagh ?',
  'answers': [{'text': 'Lord Gosford', 'index': 174}]},
 {'text': 'Combien de catholiques quittent Armagh ?',
  'answers': [{'text': '7 000', 'index': 104}]},
 {'text': 'Quelle est la religion des exilés ?',
  'answers': [{'text': 'catholiques', 'index': 73}]},
 {'text': 'Qui voit ses biens confisqués ?',
  'answers': [{'text': 'un grand nombre de catholiques', 'index': 54}]},
 {'text': 'Quand s’exilent les a catholiques d’Armagh ?',
  'answers': [{'text': 'Durant les mois qui suivirent la bataille du Diamond',
    'index': 0}]}]

### Preprocess SQuAD examples

In [5]:
# Options for the training-mode processor.
processor_options = dict(
    bert_model='bert-base-uncased',
    do_lower_case=True,
    is_training=True,
)
train_processor = BertProcessor(**processor_options)

# Convert the annotation file into the examples/features pair used for training.
train_examples, train_features = train_processor.fit_transform(X=input_file)

### Train the model

In [6]:
# Release cached CUDA memory, then display how much is still allocated
# in human-readable form.
torch.cuda.empty_cache()
allocated_bytes = torch.cuda.memory_allocated()
size(allocated_bytes)

'0B'

In [7]:
# Fine-tuning settings passed explicitly to the reader.
reader_options = dict(
    train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=2,
    do_lower_case=True,
    output_dir='models',
)
reader = BertQA(**reader_options)

In [8]:
# The local GPU (2 GB total) does not have enough memory, so training is
# forced onto the CPU. Comment out this cell to use the GPU instead.
cpu_device = torch.device('cpu')
reader.model.to(cpu_device)
reader.device = cpu_device

In [9]:
# Fine-tune the reader on the preprocessed examples and features.
# The call is left as the last expression so its return value is displayed.
training_inputs = (train_examples, train_features)
reader.fit(X=training_inputs)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=90, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Iteration', max=90, style=ProgressStyle(description_width='in…





BertQA(bert_model='bert-base-uncased', do_lower_case=True, fp16=False,
       gradient_accumulation_steps=1, learning_rate=3e-05, local_rank=-1,
       loss_scale=0, max_answer_length=30, n_best_size=20, no_cuda=False,
       null_score_diff_threshold=0.0, num_train_epochs=2, output_dir='models',
       predict_batch_size=8, seed=42, server_ip='', server_port='',
       train_batch_size=12, verbose_logging=False,
       version_2_with_negative=False, warmup_proportion=0.1)

### Save model locally

In [11]:
# Persist the fitted reader inside its own output directory.
# The call is left as the last expression so the written file list is displayed.
model_path = os.path.join(reader.output_dir, 'bert_qa_fr.joblib')
joblib.dump(reader, model_path)

['models\\bert_qa_fr.joblib']