<a href="https://colab.research.google.com/github/guggio/bbc_news/blob/master/bbc_bert_farm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BBC Article Genre Classification with BERT using the FARM Framework

## Setup

In [1]:
# Install FARM, pinned to version 0.4.3
!pip install farm==0.4.3

Collecting farm==0.4.3
[?25l  Downloading https://files.pythonhosted.org/packages/0e/a9/b1f1ff65af01d5cd1d6df698e0c142ab3164afb1189b7cecd8075fee853b/farm-0.4.3.tar.gz (153kB)
[K     |████████████████████████████████| 163kB 3.5MB/s 
Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 23kB/s 
Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Collecting mlflow==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/01/ec/8c9448968d4662e8354b9c3a62e635f8929ed507a45af3d9fdb84be51270/mlflow-1.0.0-py3-none-any.whl (47.7MB)
[K     |████████████████████████████████| 47.7MB 56kB/s 
[?25hCollecting transformers==2.7.0
[?25l  Downloading https://files.pythonhosted.org/pa

In [2]:
# Clone the repository; later cells read their data from bbc_news/generated_data
!git clone https://github.com/guggio/bbc_news

Cloning into 'bbc_news'...
remote: Enumerating objects: 2190, done.[K
remote: Counting objects: 100% (2190/2190), done.[K
remote: Compressing objects: 100% (2168/2168), done.[K
remote: Total 2190 (delta 21), reused 2183 (delta 14), pack-reused 0[K
Receiving objects: 100% (2190/2190), 5.15 MiB | 17.87 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [3]:
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import MultiLabelTextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
import logging
import pandas as pd

05/29/2020 09:05:03 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
05/29/2020 09:05:04 - INFO - transformers.file_utils -   TensorFlow version 2.2.0 available.


In [4]:
# FARM can log parameters and metrics for us; here MLflow tracks the experiment.
# The results will show up on https://public-mlflow.deepset.ai/

ml_logger = MLFlowLogger(
    tracking_uri="https://public-mlflow.deepset.ai/",
)
ml_logger.init_experiment(
    experiment_name="BBC_Articles",
    run_name="BBC News Articles",
)


 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


In [5]:
# Training hyperparameters used by the cells below
n_epochs = 2
batch_size = 8
evaluate_every = 100  # evaluation interval handed to the Trainer

# Fix the random seeds, then select the device (CUDA is requested)
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)

05/29/2020 09:05:07 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None


## Building our own blocks

### Tokenizer

In [6]:
# Name of the pretrained checkpoint; it is a cased model, so lower-casing stays off
lang_model = "bert-base-cased"
do_lower_case = False

tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

05/29/2020 09:05:07 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
05/29/2020 09:05:07 - INFO - filelock -   Lock 140458994706136 acquired on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock
05/29/2020 09:05:07 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp469ope3r


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…

05/29/2020 09:05:07 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt in cache at /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
05/29/2020 09:05:07 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
05/29/2020 09:05:07 - INFO - filelock -   Lock 140458994706136 released on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock
05/29/2020 09:05:07 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /root/.cache/torch/transf




### Data Processor

In [0]:
# Labels that occur in our data set
label_list = ['entertainment', 'sport', 'politics', 'business', 'tech']
# Desired metric for evaluation
metric = "f1_macro"

processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=512,  # BERT can only handle sequence lengths of up to 512
    data_dir='bbc_news/generated_data',
    label_list=label_list,
    label_column_name="genre",  # the labels are located in the "genre" column
    metric=metric,
    quote_char='"',
    multilabel=True,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1,  # extracts 10% of the train set to create a dev set
)

In [8]:
# The data silo loads the data sets through the processor, using our batch size
data_silo = DataSilo(processor=processor, batch_size=batch_size)

05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   Loading train set from: bbc_news/generated_data/train.tsv 
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   Got ya 1 parallel workers to convert 1780 dictionaries to pytorch datasets (chunksize = 356)...
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -    0 
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   /w\
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   /'\
05/29/2020 09:05:08 - INFO - farm.data_handler.data_silo -   
Preprocessing Dataset bbc_news/generated_data/train.tsv:   0%|          | 0/1780 [00:00<?, ? Dicts/s]05/29/2020 09:05:13 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
05/29/2020 09:05:13 - INFO - fa

### Modeling

In [9]:
# Pretrained BERT base cased language model
language_model = LanguageModel.load(lang_model)
# Prediction head for classifying news article genres: one output per entry in label_list
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],
    device=device,
)

05/29/2020 09:05:41 - INFO - filelock -   Lock 140456152004200 acquired on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391.lock
05/29/2020 09:05:41 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpzm6jjxcq


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

05/29/2020 09:05:42 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json in cache at /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
05/29/2020 09:05:42 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
05/29/2020 09:05:42 - INFO - filelock -   Lock 140456152004200 released on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391.lock
05/29/2020 09:05:42 - INFO - filelock -   Lock 140456304074312 acquired on /root/.cache/torch/transformers/35d8b9d36faaf46728a0192d82bf7d00137490cd6074e8500778afed552a67e5.3fadbea36527ae472139f




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…

05/29/2020 09:05:48 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin in cache at /root/.cache/torch/transformers/35d8b9d36faaf46728a0192d82bf7d00137490cd6074e8500778afed552a67e5.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2
05/29/2020 09:05:48 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/35d8b9d36faaf46728a0192d82bf7d00137490cd6074e8500778afed552a67e5.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2
05/29/2020 09:05:48 - INFO - filelock -   Lock 140456304074312 released on /root/.cache/torch/transformers/35d8b9d36faaf46728a0192d82bf7d00137490cd6074e8500778afed552a67e5.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2.lock
05/29/2020 09:05:48 - INFO - transformers.modeling_utils -   loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin from cache at /root




	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
05/29/2020 09:05:50 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 5]


In [10]:
# n_batches is the number of batches in the train loader of the data silo
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=3e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
)

05/29/2020 09:05:59 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
05/29/2020 09:06:00 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
05/29/2020 09:06:00 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 35.6, 'num_training_steps': 356}'


### Training

In [0]:
# Bundle model, optimizer, data and schedule into a Trainer
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device,
)

In [12]:
# Run the training with the Trainer configured above
trainer.train()

05/29/2020 09:06:02 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/2 (Cur. train loss: 0.1582):  56%|█████▌    | 100/178 [00:49<00:35,  2.22it/s]
Evaluating: 100%|██████████| 45/45 [00:06<00:00,  7.22it/s]
  _warn_prf(average, modifier, msg_start, len(result))
05/29/2020 09:06:58 - INFO - farm.eval -   

\\|//       \\|// 

AdaptiveModel(
  (language_model): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
            

## Saving and Inferencing

In [0]:
# Save the model and the processor into the same directory
save_dir = "saved_models/bert-english-news-article"
model.save(save_dir)
processor.save(save_dir)

In [0]:
# Zip the saved model directory so the model can be downloaded as a single file
!zip -r saved_models/model.zip saved_models/bert-english-news-article

In [14]:
# Load an Inferencer from the directory the model was saved to
inferenced_model = Inferencer.load(save_dir)

05/29/2020 09:11:40 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
05/29/2020 09:11:40 - INFO - transformers.modeling_utils -   loading weights file saved_models/bert-english-news-article/language_model.bin from cache at saved_models/bert-english-news-article/language_model.bin
05/29/2020 09:11:42 - INFO - farm.modeling.adaptive_model -   Found files for loading 1 prediction heads
05/29/2020 09:11:42 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 5]
05/29/2020 09:11:42 - INFO - farm.modeling.prediction_head -   Loading prediction head from saved_models/bert-english-news-article/prediction_head_0.bin
05/29/2020 09:11:42 - INFO - transformers.tokenization_utils -   Model name 'saved_models/bert-english-news-article' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual

In [0]:
def read_file(file_name: str) -> dict:
  """Read a text file and wrap its content in a dict with a 'text' key.

  Args:
    file_name: path of the text file to read.

  Returns:
    A dict with the single key 'text' whose value is the file content with
    every newline replaced by a space.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # The context manager closes the handle even if reading fails.
  with open(file_name, 'r') as text_file:
    content = text_file.read()
  return {'text': content.replace('\n', ' ')}

In [0]:
def create_input(text_files:list) -> list:
  """Build the model input from a list of article descriptions.

  Args:
    text_files: list of dicts, each with a 'file' key holding a file path.

  Returns:
    A list with one read_file() result per entry, in the same order.
  """
  return [read_file(entry['file']) for entry in text_files]

In [0]:
def create_result_overview(articles: list, result: list) -> pd.DataFrame:
  """Compare the actual genre of each article with the predicted label.

  Args:
    articles: list of dicts with the keys 'file' and 'genre'.
    result: list of dicts, each holding a 'predictions' list whose items
      carry a 'label' entry. The predictions of all entries are
      concatenated in order, so the i-th prediction overall belongs to the
      i-th article, however they are spread over the entries of result.

  Returns:
    A DataFrame with the columns 'file', 'actual' and 'prediction', one row
    per article.

  Raises:
    IndexError: if result holds fewer predictions than there are articles.
  """
  # Flatten across all entries instead of reading only result[0], which fails
  # once the predictions are spread over more than one entry.
  flat_predictions = [prediction
                      for batch in result
                      for prediction in batch['predictions']]

  files = list()
  labels = list()
  predictions = list()
  for i, article in enumerate(articles):
    files.append(article['file'])
    labels.append(article['genre'])
    # The label arrives as a string such as "['sport']"; strip the list decoration.
    predictions.append(flat_predictions[i]['label'].strip("'[]'"))

  return pd.DataFrame({'file': files, 'actual': labels, 'prediction': predictions})

In [52]:
# Sample articles to classify, each paired with its actual genre
articles = [
    {'file': 'bbc_news/generated_data/inferencing/business.txt', 'genre': 'business'},
    {'file': 'bbc_news/generated_data/inferencing/sport.txt', 'genre': 'sport'},
]

# Read the texts, run the loaded model on them and tabulate actual vs. predicted genre
article_texts = create_input(articles)
result = inferenced_model.inference_from_dicts(article_texts)
df = create_result_overview(articles, result)

df.head()

Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.54s/ Batches]


Unnamed: 0,file,actual,prediction
0,bbc_news/generated_data/inferencing/business.txt,business,business
1,bbc_news/generated_data/inferencing/sport.txt,sport,sport
