# Workshop WDL

In [None]:
!git clone https://github.com/dlite-tools/nlp-training

In [None]:
%cd nlp-training/

In [None]:
!git pull

In [None]:
!pip3 install -r requirements.txt

In [None]:
import os
import tempfile

import torch
from torchtext.datasets import AG_NEWS
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    LearningRateMonitor
)

from inference.architectures.text_classification import BaselineModel
from inference.data_processors.transformers import BaseTransformer
from inference.data_processors.processor import Processor
from inference.data_processors.transformers.preprocessing import VocabTransform
from training.trainer import TextClassificationTrainer
from training.datasets.text_classification import AGNewsDataModule

## Hyper-parameters and other Settings


In [None]:
# --- Hyper-parameters -------------------------------------------------------
NUMBER_CLASSES = 4  # forwarded as `num_class` to the model and the trainer module
N_EPOCHS = 10       # upper bound on epochs (`max_epochs` of the Trainer)
EMBED_DIM = 64      # forwarded as `embed_dim` to the model
BATCH_SIZE = 64     # forwarded to the data module
NUM_WORKERS = 8     # forwarded to the data module

# --- Callbacks --------------------------------------------------------------
# Both the checkpointing and the early-stopping callbacks watch the
# "valid_loss" metric and treat lower values as better.
model_checkpoint = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    save_weights_only=True,
)
early_stop_callback = EarlyStopping(
    monitor="valid_loss",
    mode="min",
    patience=4,
)
learning_rate_monitor = LearningRateMonitor()

# --- Experiment logger ------------------------------------------------------
mf_logger = MLFlowLogger(
    experiment_name="AG News - Text Classification",
    run_name="Baseline",
)

## Data transformation pipeline

## Build a Tokenizer

Implement a class that inherits from `BaseTransformer`.

This class must implement the `__call__` method, which receives
a string and returns a list of strings.

In [None]:
from typing import List


class Tokenize(BaseTransformer):
    """Transformer that turns a raw text string into a list of string tokens.

    Workshop exercise: ``__call__`` is left unimplemented on purpose and
    currently returns ``None``. Fill it in before running the cells that
    build the processor and the vocabulary.
    """

    def __call__(self, text: str) -> List[str]:
        """Split ``text`` into a list of strings.

        Args:
            text: The string to tokenize.

        Returns:
            The pieces of ``text`` as a list of strings (once implemented;
            the placeholder body returns ``None``).
        """
        # Receive a string and split it into a list of strings
        pass

### Processor

The processor is an object that sequentially applies the transformations to the data.

In [None]:
# Pipeline: tokenize first, then hand the tokens to the vocabulary transform.
# `vocab` is kept in its own variable because it is queried again below.
vocab = VocabTransform()
preprocessing = [Tokenize(), vocab]
processor = Processor(preprocessing=preprocessing)

# Build the vocabulary from the AG_NEWS training split, passing the processor
# along so the vocabulary transform can use the pipeline defined above.
vocab.build_vocab(processor, AG_NEWS(split='train'))

print(f"\nVocabulary as size of {len(vocab)}.")

Try modifying `Tokenize` so that it also removes punctuation symbols.

(tip: use `from string import punctuation` to get the punctuation that should be removed)


## Setup data module

In [None]:
# The data module is bound to the `processor` object that exists right now;
# rebinding the name `processor` later does not change this instance.
data_module = AGNewsDataModule(
    processor=processor,
    num_workers=NUM_WORKERS,
    batch_size=BATCH_SIZE,
)


## Model and Model Trainer

In [None]:
# Baseline classifier; `vocab_size` is taken from the vocabulary built above.
model = BaselineModel(vocab_size=len(vocab), embed_dim=EMBED_DIM, num_class=NUMBER_CLASSES)

# Lightning module wrapping the model for training.
model_trainer = TextClassificationTrainer(
    model=model,
    num_class=NUMBER_CLASSES
)

trainer = Trainer(
    # `learning_rate_monitor` is included so this run uses the same callback
    # set as the NLPiper run further down and both runs log the same data.
    callbacks=[model_checkpoint, early_stop_callback, learning_rate_monitor],
    max_epochs=N_EPOCHS,
    logger=mf_logger,
    # Use every visible CUDA device; evaluates to 0 when none is available.
    gpus=torch.cuda.device_count(),
)

## Training and testing

In [None]:
# Train the wrapped model on the data provided by the data module.
trainer.fit(model_trainer, data_module)
# Run the test loop. No model is passed here, so Lightning decides which
# weights are evaluated (presumably the best checkpoint - confirm for the
# installed Lightning version).
trainer.test(datamodule=data_module)

## Let's try adding data augmentation to our training

To do that, you can create a new transform based on `Tokenize`.
Since we only want to apply it to the training set, we should set the `_data_aug` attribute to `True`.

Tips:

```
class SentenceAugmentation(BaseTransformer):
    _data_aug = True

    def __call__(self, text: str) -> str:
        pass
```

```
>>> import nltk
>>> import nlpaug.augmenter.word as naw
>>> nltk.download('wordnet')
>>> nltk.download('omw-1.4')
>>> nltk.download('averaged_perceptron_tagger')
>>> aug = naw.SynonymAug()
>>> aug.augment('This is our random workshop on World Data League.')
'This is our random workshop on Earthly concern Datum League.'
```

## Using NLPiper Integration

In [None]:
import nlpiper
from inference.data_processors.transformers.preprocessing import NLPiperIntegration

# Second pipeline: an NLPiper composition (punctuation cleaning, basic
# tokenization, token case normalisation) followed by a new vocabulary
# transform. Note that `vocab`, `preprocessing` and `processor` are rebound
# here; objects constructed earlier from the previous `processor` keep
# referring to the old one.
vocab = VocabTransform()
preprocessing = [
    NLPiperIntegration(
        pipeline=nlpiper.core.Compose(
            [
                nlpiper.transformers.cleaners.CleanPunctuation(),
                nlpiper.transformers.tokenizers.BasicTokenizer(),
                nlpiper.transformers.normalizers.CaseTokens(),
            ]
        )
    ),
    vocab,
]
processor = Processor(preprocessing=preprocessing)

# Build the new vocabulary from the AG_NEWS training split.
vocab.build_vocab(processor, AG_NEWS(split='train'))

print(f"Vocabulary as size of {len(vocab)}.")

In [None]:
# Separate logger so this run is recorded under its own run name.
mf_logger = MLFlowLogger(
    experiment_name="AG News - Text Classification",
    run_name="Baseline w/NLPiper",
)

# Rebuild the data module. The instance created earlier was constructed with
# the previous `processor`, so it would keep feeding batches produced by the
# old pipeline/vocabulary into a model sized for the new vocabulary.
data_module = AGNewsDataModule(processor=processor, num_workers=NUM_WORKERS, batch_size=BATCH_SIZE)

# Use fresh callback instances for this run. The ones used by the baseline
# run keep their monitoring state (best score, patience counter, best
# checkpoint path), which could stop this run early or make the test step
# pick up a checkpoint from the baseline model.
model_checkpoint = ModelCheckpoint(monitor="valid_loss", mode="min", save_weights_only=True)
early_stop_callback = EarlyStopping(monitor="valid_loss", mode="min", patience=4)

# New model; `vocab_size` is taken from the NLPiper-based vocabulary.
model = BaselineModel(vocab_size=len(vocab), embed_dim=EMBED_DIM, num_class=NUMBER_CLASSES)

model_trainer = TextClassificationTrainer(
    model=model,
    num_class=NUMBER_CLASSES
)

trainer = Trainer(
    callbacks=[model_checkpoint, early_stop_callback, learning_rate_monitor],
    max_epochs=N_EPOCHS,
    logger=mf_logger,
    # Use every visible CUDA device; evaluates to 0 when none is available.
    gpus=torch.cuda.device_count(),
)

In [None]:
# Train the NLPiper-based model with the trainer configured in the cell above.
trainer.fit(model_trainer, data_module)
# Run the test loop. No model is passed here, so Lightning decides which
# weights are evaluated (presumably the best checkpoint - confirm for the
# installed Lightning version).
trainer.test(datamodule=data_module)

## Zip MLFlow logs

In [None]:
 !zip -r mlruns.zip mlruns/