#### Author
Zalina Rusinova

#### Reference
[Notion ticket](https://www.notion.so/a74951e4e815480584dea7d61ddce6cc?v=dbfdb1207d0e451b827d3c5041ed0cfd&p=141d322a8f1a421fbb801755ea55caec)

#### Idea
Test different data augmentation methods.

#### Data
4500 cryptonews titles labeled as positive, neutral or negative – zipped pwd-protected [CSV](https://drive.google.com/file/d/1Apr3YPZVf0kOJ5Pc1RYDoQxTdjJPbnt4/view?usp=sharing) (not to be shared outside of the project!)

#### Result
So far, none of the data augmentation methods tried has yielded an increase in accuracy.

In [1]:
from datasets import load_dataset, load_metric
from transformers import (
    TrainingArguments, Trainer, 
    AdamW, get_scheduler,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
import yaml
from importlib import import_module
import os

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from typing import Dict, Any, Tuple, List, Union, Callable

import torch
from torch import Tensor
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer as Pl_trainer
from pytorch_lightning import seed_everything, Callback
from sklearn.metrics import classification_report
from sklearn.utils import shuffle

import nlpaug.augmenter.word as naw

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


import warnings
warnings.filterwarnings('ignore')

import logging
logging.disable(logging.INFO)
logging.disable(logging.WARNING)

import copy



seed_everything(42)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/victor/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


42

### Read data

In [2]:
DATA_PATH = '../data/20190110_train_4500.csv'

In [3]:
dataset = pd.read_csv(DATA_PATH)

In [4]:
dataset.head()

Unnamed: 0,title,sentiment
0,Bitcoin Market Has Run Out of Juice: Cryptocur...,Negative
1,Bitcoin Core 0.14.0 Speeds Up Blockchain Synci...,Positive
2,Thinking of Travelling With Bitcoin? With Thes...,Positive
3,Investors Carried Out Mental Gymnastics to Jus...,Negative
4,"Bitcoin Price Holds Above $8,500 as Market Fig...",Positive


In [5]:
# Encoder that maps the string sentiment classes to integer ids; it is kept in
# `le` after fitting.
le = LabelEncoder()

# Fit on the "sentiment" column and store the integer ids in a new "label" column.
# NOTE(review): in the outputs shown further down, Negative -> 0, Neutral -> 1,
# Positive -> 2 — confirm against `le.classes_` if the class set changes.
dataset["label"] = le.fit_transform(dataset["sentiment"])

### Config

In [6]:
def build_object(
    object_cfg: Dict[str, Any],
    is_hugging_face: bool = False,
    **kwargs: Any
) -> Any:
    """Instantiate the object described by a config entry.

    Args:
        object_cfg: Mapping with a mandatory ``"class"`` key holding a dotted
            import path (e.g. ``"transformers.TrainingArguments"``) and an
            optional ``"params"`` mapping of keyword arguments.
        is_hugging_face: When true, the object is created through the class's
            ``from_pretrained(**kwargs)`` constructor instead of calling the
            class directly.
        **kwargs: Extra keyword arguments supplied by the caller. Entries in
            ``object_cfg["params"]`` override same-named entries given here.

    Returns:
        Whatever the resolved class (or its ``from_pretrained``) returns.

    Raises:
        ValueError: If ``object_cfg`` has no ``"class"`` key.
    """
    if "class" not in object_cfg:
        raise ValueError("class key should be in config")

    # "params" may be missing, or present but empty: a bare ``params:`` line in
    # YAML loads as None, which must not be iterated.
    params = object_cfg.get("params") or {}

    # Config values take precedence over call-site keyword arguments.
    kwargs.update(params)

    target = get_instance(object_cfg["class"])

    if is_hugging_face:
        return target.from_pretrained(**kwargs)

    return target(**kwargs)


def get_instance(object_path: str) -> Callable:
    """Resolve a dotted path like ``"package.module.Name"`` to the named attribute.

    The text before the last dot is imported as a module; the text after it is
    looked up on that module. The attribute itself is returned, not called.
    A path without any dot fails on unpacking with ``ValueError``.
    """
    pieces = object_path.rsplit(".", 1)
    module_path, class_name = pieces

    return getattr(import_module(module_path), class_name)

In [74]:
cfg_str = """
training_args:
    class: transformers.TrainingArguments
    params:
        output_dir: './test_trainer'
        num_train_epochs: 3
        per_device_train_batch_size: 32
        per_device_eval_batch_size: 64
        warmup_steps: 500                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
        weight_decay: 0.001
        logging_steps: 10
        evaluation_strategy: epoch
        learning_rate: 0.00003


model_name: &model_name distilbert-base-uncased

tokenizer:
    class: transformers.DistilBertTokenizer
    params:
        pretrained_model_name_or_path: *model_name

model:
    class: transformers.DistilBertForSequenceClassification
    params:
        pretrained_model_name_or_path: *model_name
        num_labels: 3
"""

In [75]:
cfg = yaml.safe_load(cfg_str)

### Preprocess/split

In [9]:
class FinNewsDataset(Dataset):
    """Torch dataset pairing pre-computed encodings with their labels.

    ``encodings`` maps a field name to a sequence indexable by sample position
    (presumably the tokenizer output — see ``prepare_dataset``); ``labels``
    holds one label per sample and defines the dataset length.
    """

    def __init__(self, encodings: Dict[str, Any], labels: list):
        self.encodings = encodings
        self.labels = labels

    def __len__(self) -> int:
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return sample ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])

        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

In [37]:
def prepare_dataset(cfg: Dict[str, Any], data: pd.Series, labels: pd.Series) -> Dataset:
    """Tokenize texts and wrap them with their labels in a ``FinNewsDataset``.

    Args:
        cfg: Configuration whose ``"tokenizer"`` entry is passed to
            ``build_object`` (with ``is_hugging_face=True``) to create the tokenizer.
        data: Series of texts; its values are tokenized in one call with
            truncation and padding enabled.
        labels: Series of labels, one per text.

    Returns:
        A ``FinNewsDataset`` built from the encodings and the label list.
    """
    tokenizer = build_object(cfg["tokenizer"], is_hugging_face=True)

    texts = data.values.tolist()
    encodings = tokenizer(texts, truncation=True, padding=True)

    targets = labels.values.tolist()
    return FinNewsDataset(encodings, targets)

In [11]:
def split_train_val(
    dataset: pd.DataFrame,
    test_size: float = 0.2,
    random_state: Union[int, None] = None,
    stratify: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split the labelled frame into train and validation parts.

    Args:
        dataset: Frame containing at least a ``"label"`` column.
        test_size: Fraction of rows assigned to the validation part.
        random_state: Seed forwarded to ``train_test_split``. ``None`` (the
            default) keeps the previous behaviour of relying on NumPy's global
            random state.
        stratify: When true, preserve the label distribution in both parts by
            stratifying on ``dataset["label"]``. Off by default.

    Returns:
        ``(train_data, val_data, train_labels, val_labels)``: the two data
        parts are full frames (all columns, including ``"label"``), the two
        label parts are the matching ``"label"`` series.
    """
    train_data, val_data, train_labels, val_labels = train_test_split(
        dataset,
        dataset["label"],
        test_size=test_size,
        random_state=random_state,
        stratify=dataset["label"] if stratify else None
    )

    return train_data, val_data, train_labels, val_labels

In [12]:
train_data, val_data, train_labels, val_labels = split_train_val(dataset)

## Augmentation of training dataset

In [19]:
def get_augs(
    augmenter,
    data: pd.DataFrame,
    augs_frac: float = 0.1,
    aug_kwargs: Union[Dict[str, Any], None] = None,
    mult_aug: bool = False
) -> pd.DataFrame:
    """Append augmented copies of a random sample of rows and shuffle the result.

    Args:
        augmenter: Object exposing ``augment(...)`` (e.g. an nlpaug augmenter).
        data: Frame with at least ``"title"`` and ``"label"`` columns.
        augs_frac: Fraction of rows of ``data`` that are sampled for augmentation.
        aug_kwargs: Extra keyword arguments for ``augmenter.augment``. They are
            forwarded only when ``mult_aug`` is true.
        mult_aug: When true, each sampled title is augmented on its own and may
            yield several outputs (e.g. ``aug_kwargs={"n": 2}``). When false,
            all sampled titles are passed to the augmenter in a single call.

    Returns:
        A shuffled frame with a fresh 0..N-1 index assigned before shuffling,
        holding the original rows followed by the augmented ones. Augmented
        rows carry only ``"title"`` and ``"label"``; any other column of
        ``data`` is NaN for them.

    Raises:
        TypeError: If, with ``mult_aug``, the augmenter returns neither a
            string nor a list.
    """
    # Avoid a shared mutable default argument.
    aug_kwargs = {} if aug_kwargs is None else aug_kwargs

    samples_to_augment = data.sample(frac=augs_frac)

    # Nothing sampled (empty frame or tiny fraction): skip the augmenter.
    if samples_to_augment.empty:
        return shuffle(data.reset_index(drop=True))

    if mult_aug:
        aug_data = []
        aug_labels = []

        for title, label in zip(samples_to_augment["title"], samples_to_augment["label"]):
            aug_result = augmenter.augment(title, **aug_kwargs)

            # A bare string would otherwise be iterated character by character.
            if isinstance(aug_result, str):
                aug_result = [aug_result]
            elif not isinstance(aug_result, list):
                raise TypeError("Aug result should be list")

            # Repeat the label once per produced sentence so titles and labels
            # stay aligned whatever number of outputs the augmenter returns.
            aug_data.extend(aug_result)
            aug_labels.extend([label] * len(aug_result))
    else:
        aug_data = augmenter.augment(samples_to_augment["title"].tolist())
        aug_labels = samples_to_augment["label"].tolist()

    augs_df = pd.DataFrame({"title": aug_data, "label": aug_labels})

    # pd.concat(..., ignore_index=True) replaces DataFrame.append + reset_index,
    # which no longer exists in pandas 2.x.
    result = shuffle(pd.concat([data, augs_df], ignore_index=True))

    return result

### 1. Back Translation Augmenter

Back-translation translates a sentence from the source language into another language and then back into the source language (here English → German → English). The back-translated sentences are mixed with the original sentences to train the model.

In [20]:
# Back-translation augmenter: English -> German -> English using the two
# facebook/wmt19 translation models. Runs on the GPU when one is available and
# falls back to the CPU otherwise, so the cell also works on machines without CUDA.
back_translation_aug = naw.BackTranslationAug(
    from_model_name="facebook/wmt19-en-de", 
    to_model_name="facebook/wmt19-de-en",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

In [21]:
%%time
back_translation_data = get_augs(back_translation_aug, train_data)

CPU times: user 12.4 s, sys: 108 ms, total: 12.5 s
Wall time: 12.6 s


In [23]:
back_translation_data.head()

Unnamed: 0,title,sentiment,label
680,Australia: Tax Regulator Warns of Fraudulent R...,Negative,0
2070,Bitcoin Price Watch; Here Are Two Trades For T...,Neutral,1
471,Final Frontier? William Shatner Boldly Goes in...,Positive,2
2663,January 14th Will be Known as ;Bitcoin Cash Ch...,Neutral,1
1998,Reddit to Relaunch Bitcoin Payments (And Add M...,Neutral,1


### 2. Synonym Augmenter

Substitute words with their WordNet synonyms.

In this example, 2 augmented sentences are generated per sampled title, each with up to 3 words replaced by synonyms (these parameters can be changed).

In [51]:
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_max=3)

In [84]:
%%time
synonyms_data = get_augs(synonym_aug, train_data, aug_kwargs={"n": 2}, mult_aug=True, augs_frac=0.2)

CPU times: user 861 ms, sys: 12.2 ms, total: 873 ms
Wall time: 857 ms


In [85]:
synonyms_data.shape

(5101, 3)

In [86]:
synonyms_data.head()

Unnamed: 0,title,sentiment,label
1737,Why Major Crypto Exchanges are Granting Bitcoi...,Neutral,1
2399,Former FDIC Chair: BitcoinPolicies Shouldn't '...,Neutral,1
286,Arizona Bitcoin Trader Convicted for Crypto Mo...,Negative,0
936,Politician Ron Paul: US Government Should 'Sta...,Negative,0
4964,Bitcoins White person Paper: The Blueprint for...,,1


## Train pipeline

### Hugging Face api

In [76]:
def train_huggin_face_pipeline(
    cfg: Dict[str, Any],
    train_dataset: Dataset,
    val_dataset: Dataset
) -> None:
    """Build a model from the config and fine-tune it with the Hugging Face ``Trainer``.

    Args:
        cfg: Configuration with a ``"model"`` entry (created through
            ``build_object`` with ``is_hugging_face=True``) and a
            ``"training_args"`` entry (created through ``build_object``).
        train_dataset: Dataset used for training.
        val_dataset: Dataset passed to the trainer as ``eval_dataset``. When
            evaluation actually runs is governed by the training arguments.

    Returns:
        None. The trainer and model are local to this call and are not
        returned; progress and metrics are reported by the trainer itself.
    """
    model = build_object(cfg["model"], is_hugging_face=True)

    training_args = build_object(cfg["training_args"])

    def compute_metrics(eval_pred: tuple) -> dict:
        """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        # Plain NumPy accuracy: share of predictions equal to the reference
        # labels. Needs no metric-script loading on each call.
        return {"accuracy": float(np.mean(predictions == labels))}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

In [77]:
hg_val_dataset = prepare_dataset(cfg, val_data["title"], val_data["label"])

**baseline**

In [78]:
hg_train_dataset = prepare_dataset(cfg, train_data["title"], train_data["label"])

In [79]:
train_huggin_face_pipeline(cfg, hg_train_dataset, hg_val_dataset)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9451,0.888755,0.673985
2,0.6432,0.575787,0.774973
3,0.4819,0.529932,0.796926


**back translation**

In [80]:
hg_train_dataset = prepare_dataset(cfg, back_translation_data["title"], back_translation_data["label"])

In [81]:
train_huggin_face_pipeline(cfg, hg_train_dataset, hg_val_dataset)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8918,0.830054,0.688255
2,0.5477,0.562562,0.78595
3,0.4653,0.579991,0.783754


**synonyms**

In [87]:
hg_train_dataset = prepare_dataset(cfg, synonyms_data["title"], synonyms_data["label"])

In [88]:
train_huggin_face_pipeline(cfg, hg_train_dataset, hg_val_dataset)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7705,0.716458,0.726674
2,0.4914,0.568339,0.788145
3,0.4098,0.524468,0.807903
