# Sequence classification with *NeuralNetTrainer* #

### To fine-tune a model using *NeuralNetTrainer*, we need to follow four steps: ###
1. initialize backbone
2. prepare data
3. initialize the optimizer and specify the forward, loss and accuracy functions for the BERT-style model
4. initialize model with *NeuralNetTrainer*

After these steps, the model can be fitted.


In [None]:
import torch
import pytorch_lightning as pl
from transformers import  AdamW
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datetime import datetime
from NeuralNetTrainer import NeuralNetTrainer

from torchmetrics.functional import accuracy

#### 1. initialize backbone ####
The backbone is `xlm-roberta-base` from [Hugging Face](https://huggingface.co/xlm-roberta-base), loaded with a 5-class sequence-classification head.

In [None]:
# Tokenizer and classifier are loaded from the same checkpoint; the
# classification head is sized for 5 labels and the model is asked not to
# return attention maps or hidden states.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=5,
    output_hidden_states=False,
    output_attentions=False,
)

#### 2. prepare data ####

In [3]:
def from_numbers_to_string(x):
    """Map an age expressed in days to a coarse recency label.

    Buckets (lower bound inclusive, upper bound exclusive):
    below 30 -> "new", 30-75 -> "recently", 75-120 -> "not long ago",
    120-165 -> "middle", 165-200 -> "almost old", 200 and above -> "old".

    Returns None when ``x`` compares false against every bound (e.g. NaN).
    """
    # (exclusive upper bound, label), in ascending order.
    age_buckets = (
        (30, "new"),
        (75, "recently"),
        (120, "not long ago"),
        (165, "middle"),
        (200, "almost old"),
    )
    for upper_bound, label in age_buckets:
        if x < upper_bound:
            return label
    if x >= 200:
        return "old"
    return None

def load_data(path, reference_date=datetime(2022, 3, 20)):
    """Load a review CSV and build one model-input string per row.

    Rows containing any missing value are dropped. For every remaining row
    the first 10 characters of the 'at' column are parsed as a date, turned
    into an age in days relative to ``reference_date`` and mapped to a
    recency label with ``from_numbers_to_string``. The row's 'content' text
    and that label are then joined into a single string.

    Args:
        path: Location of the CSV file; it must have 'content' and 'at'
            columns and may have a 'score' column.
        reference_date: The "now" against which review ages are measured.
            Defaults to 2022-03-20.

    Returns:
        ``(sentences, labels)`` when the CSV has a 'score' column, where
        ``labels`` holds ``score - 1`` as plain ints; otherwise only the
        list ``sentences``.
    """
    df = pd.read_csv(path).dropna()

    def age_in_days(stamp):
        # Only the leading date part is used; '-' separators are normalised
        # to spaces so both 'YYYY-MM-DD' and 'YYYY MM DD' prefixes parse.
        posted = datetime.strptime(str(stamp[0:10]).replace('-', ' '), '%Y %m %d')
        return (reference_date - posted).days

    recency = [from_numbers_to_string(age_in_days(stamp)) for stamp in df['at']]

    # NOTE(review): '[CLS]'/'[SEP]' are BERT-style markers, while the
    # xlm-roberta tokenizer loaded in this notebook uses '<s>'/'</s>' --
    # confirm these literal markers are intended.
    sentences = [
        '[CLS]' + text + '[SEP]' + age
        for text, age in zip(df['content'], recency)
    ]

    cat_c = 'score'
    if cat_c not in df.columns:
        return sentences

    # pandas reads an integer column with missing values as float; cast back
    # so the labels are integer class ids after the NaN rows were dropped.
    labels = [int(score) - 1 for score in df[cat_c]]
    return sentences, labels

def prepare_data(sentences, labels=None, max_length=50):
    """Tokenize sentences into fixed-length id and attention-mask tensors.

    Uses the module-level ``tokenizer``. Every sentence is padded or
    truncated to exactly ``max_length`` tokens.

    Args:
        sentences: Iterable of strings to encode.
        labels: Optional sequence of labels, converted with ``torch.tensor``.
        max_length: Token length every sentence is padded/truncated to.

    Returns:
        ``(input_ids, attention_masks)`` when ``labels`` is None, otherwise
        ``(input_ids, attention_masks, labels)``. The first two tensors have
        one row per sentence and ``max_length`` columns.
    """
    input_ids = []
    attention_masks = []

    for sent in tqdm(sentences):
        encoded = tokenizer(
            sent,
            # No special tokens are added here: load_data already embeds
            # its own marker strings in the text.
            add_special_tokens=False,
            max_length=max_length,
            # 'padding'/'truncation' replace the deprecated
            # pad_to_max_length flag and make the truncation explicit.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        # The mask distinguishes real tokens from padding.
        attention_masks.append(encoded['attention_mask'])

    if input_ids:
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat rejects an empty list; return correctly shaped empties.
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    if labels is None:
        return input_ids, attention_masks
    return input_ids, attention_masks, torch.tensor(labels)

def load(path='coment_mark/review_train.csv', train_size=0.8, random_state=None):
    """Load the labelled reviews and split them into train/validation sets.

    The result is stored in the module-level globals ``train_sentences``,
    ``val_sentences``, ``train_labels`` and ``val_labels``.

    Args:
        path: CSV file passed to ``load_data``; it must contain a 'score'
            column so that labels are returned.
        train_size: Fraction of rows assigned to the training split.
        random_state: Seed forwarded to ``train_test_split``; pass an int
            for a reproducible split. None keeps the split random.
    """
    global train_sentences, val_sentences, train_labels, val_labels
    sentences, labels = load_data(path)
    train_sentences, val_sentences, train_labels, val_labels = train_test_split(
        sentences,
        labels,
        train_size=train_size,
        random_state=random_state,
    )

def prepare():
    """Tokenize the train and validation splits held in module globals.

    Reads ``train_sentences``/``train_labels`` and
    ``val_sentences``/``val_labels`` and stores the outputs of
    ``prepare_data`` in the ``*_input_ids``, ``*_attention_masks`` and
    ``*_labels`` globals (the label globals are overwritten).
    """
    global train_input_ids, train_attention_masks, train_labels
    global val_input_ids, val_attention_masks, val_labels

    encoded_train = prepare_data(train_sentences, train_labels)
    train_input_ids, train_attention_masks, train_labels = encoded_train

    encoded_val = prepare_data(val_sentences, val_labels)
    val_input_ids, val_attention_masks, val_labels = encoded_val

def load_and_tok_for_pred(path='coment_mark/review_test.csv'):
    """Load and tokenize the prediction set into module globals.

    Stores the tokenized inputs in ``test_input_ids`` and
    ``test_attention_masks``.

    Args:
        path: CSV file passed to ``load_data``.
    """
    global test_input_ids, test_attention_masks
    loaded = load_data(path)
    # load_data returns (sentences, labels) when the CSV has a 'score'
    # column; only the sentences are needed for prediction.
    sentences = loaded[0] if isinstance(loaded, tuple) else loaded
    test_input_ids, test_attention_masks = prepare_data(sentences)


In [None]:
# Fill the train/validation sentence and label globals from the training CSV.
load()
# Tokenize both splits; this replaces the label lists with tensors.
prepare()
# Load and tokenize the test CSV (no labels are passed to prepare_data).
load_and_tok_for_pred()


# Wrap the tensors as datasets. Train/validation items are
# (input_ids, attention_mask, label); test items carry no label.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks)

#### 3. initialize optimizer and *specify functions for bert model* ####

In [None]:
# Use PyTorch's AdamW: the transformers.AdamW class is deprecated in favour
# of torch.optim.AdamW.
optimizer = torch.optim.AdamW(
    bert_model.parameters(),
    lr=2e-5,
    eps=1e-8,
    # transformers.AdamW defaulted to no weight decay, torch defaults to
    # 0.01; pin 0.0 to keep the previous training behaviour.
    weight_decay=0.0,
)

def forward_fn(model, batch):
    """Run the backbone on one batch and return its output object.

    Args:
        model: Callable backbone accepting ``input_ids`` positionally plus
            ``token_type_ids``, ``attention_mask``, ``labels`` and
            ``return_dict`` keyword arguments.
        batch: Sequence ``(input_ids, attention_mask[, labels])``. Batches
            from the prediction dataset have no labels entry.

    Returns:
        Whatever ``model`` returns for the batch.
    """
    input_ids = batch[0]
    input_mask = batch[1]
    # The prediction dataset yields only two tensors per item, so labels
    # are optional; indexing batch[2] unconditionally raised IndexError.
    labels = batch[2] if len(batch) > 2 else None
    return model(
        input_ids,
        token_type_ids=None,
        attention_mask=input_mask,
        labels=labels,
        return_dict=True,
    )

def get_loss_fn(segmenter_object, preds, batch):
    """Return the loss stored on the forward-step output.

    ``segmenter_object`` and ``batch`` are accepted for signature
    compatibility and are not used.
    """
    loss = preds.loss
    return loss

def get_acc_fn(segmenter_object, preds, batch):
    """Compute batch accuracy from the forward-step output.

    Args:
        segmenter_object: Trainer object providing ``task``, ``num_labels``
            and ``num_classes``, which are forwarded to ``accuracy``.
        preds: Forward-step output exposing ``logits``.
        batch: Sequence whose third element holds integer class ids.

    Returns:
        The value returned by ``torchmetrics.functional.accuracy``.
    """
    logits = preds.logits
    # Take the class count from the logits instead of hard-coding 5.
    n_classes = logits.shape[-1]
    # Explicit reshapes keep the batch dimension; squeeze() dropped it for
    # a batch holding a single example.
    logits = logits.reshape(-1, n_classes)
    target = batch[2].reshape(-1)
    # NOTE(review): targets are one class id per example but are scored
    # one-hot under the trainer's configured task -- confirm that this is
    # the intended metric rather than task='multiclass'.
    one_hot_target = F.one_hot(target, num_classes=n_classes)
    return accuracy(
        logits,
        one_hot_target,
        task=segmenter_object.task,
        num_labels=segmenter_object.num_labels,
        num_classes=segmenter_object.num_classes,
    )

#### 4. initialize model with *NeuralNetTrainer* ####

In [6]:
# Wrap the backbone, optimizer, datasets and the three custom step functions
# in the project's NeuralNetTrainer.
model = NeuralNetTrainer(
    backbone=bert_model,
    optimizer=optimizer,
    train_torch_dataset=train_dataset,
    val_torch_dataset=val_dataset,
    # Test items hold only input ids and attention masks (no labels).
    pred_torch_dataset=test_dataset,
    # NOTE(review): labels are a single integer class id per example --
    # confirm 'multilabel' (rather than 'multiclass') is intended here.
    task='multilabel',
    num_labels=5,
    specify_forward_step=forward_fn,
    specify_get_loss=get_loss_fn,
    specify_get_accuracy=get_acc_fn,
    batch_size=128
)

If you specify a custom forward step (`specify_forward_step`), you should generally also specify `specify_get_loss` and `specify_get_accuracy`.


In [None]:
# Set float32 matmul precision to 'medium' before training starts.
torch.set_float32_matmul_precision('medium')

# Train on GPU for at least one and at most two epochs.
trainer = pl.Trainer(accelerator='gpu', min_epochs=1, max_epochs=2)

trainer.fit(model)