## Load data

In [36]:
!pip install transformers datasets



In [37]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel,\
                            BertModel, BertTokenizer, AutoModelForQuestionAnswering, pipeline,\
                            Trainer, TrainingArguments, BertForSequenceClassification, RobertaTokenizer,\
                            TFRobertaForQuestionAnswering

import tensorflow as tf

from datasets import list_metrics, load_metric

import torch
import torch.nn as nn

import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

In [38]:
## Load and pre-process data to try a simple 2-class classification problem first:
## either the description leads to a positive or a negative price estimation error
df = pd.read_csv('drive/MyDrive/data/EU_motors_data/pl/PL_data_10k.csv')

In [39]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,price_eval_sk,ad_id,train,truncated_description,truncated_desc_length,prediction,error,pe,ape,discretized_pe,simple_class
0,6078637720|PL|otomoto,6078637720,False,Do sprzedania Opla Vectra C GTS OPC Line z rok...,89,15084.661133,-2415.338867,-13.801936,13.801936,-5.0,0
1,6084599422|PL|otomoto,6084599422,False,Więcej zdjęć na OTOMOTO \r\n-świeżo sprowadzon...,44,22104.378906,4110.378906,22.843053,22.843053,5.0,1
2,6084185721|PL|otomoto,6084185721,False,SKODA FABIA KOMBI\r\nROK PRODUKCJI 2008 model ...,171,13786.808594,-913.191406,-6.212186,6.212186,-3.0,0
3,6081681904|PL|otomoto,6081681904,False,Citroen C3 Picasso 1.6 Vti 120KM \r\n\r\nSamoc...,59,32854.804688,-145.195312,-0.439986,0.439986,-0.0,0
4,6082099343|PL|otomoto,6082099343,False,"Witam,\r\nna sprzedaż Ford Transit Connect 5 o...",88,47534.355469,-7815.644531,-14.120406,14.120406,-5.0,0


In [40]:
# Share of each `simple_class` value among all rows of df (class balance check)
df['simple_class'].value_counts() / len(df)

1    0.6269
0    0.3731
Name: simple_class, dtype: float64

In [41]:
# Checkpoint name shared by the tokenizer and the model loader
model_name = 'allegro/herbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2: one output per `simple_class` value (0 / 1)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [42]:
# Get maximum possible length: report `max_position_embeddings` from the model config
print(f"Maximum number of tokens: {model.config.max_position_embeddings}\n")


Maximum number of tokens: 514



In [43]:
# THERE MIGHT BE MORE PRE-PROCESSING NEEDED, AS THE MAX TOKENIZED LENGTH IS > 1500 TOKENS, WHILE I SELECTED ONLY THE FIRST 512 WORDS IN THE PRE-PROCESSING STEP
# # Get the number of tokens in the text
# df['tokenized_length'] = df['truncated_description'].apply(lambda row: len(tokenizer(row)['input_ids']))

# print(df['tokenized_length'].describe())
# df.tokenized_length.hist()
# df.tokenized_length.hist()

In [44]:
## Subsample data (tokenizing 1M ads takes about 30 minutes)
# `num_ads_in_sample` was previously defined only in a commented-out line, so
# this cell raised NameError on a fresh kernel. Default to the whole frame;
# set a smaller number (e.g. 50000) to iterate faster on a larger input file.
num_ads = len(df)
num_ads_in_sample = num_ads
#num_ads_in_sample = 50000

# Seeded so that the same rows are drawn on every run. The sample size is capped
# at the frame size because sampling without replacement cannot exceed it.
# dropna returns a new frame instead of mutating the sample in place.
df_sample = (
    df.sample(min(num_ads_in_sample, num_ads), random_state=42)
      .dropna(subset=['truncated_description', 'simple_class'])
)

## Split into train (70%), validation (20%) and test (10%)
# Positional slices of the shuffled frame; this avoids np.split on a DataFrame,
# which relies on a deprecated pandas code path.
shuffled_sample = df_sample.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled_sample))
valid_end = int(.9 * len(shuffled_sample))
train = shuffled_sample.iloc[:train_end]
valid = shuffled_sample.iloc[train_end:valid_end]
test = shuffled_sample.iloc[valid_end:]

In [45]:
## Pull the description texts and the class labels out of each split as plain lists
train_descs = train['truncated_description'].tolist()
train_labels = train['simple_class'].tolist()

valid_descs = valid['truncated_description'].tolist()
valid_labels = valid['simple_class'].tolist()

test_descs = test['truncated_description'].tolist()
test_labels = test['simple_class'].tolist()

## Tokenize every split with the same settings: truncate and pad to MAX_LENGTH tokens
MAX_LENGTH = 64
_tokenize_opts = dict(truncation=True, padding='max_length', max_length=MAX_LENGTH)

train_encodings = tokenizer(train_descs, **_tokenize_opts)
val_encodings = tokenizer(valid_descs, **_tokenize_opts)
test_encodings = tokenizer(test_descs, **_tokenize_opts)

In [46]:
# Sanity check: number of missing labels in the test split
test['simple_class'].isna().sum()

0

In [47]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values (e.g. lists of lists under 'input_ids', 'attention_mask',
            'token_type_ids').
        labels: One integer class id per example, in the same order as the
            encodings. Stored once as a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # self.labels is already a tensor: wrapping it in torch.tensor() again
        # emits a UserWarning on every access, so copy it the recommended way.
        item['labels'] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)


In [48]:
# Wrap each split's encodings and labels in a ClassificationDataset
train_dataset, val_dataset, test_dataset = (
    ClassificationDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

In [49]:
#BertForSequenceClassification.forward?

## Fine-tuning the model
Now that we have our datasets ready, we can move on to training. For this we'll use the Transformers `Trainer` class. `Trainer` is a training and evaluation loop for PyTorch, optimized for Transformers models. (There is also `TFTrainer` for TensorFlow, but we won't go into that.) We could also have used native PyTorch training, but the `Trainer` class streamlines the training process and provides useful abstractions for quickly customizing the training, so that is what we'll go through. For an example of native PyTorch fine-tuning, you can go through this notebook here.


## Training Arguments



In [None]:
# Hyper-parameters for the Trainer run.
# NOTE(review): the training log saved in this notebook ends at epoch 2.0, so it
# appears to come from an earlier run with fewer epochs than configured here.
training_args = TrainingArguments(
    output_dir="./results",                    # checkpoints are written here
    evaluation_strategy="epoch",               # evaluate after every epoch
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # evaluation; without this, releases that validate it raise a ValueError.
    save_strategy="epoch",
    logging_steps=100,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=5.9e-05,
    max_grad_norm=1.0,                         # gradient clipping threshold
    lr_scheduler_type='cosine_with_restarts',
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='f1',                # the "f1" key returned by compute_metrics
    seed=16
)

In [51]:
# One metric object per score reported during evaluation
accuracy, f1, precision, recall = (
    load_metric(metric_name)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall')
)

In [52]:
def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy, F1, recall and precision.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where predictions are the
            per-class logit values and labels are the reference classes.

    Returns:
        dict with the keys 'accuracy', 'f1', 'recall' and 'precision'.
    """
    logits, references = eval_pred

    # The predicted class is the index of the largest logit in each row
    predicted_classes = np.argmax(logits, axis=1)

    # Each metric's compute() returns a dict keyed by the metric's own name
    scores = {
        metric_name: metric.compute(predictions=predicted_classes, references=references)[metric_name]
        for metric_name, metric in (
            ('accuracy', accuracy),
            ('f1', f1),
            ('precision', precision),
            ('recall', recall),
        )
    }

    return {metric_name: scores[metric_name]
            for metric_name in ('accuracy', 'f1', 'recall', 'precision')}

In [53]:
# Number of sampled examples in each class
n_0 = df_sample['simple_class'].eq(0).sum()
n_1 = df_sample['simple_class'].eq(1).sum()

# Class 0 keeps weight 1.0; class 1 is scaled by the square root of the
# class-count ratio n_0 / n_1
class_1_weight = np.sqrt(n_0 / n_1)
weights = [1.0, class_1_weight]
print(weights)

[1.0, 0.7714601361995932]


In [54]:
# We define our custom loss function
# Class weights for the cross-entropy loss. They are placed on the GPU when one
# is available and on the CPU otherwise: an unconditional .cuda() raises on a
# CPU-only runtime.
w = torch.tensor(weights, dtype=torch.float,
                 device='cuda' if torch.cuda.is_available() else 'cpu')
loss_fct = nn.CrossEntropyLoss(weight=w)

class MyTrainer(Trainer):
    """Trainer whose loss is the class-weighted cross-entropy defined by `loss_fct`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on `inputs` and score its logits with `loss_fct`.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict passed to the model; must contain "labels".
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword do not fail; it is not used here.

        Returns:
            The weighted loss, or ``(loss, outputs)`` when `return_outputs` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs['logits']

        # This is the only part that we change: weighted cross-entropy.
        # The loss module follows the logits' device so the weights always
        # match wherever the Trainer placed the model.
        loss = loss_fct.to(logits.device)(logits, labels)

        if return_outputs:
            outputs.loss = loss

        return (loss, outputs) if return_outputs else loss

In [55]:
# Wire the model, arguments, datasets, metrics and tokenizer into the weighted-loss trainer
trainer = MyTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [56]:
# Sanity check: dtype of the label tensor stored on the training dataset
train_dataset.labels.dtype

torch.int64

In [57]:
# Run fine-tuning with the trainer configured above
trainer.train()


  # This is added back by InteractiveShellApp.init_path()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Runtime,Samples Per Second
1,0.6844,0.670464,0.6335,0.774253,0.988985,0.636134,7.0583,283.353
2,0.683,0.669902,0.5965,0.718128,0.808812,0.645729,7.4616,268.04


  # This is added back by InteractiveShellApp.init_path()


TrainOutput(global_step=220, training_loss=0.6833536234768954, metrics={'train_runtime': 193.593, 'train_samples_per_second': 1.136, 'total_flos': 669013191168000.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 497793024, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -129097728, 'train_mem_gpu_alloc_delta': 2082668032, 'train_mem_cpu_peaked_delta': 129544192, 'train_mem_gpu_peaked_delta': 2546023424})

In [58]:
# Evaluate the trainer's model on the test split
trainer.evaluate(test_dataset)


  # This is added back by InteractiveShellApp.init_path()


{'epoch': 2.0,
 'eval_accuracy': 0.603,
 'eval_f1': 0.7517198248905566,
 'eval_loss': 0.6816092133522034,
 'eval_mem_cpu_alloc_delta': 143360,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 214030848,
 'eval_precision': 0.6034136546184738,
 'eval_recall': 0.9966832504145937,
 'eval_runtime': 3.6452,
 'eval_samples_per_second': 274.333}