In [1]:
import torch
import pandas as pd
from torch import nn
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, TrainingArguments, Trainer, BertForSequenceClassification

from sklearn.model_selection import train_test_split

In [2]:
# Show up to 300 characters per cell so long text excerpts stay readable in
# DataFrame previews. The fully-qualified option name is used because the
# shorthand form is resolved by pattern matching and can become ambiguous.
pd.set_option('display.max_colwidth', 300)

In [3]:
# Global configuration shared by the cells below.
SEED = 0          # seed for the data split, dataset shuffling and torch RNG
BATCH_SIZE = 30   # per-device batch size for both training and evaluation
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch before the model is created: the classification head is randomly
# initialised when the checkpoint is loaded, so without this each kernel
# restart starts from different head weights.
torch.manual_seed(SEED)

### Choosing the model and the tokenizer

In [4]:
# Checkpoint shared by the tokenizer and the model.
model_name = 'bert-base-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# num_labels=1 gives the head a single output value per input; the custom
# trainer further down fits that value to the target with an RMSE loss.
bert_model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    ignore_mismatched_sizes=True,
).to(DEVICE)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

### Reading and preprocessing the dataset

In [5]:
# Load only the two columns that are used, selecting them by name so the
# rename below cannot silently miss if the column order in the CSV changes.
full_df = (
    pd.read_csv("../../../data/input/train.csv.zip", usecols=["excerpt", "target"])
    .rename(columns={'excerpt': 'text', 'target': 'labels'})
)

# Hold out 10% for evaluation. The continuous target is cut into 5 equal-width
# bins purely to stratify the split, so both parts cover the target range.
train_df, eval_df = train_test_split(
    full_df,
    test_size=0.1,
    stratify=pd.cut(full_df["labels"], 5),
    random_state=SEED,
)
train_df.head()

Unnamed: 0,text,labels
1367,Illustrious Sir: I have the honor to hand to your Royal Highness the letter by which his Majesty the Emperor of Russia has deigned to accredit me by his Majesty the King of Serbia.\nMy august master has charged me to express to you the vivid sympathy and the sincere admiration which his Majesty ...,-2.634137
2399,"By looking at any map of Europe, it will be seen that England is separated from France by the English Channel, a passage which, though it looks quite narrow on the map, is really very wide, especially toward the west. The narrowest place is between Dover and Calais, where the distance across is ...",-1.188881
2616,"When the ambassadors had returned to Rome the Senate commanded that there should be levied two armies; and that Minucius the Consul should march with the one against the Æquians on Mount Ægidus, and that the other should hinder the enemy from their plundering. This levying the tribunes of the Co...",-3.218972
1671,"Having just made the trip from Salt Lake City to this place on the Denver & Rio Grande line, I cannot write you on any other subject at present. There is not in the world a railroad journey of thirty hours so filled with grand and beautiful views. I should perhaps qualify this statement by deduc...",-1.215504
1497,"It was Saturday evening; the sun was setting, the workpeople were coming in crowds from the factory to the station, and they bowed to the carriage in which Korolyov was driving. And he was charmed with the evening, the farmhouses and villas on the road, and the birch-trees, and the quiet atmosph...",-1.212478


### Converting the dataset into transformer-friendly format

In [7]:
def create_dataset(train, evaluation):
    """Wrap the train and evaluation DataFrames into a single DatasetDict.

    Args:
        train: DataFrame holding the training rows.
        evaluation: DataFrame holding the evaluation rows.

    Returns:
        A DatasetDict with the keys 'train' and 'eval', each built from the
        column-wise dict representation of the corresponding DataFrame.
    """
    frames = {"train": train, "eval": evaluation}
    return DatasetDict({
        split_name: Dataset.from_dict(frame.to_dict(orient='list'), split=split_name)
        for split_name, frame in frames.items()
    })

dataset = create_dataset(train_df, eval_df)

### Preparing the dataset for feeding into the model

In [6]:
# Every sequence is padded or truncated to this many tokens.
MAX_SEQ_LENGTH = 300


def tokenize(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: a batch from `Dataset.map(batched=True)`; its "text" entry
            is the list of strings to encode.

    Returns:
        The tokenizer output (including the attention mask), padded/truncated
        to MAX_SEQ_LENGTH. Plain Python lists are returned rather than torch
        tensors: `map` stores the result in Arrow anyway, and the torch
        conversion is done by `set_format` below.
    """
    return tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_attention_mask=True,
    )

tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=64, remove_columns="text")
# Return input_ids / attention_mask as torch tensors; the remaining columns
# (e.g. labels) are still emitted because output_all_columns=True.
tokenized_dataset.set_format("torch", columns=['input_ids', 'attention_mask'], output_all_columns=True)

### Setting all training params

In [11]:
# Hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="trainer",
    evaluation_strategy="epoch",  # evaluate once at the end of every epoch
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3.0,
    learning_rate=1e-5,
    optim="adamw_torch",
    # Use the notebook-wide seed instead of the Trainer's own default, so
    # every source of randomness is controlled by the single SEED constant.
    seed=SEED,
)

### Calculating the RMSE metric

In [10]:
def compute_metrics(pred):
    """Compute the root-mean-squared error for an evaluation run.

    Args:
        pred: object exposing `label_ids` and `predictions` as NumPy arrays
            (the Trainer passes an EvalPrediction here).

    Returns:
        dict with a single key 'rmse' holding a plain Python float, so the
        value can be logged and serialised without tensor handling.
    """
    # Flatten both arrays to 1-D. A bare squeeze() would leave labels of shape
    # (N, 1) against predictions of shape (N,), and the subtraction would then
    # silently broadcast to an (N, N) matrix.
    labels = torch.from_numpy(pred.label_ids).reshape(-1)
    preds = torch.from_numpy(pred.predictions).reshape(-1)
    mse = torch.mean((preds - labels) ** 2)
    rmse = torch.sqrt(mse)

    return {
        'rmse': rmse.item(),
    }

### Creating custom trainer
A custom `Trainer` subclass is needed so that the training loss is computed as RMSE.

In [9]:
class BertRegressorTrainer(Trainer):
    """Trainer that optimises root-mean-squared error on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the RMSE between the model's logits and the batch labels.

        Args:
            model: the model being trained; called as `model(**inputs)`.
            inputs: batch dict; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: accepted (and ignored) so the override stays
                compatible with Trainer versions that pass this argument.

        Returns:
            The scalar loss tensor, or `(loss, outputs)` if `return_outputs`.
        """
        labels = inputs["labels"]
        outputs = model(**inputs)
        # Flatten explicitly instead of a bare squeeze(): with a batch of one
        # example squeeze() also drops the batch dimension, leaving a 0-d
        # prediction against 1-d labels.
        predictions = outputs["logits"].reshape(-1)
        loss = torch.sqrt(nn.functional.mse_loss(predictions, labels.reshape(-1)))

        return (loss, outputs) if return_outputs else loss

In [12]:
# Shuffle both splits with the notebook seed before handing them to the trainer.
train_split = tokenized_dataset['train'].shuffle(seed=SEED)
eval_split = tokenized_dataset['eval'].shuffle(seed=SEED)

# Wire model, hyper-parameters, data and the RMSE metric together.
trainer = BertRegressorTrainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

In [13]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
# NOTE(review): the saved log below reports "Num Epochs = 5", while
# training_args sets num_train_epochs=3.0 — the output looks stale; re-run
# the notebook from a fresh kernel to refresh it.
trainer.train()

***** Running training *****
  Num examples = 2550
  Num Epochs = 5
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 30
  Gradient Accumulation steps = 1
  Total optimization steps = 425
  Number of trainable parameters = 108311041


Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.680478,0.687693
2,No log,0.60967,0.614483
3,No log,0.547744,0.551593
4,No log,0.590452,0.594233
5,No log,0.610101,0.613585


***** Running Evaluation *****
  Num examples = 284
  Batch size = 30
***** Running Evaluation *****
  Num examples = 284
  Batch size = 30
***** Running Evaluation *****
  Num examples = 284
  Batch size = 30
***** Running Evaluation *****
  Num examples = 284
  Batch size = 30
***** Running Evaluation *****
  Num examples = 284
  Batch size = 30


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=425, training_loss=0.5953230195886948, metrics={'train_runtime': 386.4163, 'train_samples_per_second': 32.996, 'train_steps_per_second': 1.1, 'total_flos': 1965606934950000.0, 'train_loss': 0.5953230195886948, 'epoch': 5.0})

In [14]:
# Save the fine-tuned model through the trainer.
# NOTE(review): "/model/" is an absolute path at the filesystem root, unlike
# the relative data path used for loading — confirm this location is intended.
trainer.save_model("/model/")

In [15]:
# Save the tokenizer files so they can be reloaded together with the model.
# NOTE(review): "/tokenizer/" is an absolute path at the filesystem root —
# confirm this location is intended.
tokenizer.save_pretrained("/tokenizer/")