In [1]:
import os
# Order CUDA devices by PCI bus id and expose only device 0 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Run configuration.
SEED = 42
BERT_MODEL = 'bert-base-german-cased'
BATCH_SIZE = 8
# Built from the constants above so the output directory name cannot drift away
# from the model / batch size that are actually used.
MODEL_DIR = f'deu_lm={BERT_MODEL}_batchsize={BATCH_SIZE}_lr=3e-5'

In [2]:
import torch 
# Use the first visible GPU when CUDA is available; otherwise fall back to the
# CPU so the notebook can still be executed on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
import pandas as pd
from datasets import Dataset

def read_df_custom(file):
    """Read selected columns of a tab-separated ``.rels`` file into a DataFrame.

    The first line of the file is skipped (it is treated as the header row).
    Column positions come from the fixed ``header`` layout below, not from the
    file itself. Every value is kept as the string found in the file.

    Args:
        file: Path of the tab-separated file to read.

    Returns:
        pandas.DataFrame with one row per non-empty data line and exactly the
        columns listed in ``extracted_columns``, in that order.

    Raises:
        IndexError: If a data line has fewer tab-separated fields than the
            position of a requested column.
    """
    header = 'doc     unit1_toks      unit2_toks      unit1_txt       unit2_txt       s1_toks s2_toks unit1_sent      unit2_sent      dir     nuc_children    sat_children    genre   u1_discontinuous        u2_discontinuous       u1_issent        u2_issent       u1_length       u2_length       length_ratio    u1_speaker      u2_speaker      same_speaker    u1_func u1_pos  u1_depdir       u2_func u2_pos  u2_depdir       doclen  u1_position      u2_position     percent_distance        distance        lex_overlap_words       lex_overlap_length      unit1_case      unit2_case      label'
    extracted_columns = ['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label', 'distance', 'u1_depdir', 'u2_depdir', 'u2_func', 'u1_position', 'u2_position', 'sat_children', 'nuc_children', 'genre', 'unit1_case', 'unit2_case',
                            'u1_discontinuous', 'u2_discontinuous', 'same_speaker', 'lex_overlap_length', 'u1_func']
    header = header.split()
    # Resolve each wanted column's position once instead of once per row.
    column_positions = [(column, header.index(column)) for column in extracted_columns]

    rows = []
    # The context manager guarantees the handle is closed; the encoding is
    # explicit so non-ASCII text does not depend on the machine's locale.
    with open(file, 'r', encoding='utf-8') as handle:
        next(handle, None)  # skip the header row
        for line in handle:
            # Strip only the newline: slicing off the last character would
            # truncate the final field of a last line that has no newline.
            line = line.rstrip('\n')
            if not line:
                continue  # a completely empty line is not a record
            fields = line.split('\t')
            rows.append({column: fields[position] for column, position in column_positions})

    # Build the frame in one step; `columns` fixes the order and also yields
    # the expected (empty) columns when the file has no data rows.
    return pd.DataFrame(rows, columns=extracted_columns)

# Build one Dataset per split from the corresponding enriched .rels file.
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_pandas(read_df_custom(split_path))
    for split_path in (
        '../../processed/deu.rst.pcc_train_enriched.rels',
        '../../processed/deu.rst.pcc_test_enriched.rels',
        '../../processed/deu.rst.pcc_dev_enriched.rels',
    )
)

# Cell output: number of examples in the train, test and validation splits.
tuple(len(split) for split in (train_dataset, test_dataset, valid_dataset))

(2164, 260, 241)

In [4]:
from transformers import AutoTokenizer, BertTokenizer
from datasets import ClassLabel

# Collect every label that occurs in any split. The names are sorted so that
# the label -> integer id mapping is the same on every run: iterating a plain
# set of strings can yield a different order in each Python process, which
# would silently change which id stands for which label.
labels = ClassLabel(names=sorted(set(train_dataset['label'])|set(test_dataset['label'])|set(valid_dataset['label'])))

# Tokenizer loaded for the checkpoint named by BERT_MODEL.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
class SNLIDataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized ``unit1_txt`` / ``unit2_txt`` pairs.

    All pairs are encoded up front with the module-level ``tokenizer``
    (padded / truncated to 512 tokens) and the label column is converted to
    integer ids through ``labels.str2int``.
    """

    def __init__(self, dataset, labels, raw_text=False):
        """Tokenize every pair in ``dataset`` and encode its labels.

        Args:
            dataset: Object indexable by the column names 'unit1_txt',
                'unit2_txt' and 'label'.
            labels: Object whose ``str2int`` turns the label column into ids.
            raw_text: When true, the untokenized pairs and the original label
                values are kept as well and returned by ``__getitem__``.
        """
        self.raw_text_flag = raw_text
        pairs = list(zip(dataset['unit1_txt'], dataset['unit2_txt']))
        self.text = [
            tokenizer.encode_plus(first, second, padding="max_length", truncation=True, max_length=512)
            for first, second in pairs
        ]
        self.raw_text = [[first, second] for first, second in pairs] if raw_text else []
        self.labels = labels.str2int(dataset['label'])
        self.raw_label = dataset['label'] if raw_text else []
        print('read ' + str(len(self.text)) + ' examples')

    def __getitem__(self, idx):
        """Return the encoded example at ``idx`` as a dict.

        The dict always holds 'input_ids', 'token_type_ids', 'attention_mask'
        and 'label'; 'raw_text' and 'raw_label' are added when the dataset was
        created with ``raw_text=True``.
        """
        encoding = self.text[idx]
        item = {key: encoding[key] for key in ('input_ids', 'token_type_ids', 'attention_mask')}
        # Keys are inserted in the same order as before: raw_text precedes
        # label, raw_label follows it.
        if self.raw_text_flag:
            item['raw_text'] = self.raw_text[idx]
        item['label'] = self.labels[idx]
        if self.raw_text_flag:
            item['raw_label'] = self.raw_label[idx]
        return item

    def __len__(self):
        """Number of encoded examples."""
        return len(self.text)


def load_data_snli(batch_size, labels):
    """Wrap the train / validation / test splits in ``SNLIDataset`` objects.

    Reads the module-level ``train_dataset``, ``valid_dataset`` and
    ``test_dataset``. Nothing is downloaded and no batching happens here.

    Args:
        batch_size: Not used. Kept only so existing calls keep working; the
            DataLoaders that used to be built from it were never returned.
        labels: Passed on to ``SNLIDataset`` to convert label values to ids.

    Returns:
        Tuple ``(train_set, valid_set, test_set)`` of ``SNLIDataset``
        instances, in that order.
    """
    train_set = SNLIDataset(train_dataset, labels, raw_text=False)
    valid_set = SNLIDataset(valid_dataset, labels, raw_text=False)
    test_set = SNLIDataset(test_dataset, labels, raw_text=False)
    return train_set, valid_set, test_set

# NOTE: despite the *_iter names, load_data_snli returns the three dataset
# objects themselves (train, validation, test), not DataLoader iterators.
train_iter, valid_iter, test_iter = load_data_snli(BATCH_SIZE, labels)

read 2164 examples
read 241 examples
read 260 examples


In [5]:
from transformers import AutoModelForSequenceClassification
from transformers.optimization import Adafactor, AdafactorSchedule, AdamW
from torch import optim

# Seed the random number generators before the model is created: the load-time
# message reports weights that are newly initialized rather than read from the
# checkpoint, so without a seed their starting values differ from run to run.
from transformers import set_seed
set_seed(SEED)

# One output unit per known label; reuse `labels` instead of recomputing the
# union of the three label columns.
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(labels.names))

# NOTE(review): with relative_step=True and lr=None Adafactor presumably derives
# its own step size, in which case the learning_rate given to TrainingArguments
# would have no effect — confirm against the Adafactor documentation.
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose last axis is arg-maxed to obtain the
            predicted label).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 states explicitly that a class with no predicted (or no
    # true) samples scores 0, instead of relying on the default that produces
    # the same value but emits a warning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# import evaluate
# metric = evaluate.load("accuracy")

In [7]:
from copy import deepcopy
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    """Callback that additionally evaluates on the training set after an epoch.

    The metrics of that extra pass are reported under the "train" prefix.
    """

    def __init__(self, trainer) -> None:
        """Keep a reference to the trainer whose ``evaluate`` is called from the hook."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on ``trainer.train_dataset`` when an evaluation is scheduled.

        Returns:
            A copy of ``control`` taken before the extra evaluation, or
            ``None`` when ``control.should_evaluate`` is false.
        """
        if not control.should_evaluate:
            return None
        # Snapshot first: the copy must reflect the state before evaluate()
        # runs (presumably because evaluate() can alter the control flags —
        # TODO confirm).
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

In [8]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate after every epoch, keep at most one checkpoint.
# NOTE(review): a custom optimizer/scheduler pair is passed to the Trainer, so
# learning_rate / weight_decay here presumably do not reach the optimizer — confirm.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=40,
    save_total_limit=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=1,
    # Use the notebook-wide seed instead of leaving SEED unused.
    seed=SEED,
    # Must name a key returned by compute_metrics; that function reports
    # 'accuracy' (there is no 'acc' key).
    metric_for_best_model='accuracy',
)

In [9]:
from transformers import EarlyStoppingCallback

# train_iter / valid_iter hold the dataset objects returned by load_data_snli.
# The optimizer and scheduler created earlier are handed over as a pair.
# EarlyStoppingCallback is imported above but currently not registered.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_iter,
    eval_dataset=valid_iter,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

# Added after construction because the callback needs the trainer instance itself.
trainer.add_callback(CustomCallback(trainer))

In [10]:
# Run training with the configuration in training_args; the returned
# TrainOutput is shown as the cell result.
trainer.train() 

***** Running training *****
  Num examples = 2164
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10840
  Number of trainable parameters = 109101338
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33merzaliator[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.8472,2.835521,0.207024,0.081046,0.097023,0.111254
1,2.8472,2.920946,0.157676,0.064908,0.064004,0.096294
2,2.4686,2.263037,0.436229,0.274088,0.327715,0.293852
2,2.4686,2.526372,0.298755,0.192091,0.188098,0.230382
3,1.4873,1.635161,0.604436,0.397624,0.463148,0.417277
3,1.4873,2.269877,0.344398,0.242981,0.246436,0.264635
4,1.6925,1.03449,0.749076,0.520569,0.618257,0.535163
4,1.6925,2.269451,0.356846,0.233081,0.239918,0.277124
5,0.798,0.620987,0.850739,0.637651,0.701333,0.639199
5,0.798,2.50044,0.352697,0.263061,0.259637,0.299743


***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500
Configuration saved in deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500/config.json
Model weights saved in deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_pr

TrainOutput(global_step=10840, training_loss=0.9437711105086347, metrics={'train_runtime': 4890.7022, 'train_samples_per_second': 17.699, 'train_steps_per_second': 2.216, 'total_flos': 2.277980062973952e+16, 'train_loss': 0.9437711105086347, 'epoch': 40.0})

In [11]:
# Evaluate on the test split (test_iter is the test dataset object).
# NOTE(review): no best-checkpoint reloading is configured, so this presumably
# scores the weights as they are after the last epoch — confirm.
trainer.evaluate(test_iter)

***** Running Evaluation *****
  Num examples = 260
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 5.641300201416016,
 'eval_accuracy': 0.12307692307692308,
 'eval_f1': 0.09876033419792386,
 'eval_precision': 0.12407411488293842,
 'eval_recall': 0.11849480333643228,
 'eval_runtime': 2.8994,
 'eval_samples_per_second': 89.673,
 'eval_steps_per_second': 11.382,
 'epoch': 40.0}