In [10]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
SEED = 42
MODEL_DIR = 'deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5'
BERT_MODEL = 'bert-base-german-cased'
BATCH_SIZE = 8

In [11]:
import torch 
device = torch.device('cuda:0')

In [12]:
import pandas as pd
from datasets import Dataset

def read_df_custom(file):
    header = 'doc     unit1_toks      unit2_toks      unit1_txt       unit2_txt       s1_toks s2_toks unit1_sent      unit2_sent      dir     nuc_children    sat_children    genre   u1_discontinuous        u2_discontinuous       u1_issent        u2_issent       u1_length       u2_length       length_ratio    u1_speaker      u2_speaker      same_speaker    u1_func u1_pos  u1_depdir       u2_func u2_pos  u2_depdir       doclen  u1_position      u2_position     percent_distance        distance        lex_overlap_words       lex_overlap_length      unit1_case      unit2_case      label'
    extracted_columns = ['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label', 'distance', 'u1_depdir', 'u2_depdir', 'u2_func', 'u1_position', 'u2_position', 'sat_children', 'nuc_children', 'genre', 'unit1_case', 'unit2_case',
                            'u1_discontinuous', 'u2_discontinuous', 'same_speaker', 'lex_overlap_length', 'u1_func']
    header = header.split()
    df = pd.DataFrame(columns=extracted_columns)
    file = open(file, 'r')

    rows = []
    count = 0 
    for line in file:
        line = line[:-1].split('\t')
        count+=1
        if count ==1: continue
        row = {}
        for column in extracted_columns:
            index = header.index(column)
            row[column] = line[index]
        rows.append(row)

    df = pd.concat([df, pd.DataFrame.from_records(rows)])
    return df

train_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_train_enriched.rels'))
test_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_test_enriched.rels'))
valid_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_dev_enriched.rels'))

len(train_dataset), len(test_dataset), len(valid_dataset)

(2164, 260, 241)

In [13]:
from transformers import AutoTokenizer, BertTokenizer
from datasets import ClassLabel

labels = ClassLabel(names=list(set(train_dataset['label'])|set(test_dataset['label'])|set(valid_dataset['label'])))

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
class SNLIDataset(torch.utils.data.Dataset):
    """A customized dataset to load the SNLI dataset."""
    def __init__(self, dataset, labels, raw_text=False):
        self.text = []
        self.raw_text = []
        self.raw_label = []
        self.raw_text_flag = raw_text
        for premise, hypothesis in zip(dataset['unit1_txt'], dataset['unit2_txt']):
            self.text.append(tokenizer.encode_plus(premise, hypothesis, padding="max_length", truncation=True, max_length=512))
            if raw_text: self.raw_text.append([premise, hypothesis])
        # self.labels = torch.tensor(labels.str2int(dataset['label'])).to(device)
        self.labels = labels.str2int(dataset['label'])
        if raw_text: self.raw_label = dataset['label']
        print('read ' + str(len(self.text)) + ' examples')

    def __getitem__(self, idx):
        if self.raw_text_flag:  
            return {'input_ids':self.text[idx]['input_ids'], 
                'token_type_ids':self.text[idx]['token_type_ids'], 
                'attention_mask':self.text[idx]['attention_mask'], 
                'raw_text': self.raw_text[idx],
                'label':self.labels[idx],
                'raw_label': self.raw_label[idx]}

        return {'input_ids':self.text[idx]['input_ids'], 
                'token_type_ids':self.text[idx]['token_type_ids'], 
                'attention_mask':self.text[idx]['attention_mask'], 
                'label':self.labels[idx]}

    def __len__(self):
        return len(self.text)


def load_data_snli(batch_size, labels):
    """Download the SNLI dataset and return data iterators and vocabulary."""
    train_data = train_dataset
    valid_data = valid_dataset
    test_data = test_dataset
    train_set = SNLIDataset(train_data, labels, raw_text=False)
    valid_set = SNLIDataset(valid_data, labels, raw_text=False)
    test_set = SNLIDataset(test_data, labels, raw_text=False)
    train_iter = torch.utils.data.DataLoader(train_set, batch_size,
                                             shuffle=True)
    valid_iter = torch.utils.data.DataLoader(valid_set, batch_size,
                                            shuffle=False)
    test_iter = torch.utils.data.DataLoader(test_set, batch_size,
                                            shuffle=False)
    
    return train_set, valid_set, test_set

train_iter, valid_iter, test_iter = load_data_snli(BATCH_SIZE, labels)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/vocab.txt
loading file tokenizer.json from ca

read 2164 examples
read 241 examples
read 260 examples


In [14]:
from transformers import AutoModelForSequenceClassification
from transformers.optimization import Adafactor, AdafactorSchedule, AdamW
from torch import optim

model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(list(set(train_dataset['label'])|set(test_dataset['label'])|set(valid_dataset['label']))))
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LAB

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# import evaluate
# metric = evaluate.load("accuracy")

In [16]:
from copy import deepcopy
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [17]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=MODEL_DIR, 
                                evaluation_strategy="epoch",
                                per_device_train_batch_size =BATCH_SIZE,
                                per_device_eval_batch_size=BATCH_SIZE,
                                num_train_epochs=40,
                                save_total_limit=1,
                                learning_rate=3e-5,
                                weight_decay=0.01,
                                logging_steps=1,
                                metric_for_best_model = 'acc')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_iter,
    eval_dataset=valid_iter,
    compute_metrics=compute_metrics,
    optimizers=[optimizer, lr_scheduler],
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=12)]
)

trainer.add_callback(CustomCallback(trainer)) 

In [19]:
trainer.train() 

***** Running training *****
  Num examples = 2164
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10840
  Number of trainable parameters = 109101338
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33merzaliator[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.8009,2.836465,0.172828,0.058731,0.109265,0.088633
1,2.8009,2.932996,0.141079,0.033474,0.063283,0.070627
2,2.1337,2.292809,0.389556,0.241913,0.316057,0.259969
2,2.1337,2.512104,0.294606,0.18047,0.243859,0.215568
3,1.2994,1.685368,0.599353,0.38406,0.417959,0.407773
3,1.2994,2.274735,0.33195,0.213372,0.200595,0.252482
4,1.3751,1.150985,0.670055,0.463152,0.612414,0.476467
4,1.3751,2.379601,0.319502,0.210836,0.224512,0.256454
5,0.8639,0.611207,0.854436,0.676191,0.78742,0.665991
5,0.8639,2.400862,0.352697,0.273228,0.288236,0.306044


***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500
Configuration saved in deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500/config.json
Model weights saved in deu_lm=bert-base-german-cased_batchsize=8_lr=3e-5/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 2164
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_pr

TrainOutput(global_step=10840, training_loss=0.9504544961719638, metrics={'train_runtime': 9859.7808, 'train_samples_per_second': 8.779, 'train_steps_per_second': 1.099, 'total_flos': 2.277980062973952e+16, 'train_loss': 0.9504544961719638, 'epoch': 40.0})

In [20]:
trainer.evaluate(test_iter)

***** Running Evaluation *****
  Num examples = 260
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 5.5163798332214355,
 'eval_accuracy': 0.1,
 'eval_f1': 0.05581240371681176,
 'eval_precision': 0.05669644269825758,
 'eval_recall': 0.08297551805868235,
 'eval_runtime': 2.8768,
 'eval_samples_per_second': 90.379,
 'eval_steps_per_second': 11.471,
 'epoch': 40.0}