In [24]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="5"
SEED = 42
MODEL_DIR = 'deu_1_lr=3e-5_batch=8'
BATCH_SIZE = 8

In [25]:
import torch 
device = torch.device('cuda:0')

In [26]:
import pandas as pd
from datasets import Dataset

def read_df_custom(file):
    header = 'doc     unit1_toks      unit2_toks      unit1_txt       unit2_txt       s1_toks s2_toks unit1_sent      unit2_sent      dir     nuc_children    sat_children    genre   u1_discontinuous        u2_discontinuous       u1_issent        u2_issent       u1_length       u2_length       length_ratio    u1_speaker      u2_speaker      same_speaker    u1_func u1_pos  u1_depdir       u2_func u2_pos  u2_depdir       doclen  u1_position      u2_position     percent_distance        distance        lex_overlap_words       lex_overlap_length      unit1_case      unit2_case      label'
    extracted_columns = ['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label', 'distance', 'u1_depdir', 'u2_depdir', 'u2_func', 'u1_position', 'u2_position', 'sat_children', 'nuc_children', 'genre', 'unit1_case', 'unit2_case',
                            'u1_discontinuous', 'u2_discontinuous', 'same_speaker', 'lex_overlap_length', 'u1_func']
    header = header.split()
    df = pd.DataFrame(columns=extracted_columns)
    file = open(file, 'r')

    rows = []
    count = 0 
    for line in file:
        line = line[:-1].split('\t')
        count+=1
        if count ==1: continue
        row = {}
        for column in extracted_columns:
            index = header.index(column)
            row[column] = line[index]
        rows.append(row)

    df = pd.concat([df, pd.DataFrame.from_records(rows)])
    return df

train_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_train_enriched.rels'))
test_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_test_enriched.rels'))
valid_dataset = Dataset.from_pandas(read_df_custom('../../processed/deu.rst.pcc_dev_enriched.rels'))

len(train_dataset), len(test_dataset), len(valid_dataset)

(2164, 260, 241)

In [27]:
from transformers import AutoTokenizer, BertTokenizer
from datasets import ClassLabel

labels = ClassLabel(names=list(set(train_dataset['label'])|set(test_dataset['label'])|set(valid_dataset['label'])))

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
class SNLIDataset(torch.utils.data.Dataset):
    """A customized dataset to load the SNLI dataset."""
    def __init__(self, dataset, labels, raw_text=False):
        self.text = []
        self.raw_text = []
        self.raw_label = []
        self.raw_text_flag = raw_text
        for premise, hypothesis in zip(dataset['unit1_txt'], dataset['unit2_txt']):
            self.text.append(tokenizer.encode_plus(premise, hypothesis, padding="max_length", truncation=True, max_length=512))
            if raw_text: self.raw_text.append([premise, hypothesis])
        # self.labels = torch.tensor(labels.str2int(dataset['label'])).to(device)
        self.labels = labels.str2int(dataset['label'])
        if raw_text: self.raw_label = dataset['label']
        print('read ' + str(len(self.text)) + ' examples')

    def __getitem__(self, idx):
        if self.raw_text_flag:  
            return {'input_ids':self.text[idx]['input_ids'], 
                'token_type_ids':self.text[idx]['token_type_ids'], 
                'attention_mask':self.text[idx]['attention_mask'], 
                'raw_text': self.raw_text[idx],
                'label':self.labels[idx],
                'raw_label': self.raw_label[idx]}

        return {'input_ids':self.text[idx]['input_ids'], 
                'token_type_ids':self.text[idx]['token_type_ids'], 
                'attention_mask':self.text[idx]['attention_mask'], 
                'label':self.labels[idx]}

    def __len__(self):
        return len(self.text)


def load_data_snli(batch_size, labels):
    """Download the SNLI dataset and return data iterators and vocabulary."""
    train_data = train_dataset
    valid_data = valid_dataset
    test_data = test_dataset
    train_set = SNLIDataset(train_data, labels, raw_text=False)
    valid_set = SNLIDataset(valid_data, labels, raw_text=False)
    test_set = SNLIDataset(test_data, labels, raw_text=False)
    train_iter = torch.utils.data.DataLoader(train_set, batch_size,
                                             shuffle=True)
    valid_iter = torch.utils.data.DataLoader(valid_set, batch_size,
                                            shuffle=False)
    test_iter = torch.utils.data.DataLoader(test_set, batch_size,
                                            shuffle=False)
    
    return train_set, valid_set, test_set

train_iter, valid_iter, test_iter = load_data_snli(BATCH_SIZE, labels)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/vocab.txt
loading file tokenize

read 2164 examples
read 241 examples
read 260 examples


In [28]:
from transformers import AutoModelForSequenceClassification
from transformers.optimization import Adafactor, AdafactorSchedule, AdamW
from torch import optim

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=len(list(set(train_dataset['label'])|set(test_dataset['label'])|set(valid_dataset['label']))))
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABE

In [29]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# import evaluate
# metric = evaluate.load("accuracy")

In [30]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=MODEL_DIR, 
                                evaluation_strategy="epoch",
                                per_device_train_batch_size =BATCH_SIZE,
                                per_device_eval_batch_size=BATCH_SIZE,
                                num_train_epochs=40,
                                save_total_limit=1,
                                learning_rate=3e-5,
                                weight_decay=0.01,
                                logging_steps=1,
                                metric_for_best_model = 'acc')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_iter,
    eval_dataset=valid_iter,
    compute_metrics=compute_metrics,
    optimizers=[optimizer, lr_scheduler],
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=12)]
)

In [32]:
trainer.train() 

***** Running training *****
  Num examples = 2164
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10840
  Number of trainable parameters = 108330266
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.6876,2.971829,0.112033,0.04771,0.077456,0.068891
2,2.3505,2.803605,0.141079,0.089914,0.091141,0.118648
3,2.7998,2.790783,0.20332,0.124994,0.127115,0.148797
4,2.5884,2.709347,0.224066,0.132751,0.120829,0.189774
5,2.0924,2.771994,0.224066,0.163421,0.219906,0.19344
6,3.0196,2.811414,0.248963,0.175678,0.204066,0.213368
7,1.1744,2.878103,0.236515,0.173688,0.189696,0.191364
8,1.6329,3.098531,0.257261,0.180222,0.181484,0.213786
9,1.8416,3.102674,0.219917,0.178992,0.197292,0.198645
10,2.087,3.20832,0.219917,0.191628,0.210219,0.193921


***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to deu_1_lr=3e-5_batch=8/checkpoint-500
Configuration saved in deu_1_lr=3e-5_batch=8/checkpoint-500/config.json
Model weights saved in deu_1_lr=3e-5_batch=8/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to deu_1_lr=3e-5_batch=8/checkpoint-1000
Configuration saved in deu_1_lr=3e-5_batch=8/checkpoint-1000/config.json
Model weights saved in deu_1_lr=3e-5_batch=8/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [deu_1_lr=3e-5_batch=8/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
  _warn_prf(average, modifier, msg

KeyboardInterrupt: 

In [33]:
trainer.evaluate(test_iter)

***** Running Evaluation *****
  Num examples = 260
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 3.8034090995788574,
 'eval_accuracy': 0.15,
 'eval_f1': 0.13196580969243657,
 'eval_precision': 0.18770108674783045,
 'eval_recall': 0.15567194595465386}