In [1]:
import os

from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import torch.nn.functional as F
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, IntervalStrategy
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

from tqdm.auto import tqdm
from sklearn.model_selection import LeavePGroupsOut

# Switch off the Weights & Biases integration before any Trainer is created.
# NOTE(review): the training log further down reports this environment variable as
# deprecated (to be removed in transformers v5) and points to `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


In [2]:
# Load the annotated trees; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/annotated_trees_101.csv',
    index_col=0,
)

# Replace the header wholesale: six node-level fields first, then one column per
# label. Downstream code relies on the labels starting at column position 6.
df.columns = [
    # node-level fields
    'node_id', 'tree_id', 'timestamp', 'author', 'text', 'parent',
    # label columns
    'Aggressive',
    'Agree But',
    'Agree To Disagree',
    'Alternative',
    'Answer',
    'Attack Validity',
    'BAD',
    'Clarification',
    'Complaint',
    'Convergence',
    'Counter Argument',
    'Critical Question',
    'Direct No',
    'Double Voicing',
    'Extension',
    'Irrelevance',
    'Moderation',
    'Neg Transformation',
    'Nitpicking',
    'No Reason Disagreement',
    'Personal',
    'Positive',
    'Repetition',
    'Rephrase Attack',
    'Request Clarification',
    'Ridicule',
    'Sarcasm',
    'Softening',
    'Sources',
    'Viable Transformation',
    'W Qualifiers',
]

In [3]:
def create_post_comment_pairs(df: pd.DataFrame, model) -> pd.DataFrame:
    """Pair every non-root comment with the root post of its tree.

    Args:
        df: One row per node. Must provide ``tree_id``, ``timestamp``, ``text`` and
            ``parent`` columns; a row whose ``parent`` equals -1 is treated as the
            root post of its tree. When ``model`` starts with ``'t5'`` a ``labels``
            column is read as well; otherwise every column from position 6 onwards
            is copied through as a label column.
        model: Model name (string). Names starting with ``'t5'`` select the
            single ``labels`` column layout.

    Returns:
        A new frame with the columns ``post``, ``comment``, ``tree_id``, ``time``,
        ``inputs`` followed by the label column(s). ``inputs`` holds
        ``'comment: <comment> post: <post>'``.

    Raises:
        IndexError: If a comment belongs to a tree that has no root row.
    """
    is_t5 = model.startswith('t5')

    # Resolve the root text of every tree once, instead of filtering the whole
    # frame again for each comment (which is quadratic in the number of rows).
    # drop_duplicates keeps the first root per tree, matching a `.values[0]` pick.
    root_rows = df.loc[df['parent'] == -1, ['tree_id', 'text']].drop_duplicates('tree_id')
    root_text_by_tree = dict(zip(root_rows['tree_id'], root_rows['text']))

    tuples = []
    for row in tqdm(df.itertuples(), total=len(df)):
        if row.parent == -1:
            continue

        tree_id = row.tree_id
        if tree_id not in root_text_by_tree:
            raise IndexError(f'no root post (parent == -1) found for tree_id {tree_id!r}')
        root = root_text_by_tree[tree_id]

        if is_t5:
            tuples.append((root, row.text, tree_id, row.timestamp, row.labels))
        else:
            # row starts from 7 because itertuples also returns the index in the tuple.
            tuples.append((root, row.text, tree_id, row.timestamp, *row[7:]))

    base_columns = ['post', 'comment', 'tree_id', 'time']
    if is_t5:
        tuples_df = pd.DataFrame(tuples, columns=base_columns + ['labels'])
    else:
        tuples_df = pd.DataFrame(tuples, columns=base_columns + df.columns[6:].tolist())

    tuples_df['inputs'] = 'comment: ' + tuples_df.comment.str.cat(' post: ' + tuples_df.post)

    # This makes sure that the labels are the last columns in the dataframe
    # ('inputs' was appended last, so move it right after the four base columns).
    new_columns_order = tuples_df.columns[:4].tolist() + [tuples_df.columns[-1]] + tuples_df.columns[4:-1].tolist()
    tuples_df = tuples_df[new_columns_order]

    return tuples_df


def remove_bad_comments(df: pd.DataFrame) -> pd.DataFrame:
    """Drop pairs whose post or comment text is exactly a removal placeholder.

    A row is discarded when its ``post`` or its ``comment`` equals ``'[removed]'``
    or ``'[deleted]'``; all other rows are returned unchanged, keeping their index.
    """
    placeholders = ['[removed]', '[deleted]']

    post_is_placeholder = df.post.isin(placeholders)
    comment_is_placeholder = df.comment.isin(placeholders)

    return df[~(post_is_placeholder | comment_is_placeholder)]

In [4]:
# Build (root post, comment) pairs in the multi-column label layout, discard pairs
# whose post or comment is a removal placeholder, and keep only the combined
# `inputs` text rather than the separate `post` / `comment` columns.
pairs_df = (
    create_post_comment_pairs(df, 'bert')
    .pipe(remove_bad_comments)
    .drop(columns=['post', 'comment'])
)

  0%|          | 0/10559 [00:00<?, ?it/s]

In [5]:
# Show the pair table: tree_id, time and inputs, followed by one column per label.
pairs_df

Unnamed: 0,tree_id,time,inputs,Aggressive,Agree But,Agree To Disagree,Alternative,Answer,Attack Validity,BAD,...,Positive,Repetition,Rephrase Attack,Request Clarification,Ridicule,Sarcasm,Softening,Sources,Viable Transformation,W Qualifiers
0,4r2a4d,1467557821,comment: Are you talking about relationships s...,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,4r2a4d,1467558355,comment: I was focusing more on the first (rel...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4r2a4d,1467584235,comment: I've been in a LDR for the past 2 1/2...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4r2a4d,1467559384,comment: It depends on what people want. If yo...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4r2a4d,1467561555,comment: Agreed. But isn't companionship diffe...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10453,7yf2le,1519073471,"comment: <quote>Also, there is no secret sauce...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10454,7yf2le,1519073875,comment: you should teach! post: Poverty Sensi...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10455,7yf2le,1519074041,comment: Nope there is even less money in that...,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10456,7yf2le,1519067247,comment: Ted Bundy had an IQ of 136. Would mor...,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
def prepare_model(model, labels=None):
    """Load a pretrained checkpoint together with its tokenizer.

    Args:
        model: Checkpoint name passed to ``from_pretrained``. Names starting with
            ``'t5'`` load a seq2seq language model; any other name loads a
            sequence classifier configured for multi-label classification.
        labels: Iterable of label names. Required for the classification path
            (it defines ``num_labels``, ``id2label`` and ``label2id``); ignored
            for ``'t5'`` checkpoints.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        TypeError: If a classification checkpoint is requested without ``labels``.
    """
    checkpoint = model
    is_seq2seq = checkpoint.startswith('t5')

    # Fail before anything is loaded, with a message that names the real problem
    # instead of the opaque "object of type 'NoneType' has no len()".
    if not is_seq2seq and labels is None:
        raise TypeError(
            f"prepare_model({checkpoint!r}) builds a classifier and needs `labels`, got None"
        )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    if is_seq2seq:
        return AutoModelForSeq2SeqLM.from_pretrained(checkpoint), tokenizer

    # Materialize once so any iterable (list, pandas Index, generator) works.
    label_names = list(labels)
    id2label = dict(enumerate(label_names))
    label2id = {name: idx for idx, name in id2label.items()}

    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        problem_type='multi_label_classification',
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )

    return classifier, tokenizer

In [7]:
# The label names are every column after the first three (tree_id, time, inputs).
model, tokenizer = prepare_model('bert-base-uncased', pairs_df.columns[3:])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
def tokenize_classification_input(examples, tokenizer, labels, source_max_length: int=512):
    """Tokenize a batch of ``inputs`` texts and attach a per-example label matrix.

    Args:
        examples: Batch mapping column name -> list of values. Must contain
            ``"inputs"`` and one entry for every name in ``labels``.
        tokenizer: Callable applied to the ``inputs`` texts; its result must
            support item assignment.
        labels: Ordered label names; column ``i`` of the label matrix is
            ``labels[i]``.
        source_max_length: Length the texts are padded / truncated to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a list of
        ``len(labels)`` floats for each example.
    """
    texts = examples["inputs"]
    encoded = tokenizer(
        texts,
        max_length=source_max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Only the batch columns that are label columns are of interest here.
    values_by_label = {name: examples[name] for name in examples.keys() if name in labels}

    # (batch_size, num_labels) float matrix, filled one label column at a time.
    targets = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = values_by_label[name]

    encoded["labels"] = targets.tolist()

    return encoded

In [9]:
def _tokenize_split(dataset, tokenizer, labels, source_max_length):
    """Tokenize one split and drop all of its original columns."""
    return dataset.map(tokenize_classification_input,
                       batched=True,
                       fn_kwargs={'tokenizer': tokenizer,
                                  'labels': labels,
                                  'source_max_length': source_max_length,
                                  },
                       remove_columns=dataset.column_names)


def prepare_training_data(df, tokenizer, labels, n_test_groups: int=5, source_max_length: int=512):
    """Split the pairs by tree and tokenize both splits.

    Whole trees are held out together: rows are grouped by ``tree_id`` and the
    first split produced by ``LeavePGroupsOut(n_test_groups)`` is used, so no tree
    contributes rows to both the training and the test set.

    Args:
        df: Pair table with a ``tree_id`` column, an ``inputs`` column and one
            column per label.
        tokenizer: Tokenizer forwarded to ``tokenize_classification_input``.
        labels: Label column names forwarded to ``tokenize_classification_input``.
        n_test_groups: Number of trees held out for the test split.
        source_max_length: Padding / truncation length used when tokenizing.

    Returns:
        ``(train_dataset, test_dataset)``, both tokenized.
    """
    splitter = LeavePGroupsOut(n_test_groups)
    # Only the first of the possible splits is needed, so stop after one.
    train_index, test_index = next(iter(splitter.split(df, groups=df['tree_id'])))

    train_dataset = Dataset.from_pandas(df.iloc[train_index])
    test_dataset = Dataset.from_pandas(df.iloc[test_index])

    train_dataset = _tokenize_split(train_dataset, tokenizer, labels, source_max_length)
    test_dataset = _tokenize_split(test_dataset, tokenizer, labels, source_max_length)

    return train_dataset, test_dataset

In [10]:
# Split by tree and tokenize; the label columns are everything after tree_id, time, inputs.
train_dataset, test_dataset = prepare_training_data(pairs_df, tokenizer, pairs_df.columns[3:])

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def compute_metrics(eval_preds, threshold: float=0.5):
    """Compute multi-label metrics from raw logits.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds logits of
            shape (batch_size, num_labels), or a tuple whose first element does;
            ``labels`` is the matching 0/1 indicator matrix.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        Dict with micro-averaged ``'f1'``, micro-averaged ``'roc_auc'`` and
        ``'accuracy'`` (as computed by ``accuracy_score`` on the indicator matrices).
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Logits -> probabilities. torch.sigmoid replaces the deprecated F.sigmoid, and
    # the result is converted to NumPy so that NumPy / scikit-learn receive arrays
    # rather than relying on implicit tensor conversion.
    logits = torch.as_tensor(np.asarray(predictions), dtype=torch.float32)
    probs = torch.sigmoid(logits).numpy()

    # Threshold the probabilities into 0/1 predictions.
    y_pred = (probs >= threshold).astype(int)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

In [12]:
training_args = TrainingArguments(
    output_dir='/tmp/results/',
    learning_rate=2e-5,
    # One example per device step; with 16 accumulation steps the optimizer sees an
    # effective batch of 16 on a single device.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    eval_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate, log and checkpoint on the same 50-step cadence. eval_steps is spelled
    # out rather than left to fall back to logging_steps.
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=50,
    save_strategy=IntervalStrategy.STEPS,
    logging_steps=50,
    save_steps=50,
    save_total_limit=5,
    # Supported replacement for the deprecated WANDB_DISABLED environment variable.
    # Note that "none" turns off every reporting integration, not only W&B.
    report_to="none",
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 9224
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 1728
  Number of trainable parameters = 109506079
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
50,0.4558,0.309332,0.425806,0.71657,0.179144
100,0.2593,0.224503,0.425806,0.752924,0.179144
150,0.2039,0.199843,0.425806,0.76577,0.179144
200,0.1908,0.190972,0.425806,0.7728,0.179144
250,0.1775,0.189318,0.425806,0.765074,0.179144
300,0.1757,0.185095,0.425806,0.774238,0.179144
350,0.1751,0.182381,0.425806,0.784332,0.179144
400,0.1696,0.181504,0.425806,0.780319,0.179144
450,0.1704,0.180343,0.425806,0.789238,0.179144


***** Running Evaluation *****
  Num examples = 374
  Batch size = 1
Saving model checkpoint to /tmp/results/checkpoint-50
Configuration saved in /tmp/results/checkpoint-50/config.json
Model weights saved in /tmp/results/checkpoint-50/pytorch_model.bin
tokenizer config file saved in /tmp/results/checkpoint-50/tokenizer_config.json
Special tokens file saved in /tmp/results/checkpoint-50/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 374
  Batch size = 1
Saving model checkpoint to /tmp/results/checkpoint-100
Configuration saved in /tmp/results/checkpoint-100/config.json
Model weights saved in /tmp/results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in /tmp/results/checkpoint-100/tokenizer_config.json
Special tokens file saved in /tmp/results/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 374
  Batch size = 1
Saving model checkpoint to /tmp/results/checkpoint-150
Configuration saved in /tmp/results/checkpoin

KeyboardInterrupt: 