In [97]:
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, IntervalStrategy
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

from tqdm.auto import tqdm
from sklearn.model_selection import LeavePGroupsOut


In [24]:
# Load the annotated trees; the first CSV column becomes the frame index.
df = pd.read_csv('../data/annotated_trees_101.csv', index_col=0)

# Overwrite the header with explicit names: six metadata columns first,
# followed by one column per annotation label.
df.columns = (
    ['node_id', 'tree_id', 'timestamp', 'author', 'text', 'parent']
    + [
        'Aggressive', 'Agree But', 'Agree To Disagree', 'Alternative',
        'Answer', 'Attack Validity', 'BAD', 'Clarification', 'Complaint',
        'Convergence', 'Counter Argument', 'Critical Question', 'Direct No',
        'Double Voicing', 'Extension', 'Irrelevance', 'Moderation',
        'Neg Transformation', 'Nitpicking', 'No Reason Disagreement',
        'Personal', 'Positive', 'Repetition', 'Rephrase Attack',
        'Request Clarification', 'Ridicule', 'Sarcasm', 'Softening',
        'Sources', 'Viable Transformation', 'W Qualifiers',
    ]
)

In [85]:
def create_post_comment_pairs(df: pd.DataFrame, task_type='t5') -> pd.DataFrame:
    """Pair every non-root comment with the root post of its tree.

    Rows whose ``parent`` equals -1 are treated as roots and are not emitted
    themselves; every other row yields one output row.

    Args:
        df: Frame with at least ``tree_id``, ``timestamp``, ``text`` and
            ``parent`` columns. For ``task_type == 't5'`` a ``labels`` column
            is also read; otherwise every column from position 6 onward is
            carried over unchanged.
        task_type: ``'t5'`` keeps a single ``labels`` column; any other value
            keeps the individual columns ``df.columns[6:]``.

    Returns:
        A frame with columns ``post``, ``comment``, ``tree_id``, ``time``,
        ``inputs`` followed by the label column(s), where ``inputs`` is
        ``'comment: <comment> post: <post>'``.

    Raises:
        IndexError: If a comment belongs to a tree that has no root row.
    """
    # Resolve each tree's root text once, instead of re-filtering the whole
    # frame for every comment (which made the loop quadratic in len(df)).
    root_rows = df.loc[df['parent'] == -1, ['tree_id', 'text']]
    root_text_by_tree = {}
    for root_tree_id, root_text in zip(root_rows['tree_id'], root_rows['text']):
        # setdefault keeps the first root in frame order if a tree has several.
        root_text_by_tree.setdefault(root_tree_id, root_text)

    is_t5 = task_type == 't5'
    tuples = []
    for row in tqdm(df.itertuples(), total=len(df)):
        if row.parent == -1:
            continue

        tree_id = row.tree_id
        try:
            root = root_text_by_tree[tree_id]
        except KeyError:
            # Same exception type the per-row `.values[0]` lookup used to raise.
            raise IndexError(
                f'tree {tree_id!r} has no root row (parent == -1)'
            ) from None

        if is_t5:
            tuples.append((root, row.text, tree_id, row.timestamp, row.labels))
        else:
            # row[0] is the index, so positions 7+ line up with df.columns[6:].
            tuples.append((root, row.text, tree_id, row.timestamp, *row[7:]))

    if is_t5:
        tuples_df = pd.DataFrame(tuples, columns=['post', 'comment', 'tree_id', 'time', 'labels'])
    else:
        tuples_df = pd.DataFrame(tuples, columns=['post', 'comment', 'tree_id', 'time'] + df.columns[6:].tolist())
    tuples_df['inputs'] = 'comment: ' + tuples_df.comment.str.cat(' post: ' + tuples_df.post)

    # Move the freshly appended 'inputs' column to position 4, right after the
    # four metadata columns and before the label column(s).
    new_columns_order = tuples_df.columns[:4].tolist() + [tuples_df.columns[-1]] + tuples_df.columns[4:-1].tolist()
    tuples_df = tuples_df[new_columns_order]

    return tuples_df

def remove_bad_comments(df: pd.DataFrame) -> pd.DataFrame:
    """Drop pairs whose post or comment is a '[removed]' / '[deleted]' placeholder.

    Args:
        df: Frame with ``post`` and ``comment`` columns.

    Returns:
        The rows of ``df`` where neither ``post`` nor ``comment`` is exactly
        one of the placeholder strings; the original index is kept.
    """
    placeholders = ['[removed]', '[deleted]']

    post_is_placeholder = df['post'].isin(placeholders)
    comment_is_placeholder = df['comment'].isin(placeholders)

    # Keep a row only when neither text field is a placeholder.
    keep_mask = ~(post_is_placeholder | comment_is_placeholder)
    return df[keep_mask]

In [86]:
# Build (post, comment) pairs with one column per label, then drop the pairs
# whose post or comment text is a '[removed]' / '[deleted]' placeholder.
pairs_df = remove_bad_comments(create_post_comment_pairs(df, 'bert'))

  0%|          | 0/10559 [00:00<?, ?it/s]

In [88]:
# Inspect the cleaned pairs frame; the bare expression is rendered as the cell output.
pairs_df

Unnamed: 0,post,comment,tree_id,time,inputs,Aggressive,Agree But,Agree To Disagree,Alternative,Answer,...,Positive,Repetition,Rephrase Attack,Request Clarification,Ridicule,Sarcasm,Softening,Sources,Viable Transformation,W Qualifiers
0,I posted this on other subreddit but I figure ...,Are you talking about relationships starting a...,4r2a4d,1467557821,comment: Are you talking about relationships s...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,I posted this on other subreddit but I figure ...,I was focusing more on the first (relationship...,4r2a4d,1467558355,comment: I was focusing more on the first (rel...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,I posted this on other subreddit but I figure ...,I've been in a LDR for the past 2 1/2 years. W...,4r2a4d,1467584235,comment: I've been in a LDR for the past 2 1/2...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,I posted this on other subreddit but I figure ...,It depends on what people want. If you persona...,4r2a4d,1467559384,comment: It depends on what people want. If yo...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,I posted this on other subreddit but I figure ...,Agreed. But isn't companionship different in r...,4r2a4d,1467561555,comment: Agreed. But isn't companionship diffe...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10453,Poverty Sensible Gun control Bigotry Racism Ob...,"<quote>Also, there is no secret sauce teaching...",7yf2le,1519073471,"comment: <quote>Also, there is no secret sauce...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10454,Poverty Sensible Gun control Bigotry Racism Ob...,you should teach!,7yf2le,1519073875,comment: you should teach! post: Poverty Sensi...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10455,Poverty Sensible Gun control Bigotry Racism Ob...,Nope there is even less money in that than the...,7yf2le,1519074041,comment: Nope there is even less money in that...,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10456,Poverty Sensible Gun control Bigotry Racism Ob...,Ted Bundy had an IQ of 136. Would more educati...,7yf2le,1519067247,comment: Ted Bundy had an IQ of 136. Would mor...,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [10]:
def create_clf_tokenize_fn(tokenizer, source_max_legnth: int=512):
    """Build a function that tokenizes the "inputs" field and attaches labels.

    NOTE(review): a later cell redefines ``create_clf_tokenize_fn`` with an
    extra ``labels`` parameter, so this version is shadowed once that cell runs.

    Args:
        tokenizer: Callable applied to ``examples["inputs"]``.
        source_max_legnth: Passed to the tokenizer as ``max_length``.

    Returns:
        A function mapping ``examples`` to the tokenizer output with an added
        "labels" entry.
    """
    tokenizer_kwargs = dict(max_length=source_max_legnth, padding=True, truncation=True)

    def tokenize_input(examples):
        encoded = tokenizer(examples["inputs"], **tokenizer_kwargs)
        # Everything from column position 4 onward is stored as the labels;
        # this requires `examples` to support positional `.iloc` indexing.
        encoded["labels"] = examples.iloc[:, 4:]
        return encoded

    return tokenize_input

In [89]:
# pairs_df starts with five non-label columns (post, comment, tree_id, time,
# inputs). Slicing from position 4 would register the text column 'inputs' as a
# class and make num_labels one larger than the label matrix fed to the model.
label_columns = pairs_df.columns[5:].tolist()

num_labels = len(label_columns)
id2label = dict(enumerate(label_columns))
label2id = {c: i for i, c in id2label.items()}
# Each comment may carry several labels at once.
problem_type = 'multi_label_classification'

In [90]:
# Load bert-base-uncased with a classification head configured from the label
# metadata defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type=problem_type,
)

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [93]:
# Hold out 5 whole trees: grouping by tree_id keeps every comment of a tree on
# the same side of the split. Only the first generated split is used.
lpgo = LeavePGroupsOut(5)
train_index, test_index = next(iter(lpgo.split(pairs_df, groups=pairs_df['tree_id'])))

In [117]:
def create_clf_tokenize_fn(tokenizer, labels, source_max_legnth: int=512):
    """Build a batch function that tokenizes "inputs" and packs the label columns.

    Args:
        tokenizer: Callable applied to ``examples["inputs"]``.
        labels: Names of the label fields, in the order they should appear in
            each row of the "labels" matrix.
        source_max_legnth: Passed to the tokenizer as ``max_length``.

    Returns:
        A function that takes a batch (mapping of field name to values) and
        returns the tokenizer output with a "labels" entry: one list of floats
        per example, one float per name in ``labels``.
    """
    def tokenize_input(examples):
        encoded = tokenizer(
            examples["inputs"],
            max_length=source_max_legnth,
            padding=True,
            truncation=True,
        )

        batch_size = len(examples["inputs"])
        # Float matrix of shape (batch_size, num_labels), filled column by column.
        label_matrix = np.zeros((batch_size, len(labels)))
        for column, name in enumerate(labels):
            label_matrix[:, column] = examples[name]

        encoded["labels"] = label_matrix.tolist()
        return encoded

    return tokenize_input

In [118]:
# Label names are every pairs_df column after the five leading non-label
# columns (post, comment, tree_id, time, inputs).
tokenizer_fn = create_clf_tokenize_fn(
    tokenizer,
    list(pairs_df.columns[5:]),
    source_max_legnth=512,
)

In [119]:
# Materialise the first split as pandas frames (positional row indices).
train_df = pairs_df.iloc[train_index]
test_df = pairs_df.iloc[test_index]

# Convert each frame to a Dataset and tokenize it batch by batch.
train_dataset = Dataset.from_pandas(train_df).map(tokenizer_fn, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenizer_fn, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

(2, 31)