In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using before the run starts.
torch.cuda.empty_cache()

In [None]:
from torch import nn
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

In [None]:
import sklearn
import pandas as pd
import numpy as np

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
import os
from os import listdir
import sys
import json
from os import path

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
from sklearn.model_selection import KFold

In [None]:
# save models here:
general_models_dir = ""

In [None]:
# Directory that receives one text log per experiment; create it on first use.
logs = "./logs"
if not os.path.exists(logs):
    os.makedirs(logs)
    print("Created dir: ", logs)
else:
    print("Dir already exists: ", logs)


In [None]:
from datetime import datetime
currentDateAndTime = datetime.now()
currentDateAndTime

In [None]:
# CONDITIONS
# Experiment configuration. Several of these values are also embedded in the name of
# the directory created for the current experiment.

setting = "marked-trigger" # marked-trigger (the trigger gets wrapped in <t>...</t> markers) or unmarked-trigger
trigger_source = "extracted" # extracted or predicted (predicted not currently used)
training_type = "CV" # CV (k-fold cross-validation) or full-model (single train/eval split)
data = "mturk-clean" # "mturk-clean" or "mturk-orig"
if data == "mturk-orig":
    # threshold is only defined for the "mturk-orig" data, where labels are derived from annotator counts
    threshold = 0.5 # 1.0 - only count mturk data as belief if all annotators marked it as belief; 0.5 - count as belief if at least half of the annotators marked it a belief
transformer_name = "bert-base-cased" 
random_seed=22

resample_triggerless = False
rewrite_sample = True # whether to overwrite the sample file if it already exists when sampling from a larger set of neg examples
if resample_triggerless:
    # only defined when a new triggerless sample is drawn
    neg_example_multiplier = 4 # num of positive class annotations * neg_example_multiplier = triggerless sample size



In [None]:
# Compose the name of the directory for the current experiment from the run
# conditions plus a (non-zero-padded) year-month-day-hour-minute stamp.
run_stamp = "-".join(
    str(part)
    for part in (
        currentDateAndTime.year,
        currentDateAndTime.month,
        currentDateAndTime.day,
        currentDateAndTime.hour,
        currentDateAndTime.minute,
    )
)
specific_dir = "_".join(
    [training_type, data, f"{trigger_source}-{setting}", transformer_name, run_stamp]
)
if data == "mturk-orig":
    # keep the belief threshold visible in the directory name
    specific_dir = specific_dir.replace(data, f"{data}-{threshold}-threshold")
models_dir = os.path.join(general_models_dir, specific_dir)
print("The model will be saved to:\n" + models_dir)

In [None]:
# Create the experiment's model directory unless it is already there.
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print("Created dir: ", models_dir)
else:
    print("Dir already exists: ", models_dir)

In [None]:
# Directory with the annotated TSV files (fill in before running).
annotated_data = ""
# Directory with triggerless examples: can be a big set to sample from or a set to use as is.
triggerless_sample = ""

if not resample_triggerless:
    print(f"Using previously sampled data from {triggerless_sample}")
else:
    # When resampling, this is the directory where the triggerless sample
    # that is actually used gets saved.
    triggerless_samples_used = ""
    if not os.path.exists(triggerless_samples_used):
        os.makedirs(triggerless_samples_used)
        print("Created dir: ", triggerless_samples_used)
    else:
        print("Dir already exists: ", triggerless_samples_used)



In [None]:
# setting some vars
# Columns to read from the annotated TSV files. The "mturk-orig" data additionally
# needs the two annotator-count columns from which its labels are derived.
if data == "mturk-clean":
    meaningful_columns = ["paragraph", "sentence", "approx_span", "trigger", "quality_controlled"]
elif data == "mturk-orig":
    meaningful_columns = ["paragraph", "sentence", "approx_span", "trigger", "quality_controlled", "accepted_count", "belief_ann_count"]
else:
    # Fail here rather than continuing with `meaningful_columns` undefined
    # (or left over from an earlier run in the same kernel).
    raise ValueError(f"Unknown data type: {data}")


# column holding the annotation ("b" marks a belief)
annotations_column = "quality_controlled"
# columns to read from the triggerless (negative) example files
neg_sample_meaningful_columns = ["sentence", "paragraph"]
# column holding the text span in which the trigger gets marked
mention_span_column = "approx_span"


In [None]:
# load annotated data
# Read every .tsv file in `annotated_data` (rows with missing values are dropped)
# and concatenate the frames once at the end instead of growing a DataFrame inside
# the loop. Files are visited in sorted order because os.listdir returns names in
# arbitrary order, which would otherwise make the row order differ between runs.
annotated_frames = []
for file in sorted(listdir(annotated_data)):
    print(file)
    if file.endswith(".tsv"):
        f_path = os.path.join(annotated_data, file)
        temp_df = pd.read_csv(f_path, sep='\t', usecols = meaningful_columns).dropna()
        annotated_frames.append(temp_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched
adf = pd.concat(annotated_frames) if annotated_frames else pd.DataFrame()
print("Annoted data size: ", len(adf))

In [None]:
adf.head()

In [None]:
# minor cleanup: trim whitespace around sentences, then keep one row per
# (sentence, span) pair
adf["sentence"] = list(map(str.strip, adf["sentence"]))
adf = adf.drop_duplicates(subset=["sentence", "approx_span"])
print("Annoted data size (updated): ", len(adf))

In [None]:
# add labels
if data == "mturk-orig":
    # label is 1 when the proportion of annotators who marked the row as a belief
    # reaches the threshold
    adf["prop"] = adf["belief_ann_count"]/adf["accepted_count"]
    adf["label"] = [1 if x >= threshold else 0 for x in adf["prop"]]
    adf = adf.drop(["prop"], axis=1)
elif data == "mturk-clean":
    # assign numerical labels: "b" -> 1, anything else -> 0
    num_of_labels = len(set(adf[annotations_column]))
    if num_of_labels == 2:
        adf["label"] = [1 if x == "b" else 0 for x in adf[annotations_column]]
    else:
        print(f"Wrong number of labels: {num_of_labels}")
else:
    # previously any value other than "mturk-orig" silently took the "mturk-clean" path
    raise ValueError(f"Unknown data type: {data}")
        

In [None]:
adf.tail(10)

In [None]:
# Count the rows annotated as beliefs ("b") ...
anns = adf[annotations_column]
b_count = sum(1 for ann in anns if ann == "b")

# ... and show their share among all annotated rows
b_count / len(adf)

In [None]:
# load negative (triggerless, automatically extracted) examples
# One frame per .tsv file, concatenated once at the end instead of growing a
# DataFrame inside the loop. Files are visited in sorted order because os.listdir
# returns names in arbitrary order, which would otherwise make the row order
# (and any later seeded sampling) differ between runs.
negative_frames = []
for file in sorted(listdir(triggerless_sample)):
    f_path = os.path.join(triggerless_sample, file)
    if f_path.endswith(".tsv"):
        temp_df = pd.read_csv(f_path, sep='\t', usecols = neg_sample_meaningful_columns)
        negative_frames.append(temp_df)
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched
ndf = pd.concat(negative_frames) if negative_frames else pd.DataFrame()

# collapse tabs/newlines so each sentence stays on a single TSV line
ndf["sentence"] = [s.strip().replace("\t", " ").replace("\n", " ") for s in ndf["sentence"]]
ndf = ndf.drop_duplicates(subset = ["sentence"])
# every triggerless example is treated as a non-belief
ndf["quality_controlled"] = ["n"] * len(ndf)
ndf["label"] = [0] * len(ndf)
len(ndf)

In [None]:
ndf.head()

In [None]:
# Size of the triggerless sample: the number of annotated beliefs times the
# multiplier, capped at the number of triggerless examples that are available.
if resample_triggerless:
    n_neg_examples_to_use = min(len(ndf), neg_example_multiplier * b_count)
    print("Num of triggerless sentences to sample: ", n_neg_examples_to_use)

In [None]:
# take the neg example sample, write it to a file for record keeping, and read it back in
if resample_triggerless:
    sample_file_name = os.path.join(triggerless_samples_used, f"triggerless_sample_{neg_example_multiplier}_times_belief_number.tsv")
    if path.exists(sample_file_name) and not rewrite_sample:
        print("exists")
    else:
        print("new sample")
        # Keep the sample in its own variable: to_csv returns None when given a path,
        # so assigning its result to `ndf` would clobber the frame if the read-back failed.
        sampled_ndf = ndf.sample(n=n_neg_examples_to_use, random_state = random_seed).reset_index(drop=True)
        sampled_ndf.to_csv(sample_file_name, index=False, sep="\t")
    # in both cases the sample actually used is the one stored on disk
    ndf = pd.read_csv(sample_file_name, sep="\t")

    print(f"N triggerless examples: {len(ndf)}")
else:
    print(f"Using full ndf (n={len(ndf)}) loaded from {triggerless_sample}")

In [None]:
ndf.head()

In [None]:
# annotated + sampled triggerless
df = pd.concat([adf, ndf])
print(len(df))
# shuffle df; the shuffle is seeded so that the row order, and therefore the content
# of the seeded train/eval splits made later, is the same on every run
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
print(len(df))
print(f"Annotated + sampled = {len(df)}")

In [None]:
df.head()

In [None]:
# make sure rows are labelled 0..n-1 so they can be addressed with df.at
df.index = list(range(len(df.index)))

if setting != "marked-trigger":
    print("Else setting: ", setting)
else:
    # Adding markers to trigger: wrap the trigger in <t>...</t> inside the mention
    # span and substitute the marked span into both the sentence and the paragraph.
    print("Setting: ", setting)
    for row in df.index:
        trigger_text = df.at[row, "trigger"]
        if pd.isna(trigger_text):
            # rows without a trigger are left untouched
            continue
        span = df.at[row, mention_span_column]
        marked_span = span.replace(trigger_text, "<t>" + trigger_text + "</t>")
        for column in ("sentence", "paragraph"):
            df.at[row, column] = df.at[row, column].replace(span, marked_span)



In [None]:
# to see full cell content
pd.set_option('display.max_colwidth', None)

In [None]:
# df.head()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(transformer_name)
if training_type != "full-model":
    # in the cross-validation condition the model is loaded anew for each fold
    print(f"Training type: {training_type}")
else:
    # a single two-class classifier is enough when training one full model
    model = AutoModelForSequenceClassification.from_pretrained(transformer_name, num_labels=2)

In [None]:
def tokenize(batch):
    """Run the notebook-level ``tokenizer`` over the ``sentence`` entries of ``batch``.

    Truncation is enabled; the tokenizer's output is returned as is.
    """
    sentences = batch['sentence']
    return tokenizer(sentences, truncation=True)

In [None]:
def compute_metrics(eval_pred):
    """Print a classification report and return the F1 score for one evaluation pass.

    Args:
        eval_pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        A dict with the single key ``'f1'``.
    """
    gold = eval_pred.label_ids
    predicted = np.argmax(eval_pred.predictions, axis=-1)

    report = metrics.classification_report(gold, predicted)
    print("report: \n", report)

    print("rep type: ", type(report))

    f1 = metrics.f1_score(gold, predicted)
    return {'f1': f1}

In [None]:
# this is for creating cross-validation folds
def get_sample_based_on_idx(data, indeces):
    """Select rows of ``data`` by position and renumber them from zero.

    Args:
        data: DataFrame to select from.
        indeces: positional row indices (e.g. one side of a KFold split).

    Returns:
        A new DataFrame with the selected rows; the previous index labels are
        kept in an added ``index`` column.
    """
    selected_rows = data.iloc[indeces, :]
    return selected_rows.reset_index()

In [None]:
# sanity check: the label column holds exactly the two classes 0 and 1
set(df["label"]) == {0, 1}

In [None]:
# use sentences as text to base classification on
# (the values are copied into a separate "text" column; "sentence" itself is kept)
df["text"] = df["sentence"]
# how much of the data to use (can limit number for debugging)
# the full slice [:] keeps every row; narrow the slice for a quick debugging run
df = df[:]


In [None]:
# just checking the df looks right
df.head(5)

In [None]:
# defining hyperparams
num_epochs = 20
batch_size = 16 # used for both the training and the evaluation batches
weight_decay = 0.01
if training_type == "CV":
    n_folds = 5 # number of cross-validation folds; only defined in the CV condition
# Checkpoints go to ./checkpoints/<specific_dir>; the same arguments object is reused
# for every Trainer created below.
training_args = TrainingArguments(
    output_dir=os.path.join("./checkpoints", specific_dir),  
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch', # evaluate once per epoch ...
    save_strategy='epoch', # ... and save a checkpoint once per epoch
    weight_decay=weight_decay,
    load_best_model_at_end=True, # this is supposed to make sure the best model is loaded by the trainer at the end
    metric_for_best_model="eval_f1" # presumably the 'f1' value returned by compute_metrics with the Trainer's "eval_" prefix - confirm
    )

In [None]:
# one text log per experiment, named after the experiment directory
log_file = os.path.join(logs, f"{specific_dir}.txt")
print("Log saved at: ", log_file)

In [None]:
import time
start_time = time.time()

In [None]:
# Train and evaluate the classifier (once per cross-validation fold, or once on a
# single train/eval split), mirroring progress to stdout and to `log_file`.

def _collect_false_predictions(sentences, gold_labels, predicted_labels):
    """Gather the misclassified examples for error analysis.

    Args:
        sentences: sequence of input sentences, aligned with the two label sequences.
        gold_labels: sequence of true labels.
        predicted_labels: sequence of predicted labels (0 is the negative class).

    Returns:
        Tuple ``(errors, n_false_pos, n_false_neg)``. ``errors`` is a DataFrame with
        the columns ``sentence``, ``real`` and ``predicted`` and one row per wrong
        prediction, indexed 0..n-1 (the columns are present even when it is empty).
    """
    records = []
    n_false_pos = 0
    n_false_neg = 0
    for sentence, real, predicted in zip(sentences, gold_labels, predicted_labels):
        if predicted == real:
            continue
        records.append({"sentence": sentence, "real": real, "predicted": predicted})
        if predicted == 0:
            # predicted negative although the example is positive
            n_false_neg += 1
        else:
            n_false_pos += 1
    # build the frame once instead of concatenating one single-row frame per error
    errors = pd.DataFrame(records, columns=["sentence", "real", "predicted"])
    return errors, n_false_pos, n_false_neg


# "w": start writing the file, don't add to existing. The context manager closes the
# log even if training raises part-way through.
with open(log_file, "w") as output:

    if training_type == "CV":
        print(f"Training type: {training_type}")
        kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
        for fold, (train_df_idx, eval_df_idx) in enumerate(kfold.split(df)):

            print("FOLD: ", fold)
            output.write(f"FOLD: {fold}\n")

            train_df = get_sample_based_on_idx(df, train_df_idx)
            print("LEN TRAIN DF: ", len(train_df))
            output.write(f"LEN TRAIN DF: {len(train_df)}\n")

            eval_df = get_sample_based_on_idx(df, eval_df_idx)
            print("LEN EVAL: ", len(eval_df))
            output.write(f"LEN EVAL: {len(eval_df)}\n")

            ds = DatasetDict()
            ds['train'] = Dataset.from_pandas(train_df)
            ds['validation'] = Dataset.from_pandas(eval_df)

            # 'index' is the column added by reset_index() in get_sample_based_on_idx
            train_ds = ds['train'].map(
                tokenize, batched=True,
                remove_columns=['index'] + meaningful_columns
            )

            eval_ds = ds['validation'].map(
                tokenize,
                batched=True,
                remove_columns=['index'] + meaningful_columns
            )

            # a fresh model (and tokenizer) is loaded for every fold
            model = AutoModelForSequenceClassification.from_pretrained(transformer_name, num_labels=2)
            tokenizer = AutoTokenizer.from_pretrained(transformer_name)

            trainer = Trainer(
                model=model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=train_ds,
                eval_dataset=eval_ds,
                tokenizer=tokenizer,
            )

            trainer.train()

            # after training, make predictions
            preds = trainer.predict(eval_ds)
            final_preds = [np.argmax(x) for x in preds.predictions]
            real_f1 = metrics.f1_score(eval_df["label"], final_preds)
            real_p = metrics.precision_score(eval_df["label"], final_preds)
            real_r = metrics.recall_score(eval_df["label"], final_preds)

            print("P: ", real_p)
            output.write(f"P: {real_p}\n")
            print("R: ", real_r)
            output.write(f"R: {real_r}\n")
            print("F-1: ", real_f1)
            output.write(f"F-1: {real_f1}\n")

            # save each model fold into a separate dir
            model_name = f"{transformer_name}-best-of-fold-{fold}-f1-{real_f1}"
            model_dir = os.path.join(models_dir, model_name)

            trainer.save_model(model_dir)

            output.write("Training Log:\n")
            for obj in trainer.state.log_history:
                print(obj)
                # one entry per line; without the newline all entries run together
                output.write(f"{obj}\n")

            # get false pos and neg
            new_df, count_f_p, count_f_n = _collect_false_predictions(
                list(eval_df["sentence"]), list(eval_df["label"]), final_preds
            )

            print(f"n of false pos: {count_f_p}")
            output.write(f"n of false pos: {count_f_p}\n")
            print(f"n of false neg: {count_f_n}")
            output.write(f"n of false neg: {count_f_n}\n")

            # write false predictions to file for error analysis
            new_df.to_csv(os.path.join(models_dir, "false_predictions_" + str(fold) + ".tsv"), sep="\t")

    elif training_type == "full-model":

        print(f"Training type: {training_type}")

        train_df, eval_df = sklearn.model_selection.train_test_split(df, test_size=0.2, random_state=random_seed)

        # save eval sample
        eval_df.to_csv(os.path.join(models_dir, "eval_from_full_model_training.tsv"), sep="\t")

        print("LEN TRAIN DF: ", len(train_df))
        output.write(f"LEN TRAIN DF: {len(train_df)}\n")
        print("LEN EVAL: ", len(eval_df))
        output.write(f"LEN EVAL: {len(eval_df)}\n")
        ds = DatasetDict()
        ds['train'] = Dataset.from_pandas(train_df)
        ds['validation'] = Dataset.from_pandas(eval_df)
        train_ds = ds['train'].map(
            tokenize, batched=True,
            remove_columns=meaningful_columns
        )
        eval_ds = ds['validation'].map(
            tokenize,
            batched=True,
            remove_columns=meaningful_columns
        )

        # `model` and `tokenizer` are the ones loaded earlier for the full-model condition
        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            tokenizer=tokenizer,
        )
        trainer.train()
        # after training, make predictions (with the best model)
        preds = trainer.predict(eval_ds)
        final_preds = [np.argmax(x) for x in preds.predictions]
        real_f1 = metrics.f1_score(eval_df["label"], final_preds)
        print("F-1: ", real_f1)
        output.write(f"F-1: {real_f1}\n")
        model_name = f"{transformer_name}-best-f1-{real_f1}"
        model_dir = os.path.join(models_dir, model_name)

        trainer.save_model(model_dir)

        # eval_df keeps its pre-split index here, so rows are matched by position
        new_df, count_f_p, count_f_n = _collect_false_predictions(
            list(eval_df["sentence"]), list(eval_df["label"]), final_preds
        )

        print(f"n of false pos: {count_f_p}")
        output.write(f"n of false pos: {count_f_p}\n")
        print(f"n of false neg: {count_f_n}")
        output.write(f"n of false neg: {count_f_n}\n")
        new_df.to_csv(os.path.join(models_dir, "false_predictions.tsv"), sep="\t")

    else:
        print(f"Unknown training setting: {training_type}")

    # record GPU memory usage in the log; the f-prefix is required, otherwise the
    # literal "{torch.cuda...}" text is written instead of the summary
    output.write(f"{torch.cuda.memory_summary(device=None, abbreviated=False)}")

In [None]:
# wall-clock time since `start_time`, in minutes
elapsed_minutes = (time.time() - start_time) / 60
print(f"--- {elapsed_minutes} minutes ---")