In [None]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

In [None]:
import torch
# Ask PyTorch to release the unused GPU memory held by its caching allocator.
torch.cuda.empty_cache()

In [None]:
import sklearn
import pandas as pd
import numpy as np

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
import os
from os import listdir
import sys
import json
from os import path

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
from sklearn.model_selection import KFold

In [None]:
from transformers import AutoConfig

In [None]:
import datetime

# Take one timestamp and expose its calendar components as separate names.
date = datetime.datetime.now()
year, month, day = date.year, date.month, date.day

In [None]:
# When True, a baseline model is loaded and evaluated alongside the tested model
# so that the two can be compared on the same bootstrap resamples.
checking_significance = True

In [None]:
# Input locations; all are empty placeholders that must be filled in before running.
annotated_data = ""  # directory that is scanned for annotated .tsv files
triggerless_sample = ""  # directory that is scanned for sampled triggerless .tsv files
tested_model_dir = ""  # location passed to from_pretrained for the tested model
if checking_significance:
    # The baseline location is only defined when a comparison is requested.
    baseline_model_dir = ""



In [None]:
# Data-preparation mode handed to prep_test_data for each model:
# "marked-trigger" wraps the trigger in <t>...</t> inside the text,
# "unmarked-trigger" leaves the text unchanged.
baseline_setting = "unmarked-trigger" # marked-trigger or unmarked-trigger
tested_setting = "unmarked-trigger"

In [None]:
# Seed value for random number generation (presumably meant to make random
# sampling reproducible; confirm where it is consumed).
random_seed=22

In [None]:
# The tokenizer comes from the named pretrained checkpoint; the classifiers are
# loaded from the configured directories with a two-class head.
transformer_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(transformer_name)
if checking_significance:
    # The baseline is only needed when the two models are being compared.
    baseline_model = AutoModelForSequenceClassification.from_pretrained(baseline_model_dir, num_labels=2)
    
tested_model = AutoModelForSequenceClassification.from_pretrained(tested_model_dir, num_labels=2)


In [None]:
# Column names shared by the loading and preparation cells.
# Columns read from the annotated TSV files (and removed again after tokenization).
meaningful_columns = ["paragraph", "sentence", "approx_span", "trigger", "quality_controlled"]
# Column holding the annotation label ("b" is mapped to class 1, anything else to 0).
annotations_column = "quality_controlled"
# Columns read from the sampled triggerless TSV files.
neg_sample_meaningful_columns = ["sentence", "paragraph"]
# Column holding the text span in which the trigger is marked.
mention_span_column = "approx_span"


In [None]:
# Load annotated data: read every TSV file found in `annotated_data`, keep only
# the columns of interest and drop rows with any missing value.
#
# File names are sorted because os.listdir returns them in arbitrary order and
# the later de-duplication keeps the first occurrence, so an unsorted listing
# could change which rows survive from one machine to the next.
#
# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop, which copies all previous rows on every iteration
# and concatenates with an empty frame (deprecated in recent pandas).
annotated_frames = []
for file in sorted(listdir(annotated_data)):
    print(file)
    if file.endswith("tsv"):
        f_path = os.path.join(annotated_data, file)
        temp_df = pd.read_csv(f_path, sep='\t', usecols = meaningful_columns).dropna()
        print(len(temp_df))
        annotated_frames.append(temp_df)

# Keep the previous result (an empty frame) when no TSV file was found.
adf = pd.concat(annotated_frames) if annotated_frames else pd.DataFrame()


In [None]:
# Preview the first rows of the annotated data.
adf.head()

In [None]:
# Trim surrounding whitespace from every sentence, then keep only the first row
# for each (sentence, approx_span) combination.
adf = (
    adf
    .assign(sentence=[text.strip() for text in adf["sentence"]])
    .drop_duplicates(subset=["sentence", "approx_span"])
)

In [None]:
# Number of annotated rows currently held in adf.
len(adf)

In [None]:
anns = adf[annotations_column]
# Count the rows whose annotation is exactly "b".
b_count = sum(1 for label in anns if label == "b")

# fraction of annotated sentences labelled as beliefs (among all annotated)
b_count / len(adf)

In [None]:
# Load the sampled negative examples: read every .tsv file found in
# `triggerless_sample`, keeping only the sentence and paragraph columns.
#
# File names are sorted because os.listdir returns them in arbitrary order and
# drop_duplicates keeps the first occurrence. Frames are collected in a list
# and concatenated once instead of growing a DataFrame inside the loop.
negative_frames = []
for file in sorted(listdir(triggerless_sample)):
    f_path = os.path.join(triggerless_sample, file)
    print(f_path)
    if f_path.endswith(".tsv"):
        temp_df = pd.read_csv(f_path, sep='\t', usecols = neg_sample_meaningful_columns)
        negative_frames.append(temp_df)

# Keep the previous result (an empty frame) when no .tsv file was found.
ndf = pd.concat(negative_frames) if negative_frames else pd.DataFrame()

# Flatten each sentence onto one line (tabs and newlines become spaces) and
# trim it, so that de-duplication compares normalized text.
ndf["sentence"] = [s.strip().replace("\t", " ").replace("\n", " ") for s in ndf["sentence"]]
ndf = ndf.drop_duplicates(subset = ["sentence"])
# Every sampled sentence gets the label "n".
ndf["quality_controlled"] = ["n"] * len(ndf)
len(ndf)

In [None]:
# Preview the first rows of the sampled negative examples.
ndf.head()

In [None]:
# annotated + sampled triggerless
# Rows coming from ndf have no approx_span/trigger values, so those cells are
# missing in the combined frame. The original row indices are kept as they are.
df = pd.concat([adf, ndf])#.reset_index(drop=True)
print(f"Annotated + sampled = {len(df)}")

In [None]:
# annotated + sampled triggerless for baseline model
# A separate concatenation gives the baseline its own DataFrame object,
# independent of `df`.
if checking_significance:
    baseline_df = pd.concat([adf, ndf])#.reset_index(drop=True)
    print(f"Annotated + sampled = {len(baseline_df)}")

In [None]:
# Count the rows of the combined data whose label is exactly "b".
b_count = sum(1 for label in df["quality_controlled"] if label == "b")

# fraction of sentences labelled as beliefs
b_count / len(df)

In [None]:
# Show the full content of every cell when a DataFrame is displayed
# (None removes the column-width limit).
pd.set_option('display.max_colwidth', None)

In [None]:
# df.head(100)

In [None]:
def tokenize(batch):
    """Tokenize the ``sentence`` entries of a batch with the module-level tokenizer.

    Args:
        batch: mapping that provides the text under the ``'sentence'`` key.

    Returns:
        Whatever the tokenizer returns for those sentences, with truncation enabled.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
def prep_test_data(orig_df, setting):
    """Turn a labelled DataFrame into a tokenized test dataset.

    Args:
        orig_df: DataFrame holding the columns listed in ``meaningful_columns``
            (they are removed from the dataset after tokenization). The frame
            is copied first, so the caller's data is left untouched.
        setting: ``"marked-trigger"`` wraps every non-missing trigger in
            ``<t>...</t>`` inside the sentence and the paragraph; any other
            value leaves the text unchanged.

    Returns:
        The dataset built from the prepared frame and mapped through
        ``tokenize``, carrying an integer ``label`` column (1 for the
        annotation ``"b"``, 0 for anything else).

    Raises:
        ValueError: if the annotation column does not hold exactly two
            distinct values.
    """
    # Work on a copy: the index reset, trigger marking and label column below
    # must not leak into the frame owned by the caller.
    df = orig_df.copy()

    # Positional labels 0..n-1 so that df.at[i, ...] addresses exactly one row,
    # even when the incoming frame carries duplicate index values.
    df.index = list(range(len(df.index)))

    if setting == "marked-trigger":
        # Adding markers to trigger
        print("Marking triggers")
        for i in df.index:
            trigger_text = df.at[i, "trigger"]
            if pd.isna(trigger_text):
                # Rows without a trigger have nothing to mark.
                continue
            orig_span = df.at[i, mention_span_column]
            updated_span = orig_span.replace(trigger_text, "<t>" + trigger_text + "</t>")
            df.at[i, "sentence"] = df.at[i, "sentence"].replace(orig_span, updated_span)
            df.at[i, "paragraph"] = df.at[i, "paragraph"].replace(orig_span, updated_span)

    # assign numerical labels
    num_of_labels = len(set(df[annotations_column]))
    if num_of_labels != 2:
        # Fail with a clear message; the previous error path referenced an
        # undefined name and crashed with a NameError instead.
        raise ValueError(f"Wrong number of labels: {num_of_labels}")
    df['label'] = np.array([1 if x == "b" else 0 for x in df[annotations_column]])

    test_ds = Dataset.from_pandas(df).map(
        tokenize,
        batched=True,
        remove_columns=meaningful_columns
    )
    return test_ds
    

In [None]:
# just checking the df looks right
# df.head()

In [None]:
def compute_metrics(eval_pred):
    """Print a classification report and return the F1 score.

    Args:
        eval_pred: object exposing ``label_ids`` (gold labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        A dict with the single key ``'f1'``.
    """
    gold = eval_pred.label_ids
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    print("report: \n", metrics.classification_report(gold, predicted))
    f1 = metrics.f1_score(gold, predicted)
    return {'f1': f1}

In [None]:
# defining hyperparams
# The same batch size is used for the train and eval settings.
batch_size = 16
training_args = TrainingArguments(
    output_dir="./exp_results", 
    log_level='error',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    )

In [None]:
# Trainer wrapping the tested model; no train or eval dataset is passed here.
tested_model_trainer = Trainer(
    model=tested_model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Trainer wrapping the baseline model, configured exactly like the tested one;
# only built when the two models are being compared.
if checking_significance:
    baseline_trainer = Trainer(
        model=baseline_model,
        args=training_args,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

In [None]:
# tested_setting

In [None]:
# Prepare the data in the tested model's setting, run prediction, and reduce
# each row of prediction scores to the index of its highest-scoring class.
tested_model_test_ds = prep_test_data(df, tested_setting)
tested_preds = tested_model_trainer.predict(tested_model_test_ds)
tested_model_final_preds = list(np.argmax(tested_preds.predictions, axis=-1))

In [None]:
# df

In [None]:
# Same procedure for the baseline model, using its own frame and setting.
if checking_significance:
    baseline_model_test_ds = prep_test_data(baseline_df, baseline_setting)
    baseline_preds = baseline_trainer.predict(baseline_model_test_ds)
    baseline_model_final_preds = list(np.argmax(baseline_preds.predictions, axis=-1))
    

In [None]:
# Paired bootstrap evaluation.
#
# On every run the test set is resampled with replacement (same size as the
# original). Precision, recall and F1 of the tested model are recorded for each
# resample. When checking significance, the baseline is scored on the very same
# resample and we count how often the tested model has the strictly higher F1;
# the reported p-value is 1 minus that proportion.
tested_f1s = []
baseline_f1s = []

all_runs = 10000
tested_model_better = 0

ps = []
rs = []

# Seeded generator so that the resamples, and therefore the reported numbers,
# are reproducible from run to run.
rng = np.random.default_rng(random_seed)

# Convert once, outside the loop: reading the label column from the dataset on
# each of the 10000 iterations is needlessly expensive.
gold_labels = np.asarray(tested_model_test_ds["label"])
tested_preds_arr = np.asarray(tested_model_final_preds)
if checking_significance:
    baseline_preds_arr = np.asarray(baseline_model_final_preds)
n_examples = len(tested_preds_arr)

# how many times my model is better than the baseline
for i in range(all_runs):
    indices = rng.integers(n_examples, size=n_examples)

    # take sample of gold labels, tested model, and, if checking significance, a baseline model
    labels_sample = gold_labels[indices]
    final_preds_sample = tested_preds_arr[indices]

    if checking_significance:
        baseline_preds_sample = baseline_preds_arr[indices]

    # cal f1 scores for the two models
    f1 = metrics.f1_score(labels_sample, final_preds_sample)
    if checking_significance:
        baseline_f1 = metrics.f1_score(labels_sample, baseline_preds_sample)
        # Keep the per-run baseline scores; the list was declared but never filled.
        baseline_f1s.append(baseline_f1)

        if f1 > baseline_f1:
            tested_model_better += 1

    # calculate other stats for the tested model
    p = metrics.precision_score(labels_sample, final_preds_sample)
    r = metrics.recall_score(labels_sample, final_preds_sample)
    tested_f1s.append(f1)
    ps.append(p)
    rs.append(r)


if checking_significance:
    print(tested_model_better)
    prop_tested_model_better = float(tested_model_better)/all_runs
    print(prop_tested_model_better)
    print("p-value: ", round(1-prop_tested_model_better, 2))

# Mean and standard deviation of each metric over all bootstrap runs.
f1s_arr = np.array(tested_f1s)
mean = np.mean(f1s_arr)
std = np.std(f1s_arr)

ps_arr = np.array(ps)
p_mean = np.mean(ps_arr)
p_std = np.std(ps_arr)

r_arr = np.array(rs)
r_mean = np.mean(r_arr)
r_std = np.std(r_arr)

print("p: ", p_mean, " +- ",p_std )
print("r: ", r_mean, " +- ",r_std )
print("f1: ", mean, " +- ",std )

# One LaTeX table row: precision, recall and F1, each as mean with the
# standard deviation as a subscript.
latex_output = ""+str(round(p_mean,2))+"\\textsubscript{\\textpm "+str(round(p_std,2))+"} & "+str(round(r_mean,2))+"\\textsubscript{\\textpm "+str(round(r_std,2))+"} & "+str(round(mean,2))+"\\textsubscript{\\textpm "+str(round(std,2))+"}"
print("Latex table output: ",latex_output)