In [None]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

In [None]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator before doing any work.
torch.cuda.empty_cache()

In [None]:
import sklearn
import pandas as pd
import numpy as np

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
import os
from os import listdir
import sys
import json
from os import path

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
from sklearn.model_selection import KFold

In [None]:
from transformers import AutoConfig

In [None]:
# Input/output locations used by the rest of the notebook.
# NOTE(review): these are absolute paths; they have to be edited before running in another environment.
# annotated data directory
annotated_data = "/home/u32/cchyland/iBelieveFiles/only_uganda_with_triggers"
# original triggerless data directory (read by the sampling cell below)
triggerless_original_dir = "/home/u32/cchyland/iBelieveFiles/triggerless_original/only_uganda"
# triggerless sample data (change directory for new samples); with header and labeled as "n" (non-belief)
triggerless_sample = "/home/u32/cchyland/iBelieveFiles/triggerless_sample/only_uganda"
# triggerless samples actually used (the down-sampled negatives are written here for record keeping)
triggerless_samples_used = "/home/u32/cchyland/iBelieveFiles/triggerless_samples_used/only_uganda"
# save models here (change dir for newly-trained models); per-fold misclassification files are written here too:
models_dir = "/xdisk/msurdeanu/cchyland/models"


In [None]:
# create a sample of triggerless examples to use; if don't have too many, just use all of them (frac = 1.0)
# Every raw triggerless file is read without a header, given column names, labeled "n",
# sampled, and written (with a header) to `triggerless_sample` under the same file name.
# percentage of triggerless sentences to sample if there are too many triggerless examples available:
frac = 1.0
random_seed = 22

# this is what annotated column is called in the annotated data files---use the same names to assign annotations
# to sampled negative examples:
annotations_column = "annotation: b (belief or attitude), n (not a belief and not an attitude)"

# TODO: to experiment with different sizes of triggerless examples, need to sample in some other way because it should
# be based on the number of annotated examples, e.g., 4 times annotated examples;
# could oversample and then sample from there?..

# make sure the destination exists so the first write does not fail
os.makedirs(triggerless_sample, exist_ok=True)
# sorted() keeps the processing order independent of the filesystem's listing order
for file in sorted(listdir(triggerless_original_dir)):
    f_path = os.path.join(triggerless_original_dir, file)
    # skip sub-directories (e.g. .ipynb_checkpoints) that read_csv cannot open
    if not os.path.isfile(f_path):
        continue
    print(f_path)
    temp_df = pd.read_csv(f_path, sep='\t', header=None, on_bad_lines="skip")
    print(len(temp_df))
    # naming triggerless docs columns for easier use later
    temp_df.columns = ["file", "na", "sentence", "trigger", "na", "paragraph","na"]
    # every triggerless row is a negative ("n") example
    temp_df[annotations_column] = "n"
    sampled_df = temp_df.sample(frac=frac, random_state=random_seed).reset_index(drop=True)
    sampled_df.to_csv(os.path.join(triggerless_sample, file), sep="\t")
    

In [None]:
# annotated_data = os.path.join(project_dir, "annotated_as_of_dec13_both_uganda_and_rice")
# annotated_data

In [None]:
# df = pd.read_csv(os.path.join(annotated_data, "Subtask1-MainTask-double-annotation-prep-as-of-Nov2.tsv"), sep="\t")

In [None]:
# df.head()

In [None]:
# load annotated data: every *.tsv file in `annotated_data`, keeping only rows with no missing values
annotated_columns = [
    "paragraph",
    "mention text (just a few words around the trigger)",
    "trigger",
    "sentence",
    annotations_column,
]
annotated_frames = []
# sorted() makes the row order (and therefore the later de-duplication and CV splits)
# independent of the filesystem's listing order
for file in sorted(listdir(annotated_data)):
    print(file)
    if file.endswith("tsv"):
        f_path = os.path.join(annotated_data, file)
        temp_df = pd.read_csv(f_path, sep='\t', usecols=annotated_columns).dropna()
        print(len(temp_df))
        annotated_frames.append(temp_df)

# concatenate once instead of re-copying the accumulated frame for every file;
# fall back to an empty frame when no .tsv file was found
adf = pd.concat(annotated_frames) if annotated_frames else pd.DataFrame()

In [None]:
# preview the first rows of the annotated data
adf.head()

In [None]:
# Trim surrounding whitespace from every sentence, then keep the first row for each
# distinct (sentence, mention text) pair.
adf["sentence"] = [sentence.strip() for sentence in adf["sentence"].tolist()]
dedup_keys = ["sentence", "mention text (just a few words around the trigger)"]
adf = adf.drop_duplicates(subset=dedup_keys)

In [None]:
# number of annotated rows left after de-duplication
len(adf)

In [None]:
# count the annotated sentences labeled "b"
anns = adf[annotations_column]
b_count = int((anns == "b").sum())

# percentage of sentences annotated as beliefs (among all annotated)
b_count / len(adf)

In [None]:
# load negative examples sampled
negative_columns = ["paragraph", "trigger", "sentence", annotations_column]
negative_frames = []
# sorted() makes the row order (and therefore the seeded sample drawn later) reproducible
for file in sorted(listdir(triggerless_sample)):
    f_path = os.path.join(triggerless_sample, file)
    # skip sub-directories (e.g. .ipynb_checkpoints) that read_csv cannot open
    if not os.path.isfile(f_path):
        continue
    temp_df = pd.read_csv(f_path, sep='\t', usecols=negative_columns)
    # a row without a sentence cannot be stripped or used as a training example
    negative_frames.append(temp_df.dropna(subset=["sentence"]))

# concatenate once; fall back to an empty frame when the directory holds no files
ndf = pd.concat(negative_frames) if negative_frames else pd.DataFrame()

ndf["sentence"] = [s.strip() for s in ndf["sentence"]]
ndf = ndf.drop_duplicates(subset = ["sentence"])
len(ndf)

In [None]:
# preview the first rows of the de-duplicated triggerless (negative) examples
ndf.head()

In [None]:
# Ratio of triggerless (negative) examples to annotated (trigger-ed) examples.
# The requested amount is capped by the number of triggerless examples that are available,
# in which case all of them are used.
neg_example_multiplier = 2
n_neg_examples_to_use = len(adf) * neg_example_multiplier
if n_neg_examples_to_use > len(ndf):
    n_neg_examples_to_use = len(ndf)
n_neg_examples_to_use

In [None]:
# take the neg example sample, write it to a file for record keeping, and read it back in
sample_file_name = os.path.join(triggerless_samples_used, f"triggerless_sample_{neg_example_multiplier}_times_triggered_size")
# set to False to reuse a previously written sample instead of drawing a new one
rewrite_sample = True
if path.exists(sample_file_name) and not rewrite_sample:
    print("exists")
else:
    print("new sample")
    # make sure the record-keeping directory exists before writing into it
    os.makedirs(triggerless_samples_used, exist_ok=True)
    sampled_ndf = ndf.sample(n=n_neg_examples_to_use, random_state=random_seed).reset_index(drop=True)
    sampled_ndf.to_csv(sample_file_name, index=False, sep="\t")

# in both branches continue from the on-disk copy, so the frame in memory matches the recorded sample
ndf = pd.read_csv(sample_file_name, sep="\t")

print(f"N triggerless examples: {len(ndf)}")

In [None]:
# preview the triggerless sample as read back from disk
ndf.head()

In [None]:
# annotated + sampled triggerless
# Row labels are not renumbered here, so the combined frame can contain repeated index values.
df = pd.concat([adf, ndf])#.reset_index(drop=True)
print(f"Annotated + sampled = {len(df)}")

In [None]:
# the last rows come from the triggerless sample (ndf is concatenated after adf)
df.tail()

In [None]:
# Renumber the rows 0..n-1 so that every row label is unique and df.at addresses exactly one cell.
df.index = range(len(df))

mention_column = "mention text (just a few words around the trigger)"
trigger_context_columns = ["sentence", "paragraph", mention_column]


def mark_trigger(text, trigger):
    """Wrap every occurrence of `trigger` in `text` with <t>...</t>.

    Cells that are not strings (e.g. NaN where a row has no value for the column)
    are returned unchanged instead of raising on `.replace`.
    """
    if not isinstance(text, str):
        return text
    return text.replace(trigger, "<t>" + trigger + "</t>")


# Adding markers to trigger
for i in df.index:
    trigger_text = df.at[i, "trigger"]
    # rows without a trigger are left untouched
    if pd.isna(trigger_text):
        continue
    # already marked (the cell was re-run): do not wrap the markers a second time
    if str(trigger_text).startswith("<t>"):
        continue
    for column in trigger_context_columns:
        df.at[i, column] = mark_trigger(df.at[i, column], trigger_text)
    df.at[i, "trigger"] = "<t>" + trigger_text + "</t>"

# assign numerical labels: 1 for "b" (belief), 0 for everything else
num_of_labels = df[annotations_column].nunique(dropna=False)
if num_of_labels == 2:
    df['label'] = (df[annotations_column] == "b").astype(int).to_numpy()
else:
    print(f"Wrong number of labels: {num_of_labels}")

In [None]:
# preview the combined data after trigger marking and label assignment
df.head()

In [None]:
# Pretrained checkpoint name; used here for the tokenizer and, inside the cross-validation loop,
# for loading the model and naming the saved models.
transformer_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(transformer_name)
# NOTE: for cross validation, the model should be initialized inside the cv loop

In [None]:
def tokenize(batch):
    """Tokenize the 'sentence' entries of a batch with the module-level `tokenizer`.

    Truncation is enabled; the tokenizer's return value is passed through unchanged.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:

def compute_metrics(eval_pred):
    """Compute the evaluation metric reported to the Trainer.

    Args:
        eval_pred: object exposing `label_ids` (gold labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict with the single key 'f1' holding `metrics.f1_score(y_true, y_pred)`.

    Side effects:
        prints the classification report for the evaluated examples.
    """
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    print("report: \n", metrics.classification_report(y_true, y_pred))
    return {'f1': metrics.f1_score(y_true, y_pred)}

In [None]:
# Note: not used right now, but can be
# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classification head.

    The head is applied to the hidden state at position 0 of the sequence axis
    (the [CLS] position for standard BERT inputs).
    """

    def __init__(self, config):
        """Build the encoder, dropout and classifier from `config` and initialize the weights."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and classify each sequence.

        Returns a SequenceClassifierOutput whose `loss` is the cross-entropy between the
        logits and `labels`, or None when no labels are given. Extra keyword arguments
        are forwarded to the encoder.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        first_position_state = encoder_out.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_position_state))

        if labels is None:
            loss = None
        else:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )


In [None]:
# this is for creating cross-validation folds
def get_sample_based_on_idx(data, indeces):
    """Return the rows of `data` at the given positional indices as a renumbered frame.

    The previous row labels are kept as a leading 'index' column (reset_index without drop).
    """
    selected_rows = data.iloc[indeces, :]
    renumbered = selected_rows.reset_index()
    return renumbered

In [None]:
# sanity check (displays True/False): the label column holds exactly the two classes 0 and 1
0 in set(df["label"]) and 1 in set(df["label"]) and len(list(set(df["label"]))) == 2

In [None]:
# use sentences as text; TODO: can add wrapping for trigger 
df["text"] = df["sentence"]
# how much of the data to use (narrow the slice to limit the number of rows when debugging);
# df[:] keeps every row
df = df[:]


In [None]:
# just checking the df looks right before training (should now include the 'label' and 'text' columns)
df.head()

In [None]:
# defining hyperparams
# NOTE(review): this single TrainingArguments object is passed to the Trainer of every
# cross-validation fold, so all folds write their checkpoints under the same output_dir —
# confirm that is intended.
num_epochs = 20
batch_size = 16
weight_decay = 0.01
training_args = TrainingArguments(
    output_dir="./results_triggerless", # is this location in the tmp dir? 
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    weight_decay=weight_decay,
    load_best_model_at_end=True, # this is supposed to make sure the best model is loaded by the trainer at the end
    # presumably the 'f1' value returned by compute_metrics, reported with an "eval_" prefix — TODO confirm
    metric_for_best_model="eval_f1" 
    )

In [None]:
# 5-fold cross-validation. For every fold: fine-tune a newly loaded classifier on the training
# split, score it on the held-out split, save the model, and write the misclassified sentences
# to a TSV for error analysis. Progress is echoed to the notebook and appended to original.txt.
# NOTE: `output` is deliberately left open at the end of this cell; a later cell writes to it
# and closes it.
output = open("original.txt", "a")


def log_line(message):
    """Print `message` and append it, newline-terminated, to the run log."""
    print(message)
    output.write(f"{message}\n")
    # flush so the log is usable even if the run is interrupted
    output.flush()


# columns dropped from the datasets after tokenization;
# 'index' is the column added by get_sample_based_on_idx
columns_to_remove = ['index', 'sentence', 'trigger', annotations_column, 'paragraph']

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_df_idx, eval_df_idx) in enumerate(kfold.split(df)):
    log_line(f"FOLD: {fold}")

    train_df = get_sample_based_on_idx(df, train_df_idx)
    log_line(f"LEN DF: {len(train_df)}")
    log_line("done train df")
    eval_df = get_sample_based_on_idx(df, eval_df_idx)
    log_line("done eval df")
    log_line(f"LEN EVAL: {len(eval_df)}")

    train_ds = Dataset.from_pandas(train_df).map(
        tokenize, batched=True, remove_columns=columns_to_remove,
    )
    eval_ds = Dataset.from_pandas(eval_df).map(
        tokenize, batched=True, remove_columns=columns_to_remove,
    )

    # the model is loaded anew for every fold so that no fine-tuned weights carry over between
    # folds; the tokenizer does not change, so the one loaded earlier in the notebook is reused
    model = AutoModelForSequenceClassification.from_pretrained(transformer_name, num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
    )
    trainer.train()

    # after training, predict on the held-out fold
    # (training_args sets load_best_model_at_end=True, so presumably the best checkpoint is used)
    preds = trainer.predict(eval_ds)
    final_preds = np.argmax(preds.predictions, axis=-1)
    gold_labels = eval_df["label"].to_numpy()
    # sklearn's signature is f1_score(y_true, y_pred)
    real_f1 = metrics.f1_score(gold_labels, final_preds)
    log_line(f"F-1: {real_f1}")

    model_name = f"{transformer_name}-best-of-fold-{fold}-f1-{real_f1}"
    model_dir = os.path.join(models_dir, model_name)
    trainer.save_model(model_dir)

    # one row per misclassified evaluation sentence
    is_wrong = final_preds != gold_labels
    new_df = pd.DataFrame({
        "sentence": eval_df["sentence"].to_numpy()[is_wrong],
        "real": gold_labels[is_wrong],
        "predicted": final_preds[is_wrong],
    })

    # labels are 1 for "b" and 0 otherwise, so a wrong prediction of 0 is a false negative
    # and a wrong prediction of 1 is a false positive
    count_f_n = int((is_wrong & (final_preds == 0)).sum())
    count_f_p = int(is_wrong.sum()) - count_f_n
    log_line(f"n of false pos: {count_f_p}")
    log_line(f"n of false neg: {count_f_n}")

    new_df.to_csv(os.path.join(models_dir, "false_annotations_" + str(fold) + ".tsv"), sep="\t")
        

In [None]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator once training is done.
torch.cuda.empty_cache()

In [None]:
# Show the CUDA memory statistics in the notebook and append them to the run log,
# then close the log file opened by the cross-validation cell.
memory_summary = torch.cuda.memory_summary(device=None, abbreviated=False)
print(memory_summary)
# interpolate the summary text; a plain string would write the braces and expression verbatim
output.write(f"{memory_summary}\n")

output.close()