In [None]:
import random
import json
import numpy as np
import torch
import math
from nltk.tokenize import word_tokenize, sent_tokenize
import csv
import time
from tqdm import tqdm
import spacy

from data_loader import get_dataset_df
from run_sa import get_dataset
from rouge_score import rouge_scorer
from SA.args import get_model_args

import pandas as pd
import random

from nltk.parse.corenlp import CoreNLPParser

In [None]:
# Client for a CoreNLP server; the server must already be listening on this URL.
parser = CoreNLPParser(url='http://localhost:9000', encoding="utf-8")

## Dataset loader

In [None]:
sa_args = get_model_args()

sa_args.dataset_name = 'liar'
sa_args.sentences_path = "/Users/jolly/PycharmProjects/COPENLU/Data_HE/liar/results_serialized_test_filtered.jsonl"
sa_args.dataset_path = "/Users/jolly/PycharmProjects/COPENLU/Data_HE/liar/ruling_oracles_test.tsv"
dataset = get_dataset(sa_args)

# Each line of liar_sup_test.txt has the form: sa_inp + '\t' + sa_out.
# Lines are kept verbatim (trailing newline included), exactly as before;
# the context managers only make sure the file handles get closed.
with open('/Users/jolly/PycharmProjects/COPENLU/Data_HE/liar/liar_sup_test.txt', 'r') as sa_file:
    SA = list(sa_file)
with open('/Users/jolly/PycharmProjects/COPENLU/Data_HE/liar/liar_sup_test_filter.txt', 'r') as sa_pm_file:
    SA_PM = list(sa_pm_file)

## Data prep for Task1

In [None]:
random.seed(420)  # - To save data for task 1
final_data = []

# Pair every dataset entry with its SA line (sa_inp \t sa_out) and its SA_PM line.
for org, sa, sa_pm in zip(dataset, SA, SA_PM):

    claim = org["statement"]
    veracity_label = org["label"]
    sa_fields = sa.split("\t")  # split once instead of once per field
    sa_inp = sa_fields[0]
    sa_out = sa_fields[1]

    line_data = {
        "claim": claim,
        "label": veracity_label,
        "sa_inp": sa_inp,
        "sa_out": sa_out,
        "sa_pm": sa_pm
    }

    final_data.append(line_data)

# Draw 40 items and give them 1-based ids in sampling order.
final_data_40 = []
for idx, line in enumerate(random.sample(final_data, 40), start=1):
    line["id"] = idx
    final_data_40.append(line)

# Use a context manager so the JSON is flushed and the handle closed.
with open("he_data_liar.json", "w") as out_file:
    json.dump(final_data_40, out_file, indent=2)
print(len(final_data_40))

## Data prep for Task2

In [None]:
# Accumulator for the Task 2 items; filled by the data-prep cell below.
new_final_data = list()

In [None]:
#To save data for task2

# Previously this cell had to be edited by hand (seed + justification field)
# and re-run three times to fill `new_final_data` with 3 x 20 items, so a
# single top-to-bottom run left only 20 items and the 60-item sample in the
# next cell failed. The three configurations are now produced in one pass,
# using the same seed per justification type as the original comments.
# NOTE(review): the "sa_inp" / "sa_out" just_type labels follow the pattern of
# the original "sa_pm" label - confirm they match what downstream code expects.
task2_configs = [
    (20, "sa_inp"),  # seed 20 - task 2 sa_inps
    (30, "sa_out"),  # seed 30 - task 2 sa_outs
    (40, "sa_pm"),   # seed 40 - task 2 sa_pm justs
]

# Reset so that re-running this cell does not keep appending to the pool.
new_final_data = []

for seed, just_type in task2_configs:
    random.seed(seed)

    final_data = []
    for org, sa, sa_pm in zip(dataset, SA, SA_PM):

        claim = org["statement"]
        veracity_label = org["label"]
        sa_fields = sa.split("\t")
        justifications = {
            "sa_inp": sa_fields[0],
            "sa_out": sa_fields[1],
            "sa_pm": sa_pm,
        }

        line_data = {
            "claim": claim,
            "label": veracity_label,
            "just": justifications[just_type],
            "just_type": just_type
        }

        final_data.append(line_data)

    # 20 items per justification type.
    new_final_data.extend(random.sample(final_data, 20))
print(len(new_final_data))

In [None]:
# random.sample would raise a terse ValueError on a short pool; fail with an
# actionable message instead (same exception type).
if len(new_final_data) < 60:
    raise ValueError(
        f"Task 2 needs 60 pooled items but new_final_data has {len(new_final_data)}; "
        "run the Task 2 data-prep cell for every justification type first."
    )

# Shuffle the pooled items and give them 1-based ids in the shuffled order.
final_data_60 = []
for idx, line in enumerate(random.sample(new_final_data, 60), start=1):
    line["id"] = idx
    final_data_60.append(line)

# Use a context manager so the JSON is flushed and the handle closed.
with open("he_data_liar_task2.json", "w") as out_file:
    json.dump(final_data_60, out_file, indent=2)
print(len(final_data_60))

## Read SA output file

## CleanSents using Paraphrase Tools

In [None]:
import torch
import language_tool_python

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_transformers import SentenceTransformer, util

# Grammar checker (used by gramatical_tool) and spaCy pipeline (used by post_process).
nlp = spacy.load("en_core_web_sm")
tool = language_tool_python.LanguageTool('en-US')

# Pegasus paraphrase model and its tokenizer, placed on GPU when one is available.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'
model_name = 'tuner007/pegasus_paraphrase'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

# Sentence encoder used to score paraphrase candidates against their source sentence.
model_sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Generation settings passed to get_response() by pegasus().
num_return_sequences = 10
num_beams = 10

def get_response(input_text,num_return_sequences,num_beams):
    """Generate paraphrase candidates for a single input text.

    Uses the module-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: Text to paraphrase; tokenised with truncation at 60 tokens.
        num_return_sequences: Number of candidates requested from ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded candidates (special tokens skipped), as returned by
        ``tokenizer.batch_decode``.
    """
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    generated_ids = model.generate(
        **encoded,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def gramatical_tool(sent):
    """Return ``sent`` with the corrections suggested by the module-level ``tool`` applied."""
    return language_tool_python.utils.correct(sent, tool.check(sent))

def sentence_level_semantic_scorer_sbert(org, rep):
    """Score aligned pairs of texts by cosine similarity of their SBERT embeddings.

    Both inputs are encoded with the module-level ``model_sbert``; the i-th
    embedding of ``org`` is compared with the i-th embedding of ``rep``
    (``zip`` stops at the shorter of the two).

    Returns:
        A ``torch.FloatTensor`` built from the per-pair similarities.
    """
    org_embeds = model_sbert.encode(org)
    rep_embeds = model_sbert.encode(rep)

    similarities = []
    for org_vec, rep_vec in zip(org_embeds, rep_embeds):
        similarities.append(util.pytorch_cos_sim(org_vec, rep_vec))
    return torch.FloatTensor(similarities)

def post_process(text):
    """Grammar-correct ``text`` sentence by sentence and drop verbless sentences.

    Each sentence from ``sent_tokenize`` is passed through ``gramatical_tool``;
    the corrected sentence is kept only if spaCy tags at least one of its
    tokens as ``VERB`` or ``AUX``.

    Returns:
        The kept (corrected) sentences joined with single spaces.
    """
    kept = []
    for sentence in sent_tokenize(text):
        corrected = gramatical_tool(sentence)
        has_verb = any(tok.pos_ in ('VERB', 'AUX') for tok in nlp(corrected))
        if has_verb:
            kept.append(corrected)

    return " ".join(kept)

def pegasus(text):
    """Paraphrase ``text`` sentence by sentence, keeping the closest candidate.

    For every sentence from ``sent_tokenize``:
      * a sentence consisting of a single space-separated token is kept as is;
      * otherwise ``get_response`` generates candidates (using the module-level
        ``num_return_sequences`` / ``num_beams``) and the candidate with the
        highest ``sentence_level_semantic_scorer_sbert`` similarity to the
        original sentence is kept.

    Returns:
        The selected sentences joined with single spaces.
    """
    paraphrased = []
    for sentence in sent_tokenize(text):
        if len(sentence.split(" ")) == 1:
            paraphrased.append(sentence)
            continue

        candidates = get_response(sentence, num_return_sequences, num_beams)
        # Compare every candidate with the source sentence. The reference list
        # is sized from the candidates actually returned (it used to be a
        # hard-coded 10, which silently mis-scored any other candidate count).
        sim = sentence_level_semantic_scorer_sbert(candidates, [sentence] * len(candidates))
        paraphrased.append(candidates[int(torch.argmax(sim))])

    return " ".join(paraphrased)
    


In [None]:
# Load the Task 1 items (context manager closes the file handle).
with open("he_data_liar.json") as in_file:
    outs = json.load(in_file)

# Add the post-processed SA output and its Pegasus paraphrase to every item.
for line in tqdm(outs):
    line['sa_pp'] = post_process(line["sa_out"])
    line['sa_pegasus'] = pegasus(line["sa_pp"])
    

In [None]:
# Persist the post-processed items: the next cell reads this file, so leaving
# the dump commented out made the notebook depend on a file it never wrote.
with open('new_postprocess_liar.json', 'w') as out_file:
    json.dump(outs, out_file)

In [None]:
import json
# Load the saved post-processing results (context manager closes the handle).
with open("new_postprocess_liar.json") as in_file:
    outs = json.load(in_file)

# Show every pipeline stage of each item, one item at a time.
for line in outs:
    for key in ("sa_inp", "sa_out", "sa_pp", "sa_pegasus"):
        print(key, line[key])
        print("-----------")
    input()  # wait for Enter before showing the next item

sa_inp But 90 percent of them, 90 percent, do not then show up in court later.'' That's substantially lower than the 90 percent figure Flake cited. ''And then what the record shows is that they're told to appear later in court, where their case will be adjudicated. In this context, Sen. Jeff Flake, R-Ariz., offered a notable statistic about the judicial treatment of people who arrive at the U.S. border. Between 2003 and 2012, the percentage of all immigrants who failed to appear in court after being released has bounced between 20 percent and 40 percent, settling in at about 30 percent at the end of that time span. This is a related and notable issue, but somewhat different from what Flake or Goodlatte said.
-----------
sa_out But 90 percent of them, 90 percent, do not show up in court.'' That's substantially lower than the 90 percent figure Flake cited. ''And what the record shows about the judicial treatment of people who arrive at the U.S. border is that they're told to appear in co


sa_inp The release claims that Koster ''fell silent'' on investigating the website Backpage.com in 2011, after receiving ''over $12,000 in campaign contributions'' from a legal and lobbying firm representing the site, SNR Denton. Backpage, a free classifieds website, has been criticized for promoting prostitution and helping facilitate human trafficking. According to the campaign release, Koster ceased any efforts against Backpage following donations from SNR Denton. Backpage has yet to comply. Koster's office referenced this letter. The letter asked attorneys general to sign on to a letter urging Congress to amend the Communications Decency Act of 1996.
-----------
sa_out The letter The release in 2011 claims that Koster ''fell silent'' on investigating the website Backpage.com, after receiving ''over $12,000 in campaign contributions'' from a legal and lobbying firm, SNR Denton. Backpage, , has been criticized for promoting prostitution and helping facilitate human trafficking. Acco

In [None]:
JustFact: Generating fact-checking explanations in unsupervised settings

Pipeline - Sentences from RC's --> SA --> post-processing to remove grammatical errors --> Pegasus to make it more concise

Human Eval - Using two justifications - pipeline inp (SA inp) & pipeline out (pegasus out)

## GPT-Debugging

In [None]:
import torch
import math

from torch.nn import CrossEntropyLoss
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# GPT-2 tokenizer and language model (CPU, eval mode) used by scorer_batch().
# NOTE: this rebinds the module-level `tokenizer` and `model` that the Pegasus
# setup cell defined and that get_response() reads; re-run that setup cell
# before paraphrasing again.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to("cpu")
model = model.eval()

In [None]:
def scorer_batch(sentences):
    """Debug helper: print GPT-2 per-token losses and a fluency score per sentence.

    Uses the module-level ``tokenizer`` and ``model``. The sentences are
    tokenised as one padded batch; padding positions are excluded from the
    loss. Note that the pad token is set to the EOS token, so any EOS id in
    the input is excluded as well.

    Prints, in order: the target tokens of the first sentence, the per-token
    cross-entropy losses, the shifted attention mask, the per-sentence loss
    sums, the per-sentence token counts, the mean loss per sentence, and the
    final score ``100 * exp(-mean loss)``.

    Args:
        sentences: List of strings to score.

    Returns:
        None; everything is printed.
    """
    #Gpt for fluency
    tokenizer.pad_token = tokenizer.eos_token
    tensor_input = {k: v.to("cpu") for k,v in tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').items()}

    # Labels are the input ids with padding replaced by the ignore index.
    lm_labels = tensor_input["input_ids"].detach().clone()
    lm_labels[lm_labels[:, :] == tokenizer.pad_token_id] = -100

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=tensor_input["input_ids"],
                    attention_mask= tensor_input["attention_mask"],
                    return_dict=True)

    # Position t predicts token t+1, so drop the last logit and the first label.
    lm_logits = outputs.logits
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = lm_labels[..., 1:].contiguous()

    print([tokenizer._convert_id_to_token(i) for i in shift_labels.tolist()[0]])

    loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')  # give CE loss at each word generation step
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    # One row of per-token losses per sentence, and the matching token mask.
    log_prob_sum = loss.reshape(-1, shift_labels.shape[-1]) #.sum(dim=1)
    len_sum = tensor_input["attention_mask"][..., 1:].contiguous() #.sum(dim=1)

    #prob_products_per_sample = torch.exp(-1 * (log_prob_sum/len_sum)).cpu()

    print(log_prob_sum)
    #print(log_prob_sum1)
    print(len_sum)

    print(log_prob_sum.sum(dim=1))
    #print(log_prob_sum1)
    print(len_sum.sum(dim=1))
    print(log_prob_sum.sum(dim=1) / len_sum.sum(dim=1))

    print("\nFinal:", 100 * torch.exp(- 1 * (log_prob_sum.sum(dim=1) / len_sum.sum(dim=1))))

    #return (prob_products_per_sample * 100)
    
# Single-sentence smoke test for the fluency scorer.
sents = [
    'I bought bananas, apples, and orange juice from the supermarket.',
]
scorer_batch(sents)

## Data Prep SA input

In [None]:
def clean_str(sent):
    """Replace typographic punctuation in ``sent`` with plain-ASCII equivalents.

    Curly single quotes become ``'`` / `````, a straight double quote becomes
    two single quotes, em and en dashes become ``--`` and the ellipsis
    character becomes ``...``.
    """
    # None of the replacement strings contains a character that is itself
    # replaced, so a single translate pass matches chained replace() calls.
    replacements = {
        "’": "'",
        "‘": "`",
        '"': "''",
        "—": "--",
        "…": "...",
        "–": "--",
    }
    return sent.translate(str.maketrans(replacements))

def get_dataset(scored_sentences_path, dataset_path, dataset_name, top_n, parser):
    """Join a fact-checking dataset with its top-scored sentences.

    NOTE: this definition shadows the ``get_dataset`` imported from ``run_sa``
    at the top of the notebook, which the dataset-loader cell calls with a
    single ``sa_args`` argument.

    Args:
        scored_sentences_path: JSON-lines file; every line is an object with an
            ``"id"`` and a ``"sentence_scores"`` list of ``[sentence, score]``
            pairs.
        dataset_path: Tab-separated dataset file whose first column is the index.
        dataset_name: ``'liar_plus'`` or ``'pub_health'``.
        top_n: Number of highest-scored sentences to keep per item.
        parser: Unused; kept so existing calls keep working.

    Returns:
        A list with one dict per dataset row that has scored sentences (for
        ``'liar_plus'`` the row with id ``'2001.json'`` is skipped as well).
        Each dict holds the id, claim, justification text, label, the top
        sentences joined into one cleaned string (``scored_sentences``) and
        the tokenised justification (``justification_sentences``).

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    # Per-dataset names of the id / claim / justification-text columns.
    schemas = {
        'liar_plus': ('id', 'statement', 'justification'),
        'pub_health': ('claim_id', 'claim', 'explanation'),
    }
    if dataset_name not in schemas:
        raise ValueError(f"Unsupported dataset_name: {dataset_name!r}")
    id_col, claim_col, text_col = schemas[dataset_name]

    df = pd.read_csv(dataset_path, sep='\t', index_col=0)
    df = df.dropna()

    if dataset_name == 'liar_plus':
        columns = ['dummy', 'id', 'statement', 'justification',
               'ruling_without_summary', 'label', 'just_tokenized',
               'ruling_tokenized', 'statement_tokenized', 'oracle_ids']
        print(df.columns)
        print(columns)
    else:
        columns = ['claim_id', 'claim', 'date_published', 'explanation',
                   'fact_checkers', 'main_text', 'sources', 'label', 'subjects']
        # Some exports carry one extra leading column.
        if len(df.columns) == 10:
            columns = ['dummy'] + columns
    df.columns = columns

    # Read the scores with a context manager so the handle is closed.
    with open(scored_sentences_path) as scores_file:
        scored_items = [json.loads(line) for line in scores_file]

    # id -> the top_n highest-scored sentences joined into one cleaned string.
    scored_sentences = {}
    for item in scored_items:
        top_scored = sorted(item['sentence_scores'], key=lambda x: x[1], reverse=True)[:top_n]
        scored_sentences[item["id"]] = clean_str(" ".join(sent[0] for sent in top_scored))

    if dataset_name == 'pub_health':
        df['claim_id'] = df['claim_id'].astype('str')

    df['scored_sentences'] = df[id_col].map(scored_sentences.get)
    # `df[col] != None` is always True element-wise in pandas, so the old
    # filter kept every row; notna() really drops rows without scores.
    df = df[df['scored_sentences'].notna()].copy()
    df['justification_sentences'] = df[text_col].map(sent_tokenize)
    df = df[[id_col, claim_col, text_col, 'label', 'scored_sentences',
             'justification_sentences']]

    dataset = [row.to_dict() for _, row in df.iterrows()]

    if dataset_name == 'liar_plus':
        # Sentence in Liarplus is too long
        new_dataset = [item for item in dataset if item["id"] != '2001.json']
    else:
        new_dataset = list(dataset)

    print(f'Size of dataset: {len(dataset)}')
    print(f'Size of new dataset: {len(new_dataset)}')
    # Guard the samples so an empty result does not raise IndexError.
    if len(dataset)!=0:
        print('Sample: ', dataset[0])
    if len(new_dataset)!=0:
        print('Sample: ', new_dataset[0])

    return new_dataset

# Dataset files (relative paths).
dataset_path = "../../liar_data/ruling_oracles_val.tsv"
dataset_path1 = "../../DATA-COPE-Project-DIKUServer/PUBHEALTH/test.tsv"

# Sentence-score files. #Each line is a json
scored_sentences_path = "../../DATA-COPE-Project-DIKUServer/unsup_scores_liar/sentence_scores_val.jsonl"
scored_sentences_path1 = "../../DATA-COPE-Project-DIKUServer/unsup_scores_pubhealth/sentence_scores_test.jsonl"

#get_dataset(scored_sentences_path, dataset_path, 'liar_plus', 6, parser)
# Trailing semicolon: the call is made for its printed summary only.
get_dataset(
    scored_sentences_path1,
    dataset_path1,
    'pub_health',
    6,
    parser,
);
