In [None]:
import random
import json
import numpy as np
import torch
import math
from nltk.tokenize import word_tokenize, sent_tokenize
import csv
import time

from data_loader import get_dataset_df
from run_sa import get_dataset
from rouge_score import rouge_scorer
from SA.args import get_model_args


import pandas as pd
import random

from nltk.parse.corenlp import CoreNLPParser

# CoreNLP client pointed at a server expected on localhost:9000.
# In the cells visible here it is only handed to the notebook-local get_dataset(),
# which accepts the argument but does not use it.
parser = CoreNLPParser('http://localhost:9000', encoding="utf-8")


## Dataset loader

In [None]:
# Start from the argument object returned by SA.args.get_model_args() and override
# the dataset-related fields for this session.
sa_args = get_model_args()

sa_args.dataset_name = 'liar'
# NOTE(review): both paths are absolute and specific to one machine's home directory;
# they must be edited before this cell can run anywhere else.
sa_args.sentence_path = "/Users/jolly/PycharmProjects/COPENLU/FilteredData/sup_sccores_liar/results_serialized_val_filtered.jsonl"
sa_args.dataset_path = "/Users/jolly/PycharmProjects/COPENLU/liar_data/ruling_oracles_val.tsv"




## Read SA output file

In [None]:
# Read the two SA output files, one line per example. The context managers close
# each handle as soon as its lines are loaded instead of leaving them open.
with open('../outs/liar_sup_val.txt', 'r') as val_file:
    val = val_file.readlines()
with open('../outs/liar_sup_val_filter.txt', 'r') as val_filter_file:
    val_filter = val_filter_file.readlines()
dataset = get_dataset(sa_args)


# Walk through the examples side by side. zip() stops at the shortest of the three
# sequences, so any surplus lines/rows are silently not shown.
for v, vf, org in zip(val, val_filter, dataset):
    print("--------")
    print("SA-input: ", org['scored_sentences'])
    print("--------")
    print("SA-gold: ", org['justification'])
    print("--------")
    print("SA-output: ", v)
    print("--------")
    print("SA-output+pegasus: ", vf)
    # Block until the user presses Enter before showing the next example.
    input()



## CleanSents using Paraphrase Tools

In [None]:
import torch
import language_tool_python

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_transformers import SentenceTransformer, util

# Grammar checker used by gramatical_tool().
tool = language_tool_python.LanguageTool('en-US')

# Pegasus paraphrase checkpoint; placed on the GPU when one is available.
# NOTE: `tokenizer` and `model` are rebound to GPT-2 objects in the GPT-Debugging
# section further down, so this cell must be re-run before get_response() is used
# again after that section has executed.
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

# Sentence encoder used by sentence_level_semantic_scorer_sbert().
model_sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Beam-search settings passed to get_response(): beam width and how many
# candidates to return per input sentence.
num_beams = 10
num_return_sequences = 10

def get_response(input_text,num_return_sequences,num_beams):
    """Generate candidate rewrites of `input_text` with the module-level seq2seq model.

    Relies on the globals ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: a single string; it is tokenised as a batch of one and
            truncated to 60 tokens.
        num_return_sequences: how many generated sequences to return.
        num_beams: beam width handed to ``model.generate``.

    Returns:
        The decoded generated sequences as a list of strings, special tokens removed.
    """
    encoded_input = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    # NOTE(review): temperature is forwarded as-is; sampling is not requested here,
    # so it presumably has no influence on the beams -- confirm against generate().
    generated_ids = model.generate(
        **encoded_input,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def gramatical_tool(sent):
    """Return `sent` after applying the corrections LanguageTool proposes for it.

    Uses the module-level ``tool`` to find the issues and
    ``language_tool_python.utils.correct`` to apply them.
    """
    detected_issues = tool.check(sent)
    corrected_sentence = language_tool_python.utils.correct(sent, detected_issues)
    return corrected_sentence

def sentence_level_semantic_scorer_sbert(org, rep):
    """Score position-wise pairs of two sentence lists by embedding cosine similarity.

    Both lists are encoded with the module-level ``model_sbert``; the i-th entry of
    `org` is compared with the i-th entry of `rep`. Pairing is done with ``zip``,
    so entries beyond the shorter list are ignored.

    Returns:
        A ``torch.FloatTensor`` built from the per-pair similarity values.
    """
    first_embeddings = model_sbert.encode(org)
    second_embeddings = model_sbert.encode(rep)

    pair_similarities = []
    for first_vec, second_vec in zip(first_embeddings, second_embeddings):
        pair_similarities.append(util.pytorch_cos_sim(first_vec, second_vec))
    return torch.FloatTensor(pair_similarities)


In [None]:
# Sample text that new_imp() rewrites sentence by sentence.
# NOTE(review): it appears to be a reordered/garbled variant of `org` below -- confirm.
a = "''If you add all that stuff up, it accounts for less than 1 percent of the overall package. Now, that doesn't mean that the package can't improve and that's what I said to the leadership last night, 'Let's improve it. Sen. Charles Grassley and other Republicans say they have less problem shovel-ready projects in the proposed stimulus plan with the proposed tax cuts and where the money can be spent in two years. In fact, in an opinion piece for the Wall Street Journal , Coburn laid out his case that the overwhelming majority of Obama's plan is not stimulative. . For example, Sen. Tom Coburn and his staff put together a widely circulated list of more than 30 items written by Coburn ."

# Reference text; new_imp() only prints it for visual comparison.
org = "''If you add all that stuff up, it accounts for less than 1 percent of the overall package. Now, that doesn't mean that the package can't improve and that's what I said to the leadership last night, 'Let's improve it. Sen. Charles Grassley and other Republicans say they have less problem with the proposed tax cuts and shovel-ready projects where the money can be spent in two years. In fact, in an opinion piece written by Coburn for the Wall Street Journal , Coburn laid out his case that the overwhelming majority of Obama's plan is not stimulative. Hundreds of millions for contraceptives. For example, Sen. Tom Coburn and his staff put together a widely circulated list of more than 30 items in the proposed stimulus plan that he considers wasteful."

def new_imp(a, org):
    """Rewrite `a` sentence by sentence and print it next to `org` with the elapsed time.

    Each sentence of `a` is grammar-corrected with ``gramatical_tool``, expanded
    into candidates with ``get_response`` (using the module-level
    ``num_return_sequences`` and ``num_beams``), and replaced by the candidate that
    ``sentence_level_semantic_scorer_sbert`` rates as most similar to the
    corrected sentence. Sentences made of a single space-separated token are
    copied through unchanged.

    Args:
        a: text to rewrite.
        org: reference text; only printed for comparison.

    Returns:
        None. The reference, the input, the rewritten text and the timing are printed.
    """
    rewritten_sentences = []
    start_time = time.time()
    for sentence in sent_tokenize(a):
        # Nothing to paraphrase in a one-token "sentence" (e.g. a stray ".").
        if len(sentence.split(" ")) == 1:
            rewritten_sentences.append(sentence)
            continue

        sentence = gramatical_tool(sentence)
        candidates = get_response(sentence, num_return_sequences, num_beams)
        # Compare every candidate with the corrected sentence. The reference list is
        # sized from the candidates actually returned (previously a hard-coded 10),
        # so the pairing stays complete when num_return_sequences is changed.
        sim = sentence_level_semantic_scorer_sbert(candidates, [sentence] * len(candidates))
        rewritten_sentences.append(candidates[torch.argmax(sim)])
    print(org)
    print("--------")
    print(a)
    print("--------")
    print(" ".join(rewritten_sentences))
    print("Time: ", time.time()-start_time)

# Run the rewrite demo on the sample texts defined above; results are printed.
new_imp(a, org)



## GPT-Debugging

In [None]:
import torch
import math

from torch.nn import CrossEntropyLoss
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# GPT-2 tokenizer and LM head model, kept on CPU and switched to eval mode;
# scorer_batch() below reads both through these global names.
# NOTE: this rebinds `tokenizer` and `model`, which the paraphrase section also
# uses for its Pegasus objects -- get_response() will pick up GPT-2 after this
# cell runs unless the Pegasus setup cell is executed again.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to("cpu").eval()

In [None]:
def scorer_batch(sentences):
    """Print language-model loss diagnostics for a batch of sentences.

    Uses the module-level ``tokenizer`` and ``model``. Printed, in order: the
    target tokens of the first sentence, the per-token cross-entropy matrix, the
    mask of scored positions, the per-sentence loss sums, the per-sentence number
    of scored tokens, the mean loss per sentence, and finally
    ``100 * exp(-mean loss)``.

    Args:
        sentences: list of strings, tokenised together as one padded batch.

    Returns:
        None; every result is printed.
    """
    #Gpt for fluency
    # Reuse EOS as the padding token so the batch can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    tensor_input = {k: v.to("cpu") for k, v in encoded.items()}
    input_ids = tensor_input["input_ids"]
    attention_mask = tensor_input["attention_mask"]

    # Exclude padded positions from the loss. The attention mask is used rather
    # than the pad id: pad and EOS share an id here, so an id-based mask would
    # also drop genuine EOS tokens while the length denominator still counts them.
    lm_labels = input_ids.detach().clone()
    lm_labels[attention_mask == 0] = -100

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        return_dict=True)

    # Position t predicts token t+1, so drop the last logit and the first label.
    lm_logits = outputs.logits
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = lm_labels[..., 1:].contiguous()

    print([tokenizer._convert_id_to_token(i) for i in shift_labels.tolist()[0]])

    loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')  # give CE loss at each word generation step
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    # Back to one row per sentence; ignored positions carry a loss of 0.
    token_loss = loss.reshape(-1, shift_labels.shape[-1])
    token_mask = attention_mask[..., 1:].contiguous()

    print(token_loss)
    print(token_mask)

    sentence_loss = token_loss.sum(dim=1)
    sentence_length = token_mask.sum(dim=1)
    print(sentence_loss)
    print(sentence_length)
    print(sentence_loss / sentence_length)

    print("\nFinal:", 100 * torch.exp(- 1 * (sentence_loss / sentence_length)))
    
# Single-sentence smoke test for scorer_batch(); the diagnostics are printed.
sents = ['I bought bananas, apples, and orange juice from the supermarket.']
scorer_batch(sents)

## Data Prep SA input

In [None]:
def clean_str(sent):
    """Replace typographic punctuation in `sent` with plain-ASCII stand-ins.

    Curly single quotes become ``'`` / `````, a straight double quote becomes two
    single quotes, em and en dashes become ``--`` and the ellipsis character
    becomes ``...``. All other characters are left untouched.
    """
    # One pass with a translation table; no replacement text contains a character
    # that is itself replaced, so this matches applying the substitutions in turn.
    punctuation_table = str.maketrans({
        "’": "'",
        "‘": "`",
        '"': "''",
        "—": "--",
        "…": "...",
        "–": "--",
    })
    return sent.translate(punctuation_table)

def get_dataset(scored_sentences_path, dataset_path, dataset_name, top_n, parser):
    """Join a dataset with its top-scored sentences, print a summary, return the rows.

    NOTE(review): this definition rebinds the name ``get_dataset`` imported from
    ``run_sa`` at the top of the notebook; the earlier ``get_dataset(sa_args)``
    cell only works before this cell has been executed.

    Args:
        scored_sentences_path: JSON-lines file. Each record is read for ``"id"``
            and ``"sentence_scores"``, a list whose entries hold the sentence
            text at index 0 and its score at index 1.
        dataset_path: tab-separated dataset file; its first column is the index.
        dataset_name: ``'liar_plus'`` or ``'pub_health'``.
        top_n: number of highest-scoring sentences kept per record.
        parser: unused; kept so existing calls keep working.

    Returns:
        A list of row dicts restricted to rows that have scored sentences (for
        ``'liar_plus'`` the row with id ``'2001.json'`` is excluded as well).
        Earlier versions returned ``None``.

    Raises:
        ValueError: if `dataset_name` is not one of the supported names.
    """
    if dataset_name not in ('liar_plus', 'pub_health'):
        # Fail early instead of hitting an unbound `df` further down.
        raise ValueError(
            f"Unsupported dataset_name {dataset_name!r}; expected 'liar_plus' or 'pub_health'"
        )

    df = pd.read_csv(dataset_path, sep='\t', index_col=0)
    df = df.dropna()

    if dataset_name == 'liar_plus':
        columns = ['dummy', 'id', 'statement', 'justification',
               'ruling_without_summary', 'label', 'just_tokenized',
               'ruling_tokenized', 'statement_tokenized', 'oracle_ids']
        print(df.columns)
        print(columns)
        id_column, text_column = 'id', 'justification'
        kept_columns = ['id', 'statement', 'justification', 'label', 'scored_sentences',
                        'justification_sentences']
    else:
        columns = ['claim_id', 'claim', 'date_published', 'explanation',
                   'fact_checkers', 'main_text', 'sources', 'label', 'subjects']
        # Some exports carry one extra leading column.
        if len(df.columns) == 10:
            columns = ['dummy'] + columns
        id_column, text_column = 'claim_id', 'explanation'
        kept_columns = ['claim_id', 'claim', 'explanation', 'label', 'scored_sentences',
                        'justification_sentences']
    df.columns = columns

    # Read the scores with a context manager so the handle is closed; blank
    # lines are skipped rather than handed to json.loads.
    with open(scored_sentences_path) as scores_file:
        score_records = [json.loads(line) for line in scores_file if line.strip()]

    # id -> the top_n sentences (highest score first) joined into one cleaned string.
    scored_sentences = {}
    for record in score_records:
        ranked = sorted(record['sentence_scores'], key=lambda x: x[1], reverse=True)[:top_n]
        scored_sentences[record["id"]] = clean_str(" ".join(entry[0] for entry in ranked))

    if dataset_name == 'pub_health':
        # Ids in the scores file are matched as strings.
        df['claim_id'] = df['claim_id'].astype('str')

    df['scored_sentences'] = df[id_column].map(scored_sentences.get)
    # `!= None` compares elementwise and is true for every row, so it never
    # filtered anything; notna() really drops rows without scored sentences.
    df = df[df['scored_sentences'].notna()]
    # assign() returns a new frame, avoiding a write into the filtered slice.
    df = df.assign(justification_sentences=df[text_column].map(sent_tokenize))
    df = df[kept_columns]

    dataset = df.to_dict('records')
    if dataset_name == 'liar_plus':
        #Sentence in Liarplus is too long:
        new_dataset = [row for row in dataset if row["id"] != '2001.json']
    else:
        new_dataset = list(dataset)

    print(f'Size of dataset: {len(dataset)}')
    print(f'Size of new dataset: {len(new_dataset)}')
    if dataset:
        print('Sample: ', dataset[0])
    if len(new_dataset)!=0:
        print('Sample: ', new_dataset[0])

    return new_dataset

# Inputs for the two supported datasets; the paths are relative to the notebook's
# working directory. The *1 variants belong to the PUBHEALTH test split.
scored_sentences_path = "../../DATA-COPE-Project-DIKUServer/unsup_scores_liar/sentence_scores_val.jsonl" #Each line is a json
scored_sentences_path1 = "../../DATA-COPE-Project-DIKUServer/unsup_scores_pubhealth/sentence_scores_test.jsonl"

dataset_path = "../../liar_data/ruling_oracles_val.tsv"
dataset_path1 = "../../DATA-COPE-Project-DIKUServer/PUBHEALTH/test.tsv"

# Keep the result in a name instead of leaving the bare call as the cell's last
# expression, so the cell output stays limited to what get_dataset() prints.
pub_health_dataset = get_dataset(scored_sentences_path1, dataset_path1, 'pub_health', 6, parser)
#get_dataset(scored_sentences_path, dataset_path, 'liar_plus', 6, parser)
