In [None]:
import random
import json
import numpy as np
import torch
import math
from nltk.tokenize import word_tokenize, sent_tokenize
import csv
import pandas as pd
import random

from nltk.parse.corenlp import CoreNLPParser

# CoreNLP client pointed at http://localhost:9000.
# NOTE(review): presumably needs a CoreNLP server already listening on that port — confirm before running.
parser = CoreNLPParser('http://localhost:9000', encoding="utf-8")


In [1]:
import torch
import math

from torch.nn import CrossEntropyLoss
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
# Tokenizer for the pretrained "gpt2" checkpoint; used as a global by scorer_batch.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
# Pretrained "gpt2" language model, kept on CPU and switched to eval mode; used as a global by scorer_batch.
model = GPT2LMHeadModel.from_pretrained("gpt2").to("cpu").eval()

In [96]:
def scorer_batch(sentences, verbose=True):
    """Score the fluency of each sentence with GPT-2.

    The score of a sentence is ``100 * exp(-mean token NLL)``, where the mean
    is taken over every token that GPT-2 has to predict (all real tokens
    except the first one, which has no left context).

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        sentences: A string or a list of strings to score as one batch.
        verbose: When True (the default) print the intermediate tensors,
            exactly as the exploratory version of this function did.

    Returns:
        A 1-D float tensor with one score per input sentence. A sentence
        with no predicted token (fewer than two tokens) yields ``nan``
        because its mean is 0/0.
    """
    # GPT-2 ships without a pad token; reuse EOS so that batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    tensor_input = {k: v.to("cpu") for k, v in encoded.items()}

    # Ignore padded positions in the loss. Masking by attention mask rather
    # than by pad id matters because pad id == EOS id here: a genuine EOS token
    # inside the text must still be scored.
    lm_labels = tensor_input["input_ids"].detach().clone()
    lm_labels[tensor_input["attention_mask"] == 0] = -100

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids=tensor_input["input_ids"],
                        attention_mask=tensor_input["attention_mask"],
                        return_dict=True)

    # Position t predicts token t + 1, hence the one-step shift.
    lm_logits = outputs.logits
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = lm_labels[..., 1:].contiguous()

    if verbose:
        print([tokenizer._convert_id_to_token(i) for i in shift_labels.tolist()[0]])

    # reduction='none' yields one CE loss per generation step; ignored steps are 0.
    loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    # Reshaping to the label shape also works when there are zero predicted
    # positions, where reshape(-1, 0) would be ambiguous and raise.
    token_nll = loss.reshape(shift_labels.shape)
    # 1 for every position that contributes to the loss, 0 otherwise, so the
    # numerator and the denominator always cover the same tokens.
    token_mask = (shift_labels != -100).long()

    nll_sum = token_nll.sum(dim=1)
    token_count = token_mask.sum(dim=1)
    mean_nll = nll_sum / token_count
    scores = 100 * torch.exp(-1 * mean_nll)

    if verbose:
        print(token_nll)
        print(token_mask)

        print(nll_sum)
        print(token_count)
        print(mean_nll)

        print("\nFinal:", scores)

    return scores
    
# Smoke check: score one example sentence; scorer_batch prints its intermediate tensors.
sents = ['I bought bananas, apples, and orange juice from the supermarket.']
scorer_batch(sents)

['Ġbought', 'Ġbananas', ',', 'Ġapples', ',', 'Ġand', 'Ġorange', 'Ġjuice', 'Ġfrom', 'Ġthe', 'Ġsupermarket', '.']
tensor([[ 9.1669, 11.3402,  2.1985,  4.7560,  0.5062,  1.7837,  5.7995,  0.4455,
          1.8520,  1.7130,  2.1359,  1.2360]], grad_fn=<ViewBackward>)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([42.9334], grad_fn=<SumBackward1>)
tensor([12])
tensor([3.5778], grad_fn=<DivBackward0>)

Final: tensor([2.7937], grad_fn=<MulBackward0>)


In [87]:
# Same example sentence as the cell that defines scorer_batch, scored again.
sents = ['I bought bananas, apples, and orange juice from the supermarket.']
scorer_batch(sents)

['Ġbought', 'Ġbananas', ',', 'Ġapples', ',', 'Ġand', 'Ġorange', 'Ġjuice', 'Ġfrom', 'Ġthe', 'Ġsupermarket', '.']
tensor([[ 9.1669, 11.3402,  2.1985,  4.7560,  0.5062,  1.7837,  5.7995,  0.4455,
          1.8520,  1.7130,  2.1359,  1.2360]], grad_fn=<ViewBackward>)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([42.9334], grad_fn=<SumBackward1>)
tensor([12])
tensor([3.5778], grad_fn=<DivBackward0>)

Final: tensor([2.7937], grad_fn=<MulBackward0>)


In [69]:
# Shorter variant of the example sentence, to compare its score with the longer one.
sents = ['I bought bananas from the supermarket.']
scorer_batch(sents)

['Ġbought', 'Ġbananas', 'Ġfrom', 'Ġthe', 'Ġsupermarket', '.']

Final: tensor([0.9943], grad_fn=<MulBackward0>)


In [None]:
# Typographic character -> ASCII replacement used by clean_str.
_CLEAN_STR_TABLE = str.maketrans({
    "’": "'",
    "‘": "`",
    '"': "''",
    "—": "--",
    "…": "...",
    "–": "--",
})


def clean_str(sent):
    """Return ``sent`` with curly single quotes, straight double quotes,
    dashes and the ellipsis character replaced by their ASCII spellings."""
    # None of the replacement strings contains a character that is itself
    # replaced, so a single translate pass gives the same result as applying
    # the substitutions one after another.
    return sent.translate(_CLEAN_STR_TABLE)

def _load_top_scored_sentences(scored_sentences_path, top_n):
    """Read a JSONL score file and map each id to its ``top_n`` best sentences.

    Every line is a JSON object with an ``"id"`` and a ``"sentence_scores"``
    list of ``[sentence, score]`` pairs. The ``top_n`` highest scoring
    sentences are joined with single spaces and passed through ``clean_str``.
    """
    top_sentences = {}
    # `with` guarantees the handle is closed once the file has been read.
    with open(scored_sentences_path) as scores_file:
        for line in scores_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            item = json.loads(line)
            ranked = sorted(item['sentence_scores'], key=lambda pair: pair[1], reverse=True)[:top_n]
            top_sentences[item["id"]] = clean_str(" ".join(pair[0] for pair in ranked))
    return top_sentences


def get_dataset(scored_sentences_path, dataset_path, dataset_name, top_n, parser):
    """Join a fact-checking TSV with its pre-computed sentence scores.

    Args:
        scored_sentences_path: JSONL file with one ``{"id", "sentence_scores"}``
            object per line.
        dataset_path: Tab-separated dataset file; its first column is read as
            the index and rows with missing values are dropped.
        dataset_name: ``'liar_plus'`` or ``'pub_health'``.
        top_n: Number of highest scoring sentences kept per instance.
        parser: Unused; kept so existing calls keep working.

    Returns:
        A list with one dict per instance that has scored sentences. Besides
        the id/claim/justification/label columns each dict holds
        ``scored_sentences`` (the joined top sentences) and
        ``justification_sentences`` (the sentence-split gold justification).
        For ``liar_plus`` the instance ``'2001.json'`` is left out.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name not in ('liar_plus', 'pub_health'):
        # Previously this fell through and failed later with a NameError on `df`.
        raise ValueError(f"Unknown dataset_name: {dataset_name!r} (expected 'liar_plus' or 'pub_health')")

    df = pd.read_csv(dataset_path, sep='\t', index_col=0).dropna()

    if dataset_name == 'liar_plus':
        columns = ['dummy', 'id', 'statement', 'justification',
                   'ruling_without_summary', 'label', 'just_tokenized',
                   'ruling_tokenized', 'statement_tokenized', 'oracle_ids']
        print(df.columns)
        print(columns)
        id_col, justification_col = 'id', 'justification'
        kept_columns = ['id', 'statement', 'justification', 'label', 'scored_sentences',
                        'justification_sentences']
    else:
        columns = ['claim_id', 'claim', 'date_published', 'explanation',
                   'fact_checkers', 'main_text', 'sources', 'label', 'subjects']
        # Some exports carry one extra leading column.
        if len(df.columns) == 10:
            columns = ['dummy'] + columns
        id_col, justification_col = 'claim_id', 'explanation'
        kept_columns = ['claim_id', 'claim', 'explanation', 'label', 'scored_sentences',
                        'justification_sentences']
    df.columns = columns

    scored_sentences = _load_top_scored_sentences(scored_sentences_path, top_n)

    if dataset_name == 'pub_health':
        # Ids are looked up as strings for this dataset.
        df = df.assign(claim_id=df['claim_id'].astype('str'))

    # `.assign` builds new frames, so no column is ever set on a filtered view.
    df = df.assign(scored_sentences=df[id_col].map(scored_sentences.get))
    # `df[col] != None` is element-wise always True in pandas and therefore
    # filtered nothing; `.notna()` really drops the rows without scores.
    df = df[df['scored_sentences'].notna()]
    df = df.assign(justification_sentences=df[justification_col].map(sent_tokenize))
    df = df[kept_columns]

    dataset = df.to_dict('records')
    if dataset_name == 'liar_plus':
        # Sentence in Liarplus is too long for this instance.
        new_dataset = [row for row in dataset if row["id"] != '2001.json']
    else:
        new_dataset = list(dataset)

    print(f'Size of dataset: {len(dataset)}')
    print(f'Size of new dataset: {len(new_dataset)}')
    if len(dataset) != 0:
        print('Sample: ', dataset[0])
    if len(new_dataset)!=0:
        print('Sample: ', new_dataset[0])

    return new_dataset

scored_sentences_path = "../../DATA-COPE-Project-DIKUServer/unsup_scores_liar/sentence_scores_val.jsonl" #Each line is a json
scored_sentences_path1 = "../../DATA-COPE-Project-DIKUServer/unsup_scores_pubhealth/sentence_scores_test.jsonl"

dataset_path = "../../liar_data/ruling_oracles_val.tsv"
dataset_path1 = "../../DATA-COPE-Project-DIKUServer/PUBHEALTH/test.tsv"

# Bind the result so the cell does not echo the whole dataset as its output.
pub_health_dataset = get_dataset(scored_sentences_path1, dataset_path1, 'pub_health', 6, parser)
#get_dataset(scored_sentences_path, dataset_path, 'liar_plus', 6, parser)
