In [1]:
import os

# Restrict this process to GPU 0.
# NOTE(review): this is assigned before torch is imported further down,
# presumably so that CUDA only ever sees that one device -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
import json
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import f1_score,roc_auc_score,precision_score,recall_score

In [4]:
import torch
from scipy.special import softmax
import numpy as np

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
def sent_scoring(model,tokenizer, text, cuda=True):
    """Run ``text`` through ``model`` and return the resulting loss as a float.

    The text is encoded with ``tokenizer.encode``, given a leading batch axis
    of size one, and passed to ``model`` both as the input and as ``labels``.
    The first element of the model output is returned via ``.item()``.

    Args:
        model: Callable model; must not be ``None``.
        tokenizer: Object exposing ``encode(text)``; must not be ``None``.
        text: The string to score.
        cuda: When true, the encoded ids are moved to the ``'cuda'`` device
            before the forward pass.

    Returns:
        float: ``loss.item()``.  NOTE(review): this is a loss value rather
        than a probability; for Hugging Face LM heads it is presumably the
        mean per-token cross-entropy -- confirm against the model docs.

    Raises:
        AssertionError: If ``model`` or ``tokenizer`` is ``None``.
    """
    assert model is not None
    assert tokenizer is not None

    token_ids = tokenizer.encode(text)
    batch = torch.tensor(token_ids).unsqueeze(0)  # add batch axis (size 1)
    if cuda:
        batch = batch.to('cuda')

    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        outputs = model(batch, labels=batch)

    # The model output starts with (loss, logits); only the loss is used.
    loss, _logits = outputs[:2]
    return loss.item()

In [7]:
# Data location. `path` is the folder that the later cells read their input
# files from and write the feature file to.
# NOTE(review): `root_path` is None here, so the os.path.join call below
# raises TypeError until a real directory is filled in -- presumably the
# original value was removed before sharing; confirm.
root_path = None
dataset = 'PubMed'  # sub-folder name; also checked when the records are parsed
path = os.path.join(root_path,dataset)

In [8]:
# Read the raw experiment file; every line is parsed as one JSON record later.
aid_file = os.path.join(path, 'exp-data-AID.txt')
with open(aid_file) as handle:
    lines = handle.readlines()

In [9]:
# Load the per-word table that compute_entropy looks tokens up in.
with open(os.path.join(path, 'word_entropy.json')) as handle:
    word_prob = json.loads(handle.read())

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pretrained "gpt2-large" tokenizer and causal-LM model, and move
# the model to the GPU.
GPT2_CHECKPOINT = "gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT).cuda()

In [11]:
from nltk.tokenize import word_tokenize
import string
def normalize(text):
    """Re-tokenize ``text`` and join the tokens back with uniform spacing.

    Every token returned by ``word_tokenize`` is appended to the result:
    a token found in ``string.punctuation`` is attached directly to the
    preceding text, any other token is preceded by a single space.

    Args:
        text: The string to normalize.

    Returns:
        str: The re-joined text, without the separator space that would
        otherwise precede a leading word.
    """
    pieces = []
    for token in word_tokenize(text):
        # NOTE: this is a substring test against the punctuation string, so a
        # multi-character token only matches when it occurs contiguously in it.
        if token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(' ' + token)
    joined = ''.join(pieces)
    # Remove only the separator space added before a leading word.  An
    # unconditional ``joined[1:]`` would also delete the first character when
    # the text starts with a punctuation token (e.g. "(" or "["), because no
    # space is inserted in front of punctuation.
    if joined.startswith(' '):
        joined = joined[1:]
    return joined

In [12]:
# Parse each line of the data file as JSON and normalize the four abstract
# variants together with their 'sum_'-prefixed lists.
ABSTRACT_KEYS = ['abstract','gen_abstract','pol_abstract','mix_abstract']

RawTexts = []
for line_no, raw_line in enumerate(lines):
    # Line index 121 of the 'News' dataset is skipped (reason not recorded here).
    if dataset == 'News' and line_no == 121:
        continue
    record = json.loads(raw_line.strip())
    for key in ABSTRACT_KEYS:
        record[key] = normalize(record[key])
        summary_key = 'sum_' + key
        record[summary_key] = [normalize(sentence) for sentence in record[summary_key]]
    RawTexts.append(record)

In [13]:
# Score every field of every record with the model/tokenizer currently bound
# to `model` and `tokenizer`.  String fields get a single score; any other
# field is iterated and gets one score per element.
GPTScores = []
for record in RawTexts:
    record_scores = {}
    for field, value in record.items():
        if type(value) is str:
            record_scores[field] = sent_scoring(model, tokenizer, value)
        else:
            record_scores[field] = [
                sent_scoring(model, tokenizer, sentence) for sentence in value
            ]
    GPTScores.append(record_scores)

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pretrained "t5-large" tokenizer and seq2seq model and move the
# model to the GPU.  `tokenizer` and `model` are rebound here, so any cell
# executed afterwards that references them uses T5.
T5_CHECKPOINT = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT).cuda()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
# Score every field of every record with the model/tokenizer currently bound
# to `model` and `tokenizer`, printing each record's scores as they are
# produced.  String fields get a single score; any other field is iterated
# and gets one score per element.
T5Scores = []
for record in RawTexts:
    record_scores = {}
    for field, value in record.items():
        if type(value) is str:
            record_scores[field] = sent_scoring(model, tokenizer, value)
        else:
            record_scores[field] = [
                sent_scoring(model, tokenizer, sentence) for sentence in value
            ]
    T5Scores.append(record_scores)
    print(record_scores)

{'title': 1.1487032175064087, 'abstract': 8.646973609924316, 'gen_abstract': 8.68810749053955, 'pol_abstract': 8.392799377441406, 'mix_abstract': 8.663036346435547, 'sum_abstract': [0.6413389444351196, 7.0657501220703125, 6.832719326019287, 2.2395620346069336, 4.726503849029541, 1.1080888509750366, 7.233928680419922, 2.526122570037842], 'sum_gen_abstract': [0.6833982467651367, 5.911873817443848, 0.6059079170227051, 0.6613081693649292, 0.3962560296058655, 1.9816300868988037, 7.109287738800049, 0.5226378440856934], 'sum_pol_abstract': [0.7039114832878113, 7.727961540222168, 0.4307384788990021, 0.49967852234840393, 4.021182537078857, 5.616705894470215, 8.354650497436523, 3.2472939491271973], 'sum_mix_abstract': [1.5419045686721802, 7.848028182983398, 0.6286421418190002, 0.5367681384086609, 3.0207173824310303, 4.06066370010376, 8.523361206054688, 5.372523784637451]}
{'title': 1.2586432695388794, 'abstract': 6.7399001121521, 'gen_abstract': 11.461606979370117, 'pol_abstract': 8.268567085266

Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors


{'title': 0.7771096229553223, 'abstract': 8.672086715698242, 'gen_abstract': 7.877549648284912, 'pol_abstract': 8.281562805175781, 'mix_abstract': 13.027090072631836, 'sum_abstract': [0.43444395065307617, 5.381487846374512, 0.5524150729179382, 0.5683428645133972, 4.689258098602295, 0.4471871256828308, 5.381587028503418, 0.4354362189769745], 'sum_gen_abstract': [0.6795082092285156, 8.323989868164062, 0.6121733784675598, 0.582872211933136, 0.3409592807292938, 0.39405664801597595, 8.253957748413086, 0.5158154368400574], 'sum_pol_abstract': [2.8543643951416016, 8.419272422790527, 0.575287938117981, 0.6969205737113953, 6.286447525024414, 0.5768299698829651, 7.962246417999268, 0.5204514265060425], 'sum_mix_abstract': [0.4256700575351715, 10.470601081848145, 1.8061447143554688, 0.7067472338676453, 5.040590763092041, 6.227184295654297, 8.869048118591309, 5.932440280914307]}
{'title': 1.6230441331863403, 'abstract': 6.5786824226379395, 'gen_abstract': 5.084085941314697, 'pol_abstract': 7.938205

In [16]:
def f(key,score,):
    """Return the mean summary score minus the raw score for ``key``.

    Args:
        key: Name of the raw entry; its summary scores are read from
            ``score['sum_' + key]``.
        score: Mapping containing both ``score[key]`` and
            ``score['sum_' + key]``.

    Returns:
        ``np.mean`` of the sorted summary scores minus ``score[key]``.
    """
    baseline = score[key]
    values = np.array(score['sum_'+key])
    order = values.argsort()
    ordered = values[order]
    # The values are sorted so that extremes could be trimmed before
    # averaging (e.g. ordered[2:-2]); no trimming is applied at present.
    return np.mean(ordered) - baseline

In [17]:
def compute_entropy(text):
    """Average the per-word values from ``word_prob`` over the tokens of ``text``.

    Tokens that are not keys of ``word_prob`` contribute the fallback value
    ``word_prob['[Sepcial Token] min_ent_log'] - 10*np.log(10)``.

    Args:
        text: String to tokenize with ``word_tokenize``.

    Returns:
        The sum of the per-token values divided by the number of tokens.

    Raises:
        ZeroDivisionError: If ``text`` yields no tokens.
    """
    tokens = word_tokenize(text)
    total = 0
    for token in tokens:
        if token not in word_prob:
            # Fallback for unknown tokens.  The key is reproduced verbatim,
            # including its spelling; presumably it matches the loaded
            # JSON table -- confirm.
            total += word_prob['[Sepcial Token] min_ent_log'] - 10*np.log(10)
            continue
        total += word_prob[token]
    return total/len(tokens)

In [20]:
# Compute the entropy feature for every field of every record.  String
# fields get a single value; any other field is iterated and gets one value
# per element.
Entropy = []
for record in RawTexts:
    record_entropy = {}
    for field, value in record.items():
        if type(value) is str:
            record_entropy[field] = compute_entropy(value)
        else:
            record_entropy[field] = [compute_entropy(sentence) for sentence in value]
    Entropy.append(record_entropy)

In [21]:
# Bundle the normalized texts and the per-record features for export.
Features = {
    'raw_text': RawTexts,
    'entropy': Entropy,
    'GPTConf': GPTScores,
    'T5Conf': T5Scores,
}

In [22]:
# Serialize all features as one JSON document in the same folder the input
# files are read from.
# The file handle is deliberately not named `f`: binding `f` here would
# overwrite the helper function f(key, score) defined in an earlier cell.
with open(os.path.join(path,'exp-data-AID-feature.json'),'w') as out_file:
    out_file.write(json.dumps(Features))