In [1]:
import os
# Expose only GPU index 0 to this process. This is set here, before torch is
# imported in a later cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import json
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import f1_score,roc_auc_score,precision_score,recall_score

In [4]:
import torch
from scipy.special import softmax
import numpy as np
# Report whether torch can see a CUDA device (the cell output shows the result).
torch.cuda.is_available()

True

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
def sent_scoring(model,tokenizer, text, cuda=True):
    """Score ``text`` with a language model and return the model's loss.

    The text is encoded with ``tokenizer.encode``, wrapped into a batch of
    size 1, and passed to ``model`` with the same ids supplied as ``labels``.
    The first element of the model output (the loss) is returned as a float.

    NOTE(review): for Hugging Face LM heads this loss is presumably the mean
    token cross-entropy, i.e. lower means "more predictable" -- confirm
    against the model documentation before interpreting the value.

    Args:
        model: Callable accepting ``(input_ids, labels=...)`` whose output's
            first element is a scalar loss tensor.
        tokenizer: Object exposing ``encode(text)`` that returns token ids.
        text: The string to score.
        cuda: If True, the ids are moved to the CUDA device when one is
            available; when no CUDA device is available they stay on the CPU
            instead of raising. If False, the ids always stay on the CPU.

    Returns:
        float: The loss value reported by the model for ``text``.
    """
    assert model is not None
    assert tokenizer is not None
    # Explicit integer dtype: ids must be integers even if the tokenizer
    # returns an empty list (which torch would otherwise infer as float).
    input_ids = torch.tensor(tokenizer.encode(text), dtype=torch.long).unsqueeze(0)  # Batch size 1
    # Only request the GPU when one actually exists, so the default
    # ``cuda=True`` does not crash on a CPU-only machine.
    if cuda and torch.cuda.is_available():
        input_ids = input_ids.to('cuda')
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    # Only the loss is needed; the logits are not used.
    loss = outputs[0]
    return loss.item()

In [9]:
# Project root. The default is the original author's checkout; set the
# AI_DETECTOR_ROOT environment variable to run the notebook from another
# location without editing this cell.
root_path = os.environ.get('AI_DETECTOR_ROOT', '/home/gene/Documents/Senti/AI-Detector')
# Dataset directory, relative to root_path.
dataset = 'ExampleData/PubMed'
# Directory that holds the dataset files read and written by later cells.
path = os.path.join(root_path,dataset)

In [8]:
# Show the full path of the dataset file under `path`.
print(os.path.join(path,'exp-data-AID.txt'))

/home/gene/Documents/Fathom/AI-Detector/ExampleData/PubMed/exp-data-AID.txt


In [10]:
# Read the dataset file into memory as a list with one string per line
# (line endings are kept; they are stripped later when each line is parsed).
aid_file = os.path.join(path, 'exp-data-AID.txt')
with open(aid_file, 'r') as fh:
    lines = list(fh)

In [11]:
# Load the per-word score table used by compute_entropy.
entropy_file = os.path.join(path, 'word_entropy.json')
with open(entropy_file) as fh:
    word_prob = json.load(fh)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and causal language model stored at gpt2_path, and move
# the model to the GPU in the same statement.
gpt2_path = '/data/pretrained/gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(gpt2_path)
model = AutoModelForCausalLM.from_pretrained(gpt2_path).cuda()

In [14]:
from nltk.tokenize import word_tokenize
import string
def normalize(text, tokenize=None):
    """Re-join the tokens of ``text`` with canonical spacing.

    The text is tokenized, then the tokens are concatenated so that a token
    found in ``string.punctuation`` is attached directly to what precedes it,
    while every other token is preceded by a single space.

    Unlike a blind ``s[1:]``, only a leading *space* is removed from the
    result, so text that starts with a punctuation token keeps that token.

    Args:
        text: The string to normalize.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to ``word_tokenize``.

    Returns:
        str: The re-joined text ('' when there are no tokens).
    """
    if tokenize is None:
        tokenize = word_tokenize
    pieces = []
    for token in tokenize(text):
        # Substring test against string.punctuation, as in the original logic.
        if token in string.punctuation:
            pieces.append(token)
        else:
            pieces.append(' ' + token)
    joined = ''.join(pieces)
    # Drop the separator added in front of a leading non-punctuation token;
    # leave leading punctuation untouched.
    if joined.startswith(' '):
        joined = joined[1:]
    return joined

In [15]:
# Parse every dataset line as JSON and normalize the four abstract variants
# together with their 'sum_*' lists of texts.
# NOTE(review): the guard below only fires when dataset == 'News'; with
# dataset set to 'ExampleData/PubMed' it never skips anything -- confirm
# whether the comparison value is still what is intended.
ABSTRACT_FIELDS = ('abstract', 'gen_abstract', 'pol_abstract', 'mix_abstract')

RawTexts = []
for line_no, raw_line in enumerate(lines):
    if line_no == 121 and dataset == 'News':
        continue
    record = json.loads(raw_line.strip())
    for field in ABSTRACT_FIELDS:
        record[field] = normalize(record[field])
        summary_field = 'sum_' + field
        record[summary_field] = [normalize(piece) for piece in record[summary_field]]
    RawTexts.append(record)

In [17]:
# Quick inspection: number of parsed records, the first record, and its keys.
print(len(RawTexts))
print(RawTexts[0])
print(RawTexts[0].keys())

305
{'title': 'expression and regulation of cav3_2 t-type calcium channels during inflammatory_hyperalgesia in mouse dorsal root ganglion neurons', 'abstract': 'the cav3_2 isoform of the t-type calcium channel is expressed in primary sensory neurons of the dorsal root ganglion( drg), and these channels contribute to nociceptive and neuropathic pain in rats. however, there are conflicting reports on the roles of these channels in pain processing in rats and mice. in addition, the function of t-type channels in persistent inflammatory_hyperalgesia is poorly understood. we performed behavioral and comprehensive histochemical analyses to characterize cav3_2-expressing drg neurons and examined the regulation of t-type channels in drgs from c57bl/6 mice with carrageenan-induced inflammatory_hyperalgesia. we show that approximately 20% of mouse drg neurons express cav3_2 mrna and protein.', 'gen_abstract': 'the expression and regulation of cav3_2 t-type calcium channels play a crucial role in

In [14]:
# Score every field of every record with the currently loaded model/tokenizer.
# String fields yield a single score; any other field is treated as an
# iterable of texts and yields a list of scores in the same order.
GPTScores = []
for record in RawTexts:
    record_scores = {}
    for field, value in record.items():
        if type(value) is str:
            record_scores[field] = sent_scoring(model, tokenizer, value)
        else:
            record_scores[field] = [
                sent_scoring(model, tokenizer, piece) for piece in value
            ]
    GPTScores.append(record_scores)

In [20]:
import pickle

# Single location for the GPT score checkpoint, used by both steps below.
gpt_ckpt = os.path.join(root_path, 'Checkpoint/GPTScores.pkl')

# save GPT scores
with open(gpt_ckpt, 'wb') as fh:
    pickle.dump(GPTScores, fh)

# load GPT scores (reads back the file written just above)
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(gpt_ckpt, 'rb') as fh:
    GPTScores = pickle.load(fh)

In [15]:
# Show the scores computed for the first record.
print(GPTScores[0])

{'title': 4.398680210113525, 'abstract': 2.8748881816864014, 'gen_abstract': 1.9536079168319702, 'pol_abstract': 2.82426118850708, 'mix_abstract': 3.045504331588745, 'sum_abstract': [3.6705758571624756, 2.816843032836914, 3.0238521099090576, 3.1881492137908936, 3.1471691131591797, 3.0889086723327637, 3.098926305770874, 3.026578903198242], 'sum_gen_abstract': [3.337862968444824, 2.36635684967041, 3.208001136779785, 3.308825731277466, 2.6987199783325195, 2.246875762939453, 2.201767921447754, 2.9914708137512207], 'sum_pol_abstract': [3.859642267227173, 2.8943827152252197, 3.2574291229248047, 3.4140684604644775, 2.874396324157715, 3.236072540283203, 2.7604641914367676, 3.3002893924713135], 'sum_mix_abstract': [3.4192490577697754, 3.0340332984924316, 3.175586700439453, 3.8069822788238525, 3.350067377090454, 3.4526925086975098, 2.8621349334716797, 3.1607606410980225]}


In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and seq2seq model stored at t5_path and move the model
# to the GPU. This rebinds the notebook-wide `tokenizer` and `model` names,
# so cells run after this one score with T5 rather than the earlier model.
t5_path = '/data/pretrained/t5-large'
tokenizer = AutoTokenizer.from_pretrained(t5_path)
model = AutoModelForSeq2SeqLM.from_pretrained(t5_path).cuda()

In [16]:
# Score every field of every record with the currently loaded model/tokenizer
# and print each record's scores as soon as they are computed.
# String fields yield a single score; any other field is treated as an
# iterable of texts and yields a list of scores in the same order.
T5Scores = []
for record in RawTexts:
    record_scores = {}
    for field, value in record.items():
        if type(value) is str:
            record_scores[field] = sent_scoring(model, tokenizer, value)
        else:
            record_scores[field] = [
                sent_scoring(model, tokenizer, piece) for piece in value
            ]
    T5Scores.append(record_scores)
    print(record_scores)

{'title': 1.148895502090454, 'abstract': 8.646953582763672, 'gen_abstract': 8.688214302062988, 'pol_abstract': 8.392966270446777, 'mix_abstract': 8.663107872009277, 'sum_abstract': [0.6412978172302246, 7.065831184387207, 6.8328680992126465, 2.239459753036499, 4.726569652557373, 1.1080565452575684, 7.233890533447266, 2.526355028152466], 'sum_gen_abstract': [0.6835289597511292, 5.91184139251709, 0.6057877540588379, 0.6612673401832581, 0.39628922939300537, 1.9815936088562012, 7.109105587005615, 0.5224846005439758], 'sum_pol_abstract': [0.7038611769676208, 7.728537082672119, 0.430807501077652, 0.49979856610298157, 4.021157741546631, 5.6166791915893555, 8.354838371276855, 3.2472875118255615], 'sum_mix_abstract': [1.5432173013687134, 7.848160266876221, 0.6285637021064758, 0.5367878675460815, 3.0206305980682373, 4.060458183288574, 8.525507926940918, 5.372742176055908]}
{'title': 1.2251572608947754, 'abstract': 6.739748477935791, 'gen_abstract': 11.462320327758789, 'pol_abstract': 8.2686557769

In [21]:
import pickle

# Location of the T5 score checkpoint.
t5_ckpt = os.path.join(root_path, 'Checkpoint/T5Scores.pkl')

# save T5 scores
with open(t5_ckpt, 'wb') as fh:
    pickle.dump(T5Scores, fh)

# # load T5 scores (disabled; enable to restore from the checkpoint instead)
# with open(t5_ckpt, 'rb') as fh:
#     T5Scores = pickle.load(fh)

In [15]:
# Kept for reference: the original T5 scoring loop, commented out so that the output of that run (shown below) is preserved.
# T5Scores = []
# for i in range(len(RawTexts)):
#     score = {}
#     data = RawTexts[i]
#     for key in data:
#         if type(data[key]) is str:
#             score[key] = sent_scoring(model,tokenizer,data[key])
#         else:
#             score[key] = []
#             for text in data[key]:
#                 score[key].append(sent_scoring(model,tokenizer,text))
#     T5Scores.append(score)
#     print(score)

{'title': 1.1487032175064087, 'abstract': 8.646973609924316, 'gen_abstract': 8.68810749053955, 'pol_abstract': 8.392799377441406, 'mix_abstract': 8.663036346435547, 'sum_abstract': [0.6413389444351196, 7.0657501220703125, 6.832719326019287, 2.2395620346069336, 4.726503849029541, 1.1080888509750366, 7.233928680419922, 2.526122570037842], 'sum_gen_abstract': [0.6833982467651367, 5.911873817443848, 0.6059079170227051, 0.6613081693649292, 0.3962560296058655, 1.9816300868988037, 7.109287738800049, 0.5226378440856934], 'sum_pol_abstract': [0.7039114832878113, 7.727961540222168, 0.4307384788990021, 0.49967852234840393, 4.021182537078857, 5.616705894470215, 8.354650497436523, 3.2472939491271973], 'sum_mix_abstract': [1.5419045686721802, 7.848028182983398, 0.6286421418190002, 0.5367681384086609, 3.0207173824310303, 4.06066370010376, 8.523361206054688, 5.372523784637451]}
{'title': 1.2586432695388794, 'abstract': 6.7399001121521, 'gen_abstract': 11.461606979370117, 'pol_abstract': 8.268567085266

Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors


{'title': 0.7771096229553223, 'abstract': 8.672086715698242, 'gen_abstract': 7.877549648284912, 'pol_abstract': 8.281562805175781, 'mix_abstract': 13.027090072631836, 'sum_abstract': [0.43444395065307617, 5.381487846374512, 0.5524150729179382, 0.5683428645133972, 4.689258098602295, 0.4471871256828308, 5.381587028503418, 0.4354362189769745], 'sum_gen_abstract': [0.6795082092285156, 8.323989868164062, 0.6121733784675598, 0.582872211933136, 0.3409592807292938, 0.39405664801597595, 8.253957748413086, 0.5158154368400574], 'sum_pol_abstract': [2.8543643951416016, 8.419272422790527, 0.575287938117981, 0.6969205737113953, 6.286447525024414, 0.5768299698829651, 7.962246417999268, 0.5204514265060425], 'sum_mix_abstract': [0.4256700575351715, 10.470601081848145, 1.8061447143554688, 0.7067472338676453, 5.040590763092041, 6.227184295654297, 8.869048118591309, 5.932440280914307]}
{'title': 1.6230441331863403, 'abstract': 6.5786824226379395, 'gen_abstract': 5.084085941314697, 'pol_abstract': 7.938205

In [18]:
def f(key,score,):
    """Return the mean of ``score['sum_' + key]`` minus ``score[key]``.

    Args:
        key: Name of the field to compare (e.g. the key of a raw text score).
        score: Mapping holding a scalar under ``key`` and a sequence of
            scalars under ``'sum_' + key``.

    Returns:
        The difference between the mean of the ``'sum_'`` scores and the
        scalar score (a NumPy float).
    """
    baseline = score[key]
    values = np.array(score['sum_'+key])
    # Sort ascending before averaging. A trimmed variant that would drop the
    # two lowest and two highest values (ordered[2:-2]) is disabled.
    order = values.argsort()
    ordered = values[order]
    return np.mean(ordered) - baseline

In [22]:
# Inspect the stored fallback entry and the value derived from it
# (the entry minus 10*ln(10)), which compute_entropy assigns to words that
# are missing from word_prob.
# NOTE(review): the key is spelled 'Sepcial' here; presumably this matches
# the key written into word_entropy.json -- confirm before "fixing" it.
print(word_prob['[Sepcial Token] min_ent_log'])
print(word_prob['[Sepcial Token] min_ent_log'] - 10*np.log(10))

-19.071556771093334


KeyError: '[Sepcial Token] max_ent_log'

In [19]:
def compute_entropy(text):
    """Average per-token score of ``text`` under the global ``word_prob`` table.

    The text is split with ``word_tokenize``. Each token contributes
    ``word_prob[token]`` when present; otherwise it contributes a fallback of
    ``word_prob['[Sepcial Token] min_ent_log'] - 10*np.log(10)``. The sum is
    divided by the number of tokens.

    Args:
        text: The string to score.

    Returns:
        The mean token score, or ``0.0`` when ``text`` yields no tokens
        (the unguarded division would otherwise raise ZeroDivisionError).
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    # Fallback score for out-of-vocabulary tokens; computed once because it
    # does not depend on the token.
    oov_score = word_prob['[Sepcial Token] min_ent_log'] - 10*np.log(10)
    total = sum(word_prob.get(token, oov_score) for token in tokens)
    return total/len(tokens)

In [24]:
# Compute the compute_entropy value for every field of every record.
# String fields yield a single value; any other field is treated as an
# iterable of texts and yields a list of values in the same order.
Entropy = []
for record in RawTexts:
    record_entropy = {}
    for field, value in record.items():
        if type(value) is str:
            record_entropy[field] = compute_entropy(value)
        else:
            record_entropy[field] = [compute_entropy(piece) for piece in value]
    Entropy.append(record_entropy)

In [25]:
# Bundle the normalized texts and the three per-record feature lists under
# the keys used in the exported feature file.
Features = {'raw_text':RawTexts, 'entropy':Entropy, 'GPTConf':GPTScores, 'T5Conf':T5Scores}

In [26]:
# Write all features as a single JSON document next to the dataset files.
# The handle is named `fh` rather than `f` so that this cell does not rebind
# the module-level function `f` defined earlier in the notebook.
with open(os.path.join(path,'exp-data-AID-feature.json'),'w') as fh:
    json.dump(Features, fh)