In [2]:
import os
os.environ["CUDA`_VISIBLE_DEVICES"] = "0"

In [3]:
import json
import numpy as np

In [4]:
root_path = '/home/gene/Documents/Senti/Fathom/AI-Detector/ExampleData/'
dataset = 'PubMed'

In [5]:
# Load Data
path = os.path.join(root_path,dataset)
with open(os.path.join(path,'exp-data-AID-feature.json')) as f:
    Features = json.load(f)

In [6]:
# Data Preprocessing
Keys = ['abstract','gen_abstract','pol_abstract','mix_abstract']
feature_names = ['GPTConf','T5Conf','entropy',]
raw_x = np.zeros((len(Features['entropy'])*4,9*3))
raw_y = np.zeros((len(Features['entropy'])*4,))
index = np.zeros((len(Features['entropy'])*4,))
                 
for i in range(len(Features['entropy'])):
    if i ==363:
        continue
    for j in range(len(Keys)):
        raw_y[4*i+j] = min(j,1)
        index[4*i+j] = j
        for k in range(len(feature_names)):
            raw_x[4*i+j,9*k] = Features[feature_names[k]][i][Keys[j]]
            raw_x[4*i+j,9*k+1:9*k+9] = np.array(Features[feature_names[k]][i]['sum_'+Keys[j]])

In [7]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [8]:
def metric(y,haty):
    s = precision_score(y,haty),recall_score(y,haty),f1_score(y,haty), f1_score(y,haty,average='micro'),f1_score(y,haty,average='macro'),roc_auc_score(y,haty)
    return s

In [9]:
from sklearn.metrics import *
def evaluation(model,test_images,test_labels):
    model = model.eval()
    bz = 32
    NUM = int(np.ceil(len(test_images)/bz))
    ys = []
    for i in range(NUM):
        s,e = i*bz,(i+1)*bz
        if e>len(test_images):
            e = len(test_images)
        x = test_images[s:e]
        x = torch.FloatTensor(x).cuda()
        y_hat = model(x)
        y_hat = y_hat.detach().to('cpu').numpy()
        ys.append(y_hat)
    ys = np.concatenate(ys,axis=0)
    logit = ys.argmax(axis=-1)
    label = test_labels

    
    return metric(label,logit)

In [9]:
class Model(nn.Module):   
    def __init__(self,feature_num):
        super(Model, self).__init__()        
        self.linear_layers = nn.Sequential(
            #nn.LazyLinear(512),
           nn.Linear(feature_num, feature_num), #41472 25088
           nn.ReLU(inplace=True),
            nn.Linear(feature_num, feature_num),
            nn.ReLU(inplace=True),
            nn.Linear(feature_num, feature_num),
            nn.ReLU(inplace=True),
            nn.Linear(feature_num, feature_num),
            nn.ReLU(inplace=True),
            nn.Linear(feature_num, 2),
        )
    
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x,y=None):
        hat_y = self.linear_layers(x)
        if not y ==None:
            loss = self.loss_fn(hat_y, y)
            return loss,hat_y
        else:
            return hat_y

In [10]:
def make_data(m,sum_num = 8,used_feature = ['Conf','entropy','lengths'],):
    feature_names = ['gptb_Conf','t5l_Conf','entropy']
    sum_feature = (1+sum_num)*len(used_feature)

    valid_features = []
    for k in range(len(feature_names)):
        if feature_names[k] in used_feature:
            valid_features.append(9*k)
            valid_features += [9*k+1+j for j in range(sum_num)]
    valid_features = np.array(valid_features)

    x = raw_x[index<m]
    y = raw_y[index<m]
    x = np.transpose(x,(1,0))
    x = x[valid_features]
    x = np.transpose(x,(1,0))
    return x,y

In [11]:
def split_data(x,y,i=0,k=2,n=5):
    num = len(x)//k
    bias = len(x)//(k*n)
    assert i>=0 and i<n
    s = bias*i
    e = bias*i + num
    train_x = x[s:e]
    train_y = y[s:e]
    test_x = np.concatenate([x[:s],x[e:]])
    test_y = np.concatenate([y[:s],y[e:]])
    return train_x,train_y,test_x,test_y

In [12]:
def train(m,sum_num,used_feature,k = 2,n = 5):
    x,y = make_data(m,sum_num,used_feature)
    scores = []
    bz = 16

    for i in range(n):
        train_x,train_y,test_x,test_y = split_data(x,y,i,k,n)
        model = Model(x.shape[1])
        model = model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
        for t in range(150):
            for j in range(int(np.ceil(len(train_x)/bz))):
                s = j*bz
                ed = s+bz
                ed = min(ed,len(train_x))
                x0 = train_x[s:ed]
                y0 = train_y[s:ed]
                x0 = torch.FloatTensor(x0).cuda()
                y0 = torch.LongTensor(y0).cuda()
                loss, y_hat = model(x0,y0)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        s = evaluation(model,test_x,test_y)
        scores.append(s)
        print(s)
        
    scores = np.array(scores)
    return scores

In [13]:
K = 8
Setting = 'Setting-3'
used_feature = ['T5Conf','GPTConf','entropy']

print('precision, recall,f1, micro_f1, macro_f1, auc')
s = {'Setting-1':2,'Setting-2':3,'Setting-3':4}[Setting]
scores = train(s,K,used_feature)

(0.8333333333333334, 0.9388646288209607, 0.8829568788501027, 0.8131147540983606, 0.7097711223518806, 0.6865375775683751)
(0.7902097902097902, 0.9890590809628009, 0.8785228377065113, 0.7950819672131149, 0.6120362879632033, 0.6023726777363024)
(0.7854671280276817, 0.9912663755458515, 0.8764478764478765, 0.7901639344262295, 0.5903978512674165, 0.5877384509308204)
(0.7491803278688525, 1.0, 0.8566073102155577, 0.7491803278688525, 0.42830365510777885, 0.5)
(0.7508196721311475, 1.0, 0.8576779026217228, 0.7508196721311475, 0.4288389513108614, 0.5)
