In [19]:
import transformers
from omnixai.data.text import Text
from omnixai.explainers.nlp import ShapText
import shap

In [20]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModel, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset
import torch

In [21]:
DEVICE = 0 if torch.cuda.is_available() else -1

In [27]:
known_df = pd.read_csv("../data/real_and_fake_passages_dataset_test.csv")
unknown_df = pd.read_csv("../detecting-generated-scientific-papers/fake_papers_train_part_public.csv")
translation_df = pd.read_csv("../data/fake_papers_translated.csv")

In [28]:
unknown_df.head()

Unnamed: 0,id,text,fake
0,1,Modern two-dimensional imaging is of such qual...,0
1,2,Background: The optimal sequence of systemic p...,1
2,5,This chapter opens with a discussion of the ef...,1
3,10,The time scale of the ultra-short-term can str...,1
4,23,Electronic nose or machine olfaction are syste...,1


In [25]:
translation_df.head()

Unnamed: 0.1,Unnamed: 0,text,fake
0,0,"In any case, these figures should be taken wit...",1
1,1,"The reasons for using the device, the use of s...",1
2,2,"Therefore, standard thromboprophylaxis should ...",1
3,3,Blood count values ​​showed hematocrit and hem...,1
4,4,Colombian flag with the silhouette of a popula...,1


In [30]:
known_df.head()

Unnamed: 0,passages,fake,type,model,tool
0,a nearby recurrent laryngeal nerve was the oth...,0,real,real,real
1,15 ml of the aqueous extract was transferred ...,0,real,real,real
2,This is a private property of Zed The questi...,1,scigen,scigen,generate
3,n(cb nb ) ) was not of significance ( p = 0.1...,0,real,real,real
4,"for the swiss ball exercise group , the frt re...",0,real,real,real


In [47]:
models = known_df.model.unique()
texts = [
        ('translate', 'google-translate-zh', 'google-translate-zh',
         translation_df[translation_df["fake"] == 1].sample(1, random_state=3)['text'].item()),
        ('unknown', 'unknown', 'unknown', 
         unknown_df[unknown_df["fake"] == 1].sample(1, random_state=3)['text'].item())
]
for model in models:
    sample = known_df[known_df['model'] == model].sample(1, random_state=3)
    texts.append(
         (sample['tool'].item(), sample['type'].item(), sample['model'].item(), sample['passages'].item())
    )

In [50]:
text_inputs = [
    text[3]
    for text in texts
]

In [7]:
model = transformers.pipeline(
    'text-classification',
    model='anon/deberta-v3-large-finetuned-synthetic-multi-class',
    return_all_scores=True
)

The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
`return_all_scores` is now deprecated, use `top_k=1` if you want similar functionnality


In [17]:
x = Text(text_inputs)
explainer = ShapText(model=model)
explanations = explainer.explain(x)
explanations.ipython_plot()

  0%|          | 0/498 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
x = text_inputs
explainer = shap.Explainer(model)
shap_values = explainer(x)