# ======= inference =======
# ======= tokenization setup ======

In [None]:
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer

from tokenizers import(
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from transformers import PreTrainedTokenizerFast

from datasets import Dataset
from tqdm.auto import tqdm
from tokenizers import processors

In [None]:
# Read the test essays from the working directory. Other cells in this
# notebook read the `text` column (tokenization) and the `id` column
# (submission file) of this frame.
test = pd.read_csv(filepath_or_buffer='test_essays.csv')

In [None]:
# test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [None]:
# Tokenizer settings used by the tokenizer-building cell:
#   VOCAB_SIZE - vocabulary size handed to the BPE trainer.
#   LOWERCASE  - whether a lowercasing normalizer is added.
VOCAB_SIZE = 30_522
LOWERCASE = False

In [None]:
# Build a byte-level BPE tokenizer from scratch, train it on the essays in
# `test`, and wrap it as a Hugging Face fast tokenizer.
#
# NOTE(review): the vocabulary is learned from `test` in this run, so the
# token ids it produces are specific to this data. Confirm they match the
# ids the model checkpoint loaded later in this notebook was fine-tuned with.

# Initialise the tokenizer with an untrained BPE model.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalizer: always NFC, plus lowercasing only when LOWERCASE is set.
# The conditional is parenthesised on purpose: without the parentheses the
# `if ... else` applies to the whole list concatenation, so LOWERCASE = False
# yields an empty sequence and NFC is silently dropped as well.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)

# Pre-tokenizer: byte-level.
# Original author's note (not verified here): pass add_prefix_space=False
# to get rid of the "Ġ" marker character.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# BPE trainer configured with the target vocabulary size and special tokens.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Load the texts into a Dataset and train the tokenizer on them.
# Original author's note: the training set was df_train_essays_final[['text']].
dataset = Dataset.from_pandas(test[['text']])


def train_corp_iter():
    """Yield the `text` column of `dataset` in slices of up to 1000 rows."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer as a PreTrainedTokenizerFast.
# RobertaTokenizerFast is not used in this cell; the import is kept in case
# other cells rely on it.
from transformers import RobertaTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
# Tokenize every essay in `test` with the same fixed-length settings and
# collect the resulting id / mask arrays into a DataFrame.
encode_options = dict(
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='np',
)

test_input_ids, test_attention_masks = [], []
for text in tqdm(test['text'].tolist(), desc="토큰화 진행중"):
    encoded = tokenizer.encode_plus(text, **encode_options)
    # Each call encodes one essay; index [0] takes that essay's row.
    test_input_ids.append(encoded['input_ids'][0])
    test_attention_masks.append(encoded['attention_mask'][0])

# Convert the tokenized data to a pandas DataFrame, one row per essay in
# the same order as `test`.
test_inputs = pd.DataFrame(
    {'input_ids': test_input_ids, 'attention_mask': test_attention_masks}
)

## ============== run prediction =================

In [None]:
# Path of the fine-tuned checkpoint that the classifier is loaded from.
model_checkpoint = 'roberta-base-finetuned_v5/checkpoint-2300'

In [None]:
# ls -l /kaggle/input/detect-llm-models/distilroberta-finetuned_v5          # "모델이름"

In [None]:
# model_checkpoint = "/kaggle/input/detect-llm-models/roberta-base-finetuned_v5/checkpoint-49654"        #"모델이름"

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# def preprocess_function(examples):
#     return tokenizer(examples['text'], max_length = 512 , padding=True, truncation=True)

In [None]:
# Load the two-label sequence classifier from `model_checkpoint` and move it
# to the first CUDA device when one is available, otherwise keep it on CPU.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Rebinding the name (instead of a bare expression) also keeps the notebook
# from echoing the model repr as cell output.
model = model.to(device)

In [None]:
# Run the classifier over the tokenized essays, turn the logits into
# probabilities, and write the submission file.
# NOTE: `trainer` is rebound here; earlier in the notebook the same name
# holds the BPE trainer used to fit the tokenizer.
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)
test_dataset = Dataset.from_pandas(test_inputs)
# test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_dataset)
logits = test_preds.predictions

# Softmax over the class axis. Subtracting the row-wise maximum first leaves
# the result mathematically unchanged but keeps np.exp from overflowing to
# inf (and the division from producing NaN) when a logit is large.
shifted = logits - np.max(logits, axis=-1, keepdims=True)
exp_shifted = np.exp(shifted)
# NOTE(review): column 0 is what gets submitted as `generated`. Confirm that
# class index 0 is the "generated" label in the data the checkpoint was
# fine-tuned on; if the label was the 0/1 `generated` flag, this should be
# column 1.
probs = (exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True))[:, 0]

# Assemble the submission: one row per test essay, keyed by its id.
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs
sub.to_csv('submission.csv', index=False)
sub.head()