## KoELECTRA 불러오기

In [32]:
# Install the notebook's dependencies.
# The version specifier below is quoted on purpose: unquoted, the shell parses
# `>` as output redirection, so pip would install an unconstrained accelerate
# and write its output to a file named "=0.20.1".
!pip install transformers
!pip install "accelerate>=0.20.1"
!pip install transformers[torch]
!pip install accelerate transformers[torch]
!pip install gradio



In [None]:
# Source code: KoELECTRA fine-tuning pipeline (disabled).
# The entire cell body is wrapped in a triple-quoted string, so running this
# cell only evaluates a string literal; none of the code below executes.
# The quoted code mounts Google Drive, reads the CSV, splits the 'Transcript' /
# 'Label' columns 80/20 into train/test, fine-tunes
# ElectraForSequenceClassification (num_labels=2) with Trainer, and saves the
# model and tokenizer under save_path.
# NOTE(review): the quoted code tokenizes with max_length=256, while the
# inference cell further down tokenizes with max_length=512 -- confirm the
# mismatch is intended before re-enabling this cell.
'''
# 0. 필요한 라이브러리 import
import pandas as pd
import numpy as np
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
import torch
from google.colab import drive
drive.mount('/content/drive')

# 1. 토크나이저 설정
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

# 2. 데이터셋 설정
df_93 = pd.read_csv("/content/drive/MyDrive/KorCCViD_v1.3_fullcleansed.csv")
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_93['Transcript'], df_93['Label'], test_size=0.2, random_state=42)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=256)

class KoELECTRADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = KoELECTRADataset(train_encodings, list(y_train))
test_dataset = KoELECTRADataset(test_encodings, list(y_test))

# 3. 모델 설정
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)

# 4. 학습 설정
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

#5. 모델 학습
### 경고: 다시 학습시킬때만 사용합니다.
trainer.train()
from sklearn.metrics import accuracy_score
results = trainer.evaluate()
print(results)

#6. 모델 저장
### 경고: 새로 저장할때만 사용합니다.
save_path = '/content/drive/MyDrive/finalvoice/koelectra_saved_model'
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
'''


In [33]:
# Enter a text and get the predicted label probabilities back

from transformers import ElectraForSequenceClassification, ElectraTokenizer
# Binds only the alias F; this statement does not bind the name `torch`.
import torch.nn.functional as F

# 1. Load the saved model and tokenizer
from google.colab import drive
# Mount Google Drive first: model_path below lives under /content/drive.
drive.mount('/content/drive')
model_path = '/content/drive/MyDrive/finalvoice/koelectra_saved_model'
# Both the tokenizer and the classifier are loaded from the same directory.
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)

def get_prediction_probabilities(text):
    """Return the classifier's label probabilities for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and scored by the module-level ``model`` in evaluation
    mode with gradient tracking disabled.

    Args:
        text: Input handed to the tokenizer (the caller in this cell passes a
            single string read from ``input()``).

    Returns:
        A list of floats: the softmax over the logits of the first sequence
        in the batch, indexed by label id.
    """
    # Imported locally because this cell only runs
    # `import torch.nn.functional as F`, which does not bind the name `torch`.
    # Without this, the call to torch.no_grad() below raises NameError unless
    # some earlier cell happened to import torch.
    import torch

    # Tokenize the text into PyTorch tensors
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Put the model in evaluation mode before scoring
    model.eval()

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the last dimension turns the logits into probabilities;
    # index 0 selects the first (here: only) sequence of the batch.
    return F.softmax(logits, dim=-1)[0].tolist()

# Read one text from the user and score it
text = input("텍스트를 입력하세요: ")
probs = get_prediction_probabilities(text)

# Print the probability of each label index, formatted to four decimals
for i, prob in enumerate(probs):
    print(f"라벨 {i}의 확률: {prob:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
텍스트를 입력하세요: 한번도 출력
라벨 0의 확률: 0.3122
라벨 1의 확률: 0.6878


In [None]:
# Show the predictions through a Gradio interface (disabled).
# The entire cell body is wrapped in a triple-quoted string, so running this
# cell only evaluates a string literal; none of the code below executes.
# The quoted code reloads the saved model and tokenizer from model_path,
# redefines get_prediction_probabilities to return a dict mapping each label
# string to its probability, and serves it via gr.Interface with text input,
# "label" output and live=True.
# NOTE(review): the quoted code calls torch.no_grad() but imports only gradio,
# transformers and torch.nn.functional -- it relies on `torch` being bound by
# another cell; confirm before re-enabling.
'''
import gradio as gr
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch.nn.functional as F

# 저장된 모델과 토크나이저 불러오기
model_path = '/content/drive/MyDrive/finalvoice/koelectra_saved_model'
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)

def get_prediction_probabilities(text):
    # 텍스트를 토크나이징
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # 모델을 평가 모드로 설정
    model.eval()

    # 예측
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 확률을 계산하기 위해 softmax 함수 사용
    probs = F.softmax(logits, dim=-1)

    # 결과를 딕셔너리 형태로 반환
    results = {}
    for i, prob in enumerate(probs[0]):
        results[f"라벨 {i}"] = float(prob)

    return results

# gr.Interface를 사용하여 웹 인터페이스 구성
interface = gr.Interface(fn=get_prediction_probabilities,
                         inputs="text",
                         outputs="label",
                         live=True,
                         title="KoELECTRA 라벨 예측",
                         description="입력된 텍스트의 라벨 예측 확률을 출력합니다.")
interface.launch()
'''