## KoELECTRA

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# df_93이라는 데이터가 있다고 가정
df_93 = pd.read_csv("/content/drive/MyDrive/KorCCViD_v1.3_fullcleansed.csv")

In [None]:
# 데이터를 학습 데이터와 테스트 데이터로 나눕니다.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_93['Transcript'], df_93['Label'], test_size=0.2, random_state=42)

In [None]:
!pip install transformers



In [None]:
# 필요한 라이브러리 import
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments

# 1. 토크나이저 설정
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

# 2. 데이터셋 설정
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=256)

import torch

class KoELECTRADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = KoELECTRADataset(train_encodings, list(y_train))
test_dataset = KoELECTRADataset(test_encodings, list(y_test))

In [None]:
!pip install accelerate>=0.20.1
!pip install transformers[torch]
!pip uninstall accelerate transformers
!pip install accelerate transformers[torch]

In [None]:
# 3. 모델 설정
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)

# 4. 학습 설정
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# 경고: 다시 학습시킬때만 사용합니다.
'''
trainer.train()
'''
# 경고: 새로 저장할때만 사용합니다.
'''
# 모델 저장 경로
save_path = '/content/drive/MyDrive/finalvoice/koelectra_saved_model'
# 모델 저장
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
'''

Step,Training Loss,Validation Loss,Accuracy
50,0.0272,0.002291,1.0
100,0.0023,0.001216,1.0
150,0.0016,0.000899,1.0


Step,Training Loss,Validation Loss


TrainOutput(global_step=183, training_loss=0.011562081524603517, metrics={'train_runtime': 128.0956, 'train_samples_per_second': 22.811, 'train_steps_per_second': 1.429, 'total_flos': 384405251880960.0, 'train_loss': 0.011562081524603517, 'epoch': 3.0})

('/content/drive/MyDrive/finalvoice/koelectra_saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/finalvoice/koelectra_saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/finalvoice/koelectra_saved_model/vocab.txt',
 '/content/drive/MyDrive/finalvoice/koelectra_saved_model/added_tokens.json')

In [None]:
# 6. 평가
results = trainer.evaluate()
print(results)

{'eval_loss': 0.0008554938831366599, 'eval_accuracy': 1.0, 'eval_runtime': 3.5092, 'eval_samples_per_second': 69.531, 'eval_steps_per_second': 1.14, 'epoch': 3.0}


## KoELECTRA 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

# 모델과 토크나이저를 불러옵니다.
model_path = "/content/drive/MyDrive/finalvoice/koelectra_saved_model"
tokenizer = ElectraTokenizer.from_pretrained(model_path)
model = ElectraForSequenceClassification.from_pretrained(model_path)


In [6]:
# 6. 평가
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': accuracy_score(labels, predictions)}

results = trainer.evaluate()
print(results)

NameError: ignored

In [7]:
import torch.nn.functional as F

def predict(text, model, tokenizer):
    # 텍스트를 토크나이징
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # 모델을 평가 모드로 설정
    model.eval()

    # 예측
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 확률을 얻기 위해 softmax 함수 사용
    probs = F.softmax(logits, dim=-1)

    return probs[0].tolist()

# 테스트
text = input("여기에 테스트할 문장을 입력하세요.")
probs = predict(text, model, tokenizer)
print(f"라벨 0의 확률: {probs[0]:.4f}")
print(f"라벨 1의 확률: {probs[1]:.4f}")

여기에 테스트할 문장을 입력하세요.dddd


NameError: ignored