## KoBERT TO-GO

In [None]:
!pip install transformers
!pip install gradio

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

# Path to the fine-tuned weights (a state_dict file written with torch.save).
model_path = "/content/drive/MyDrive/finalvoice/kobert_model"

# Load the tokenizer published under the 'monologg/kobert' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Build the classifier on top of the pretrained encoder. num_labels=2 matches
# the training script. The classification head is still freshly initialised
# here (transformers warns about it); the trained head weights arrive with
# load_state_dict below.
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)
# map_location="cpu": the training script saves the state_dict while the model
# may be sitting on a GPU; without remapping, torch.load tries to restore the
# tensors onto CUDA and fails on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
# Switch to evaluation mode for inference.
model.eval()

def get_prediction_probabilities(text):
    """Return the classifier's softmax probabilities for ``text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer. Inputs longer than
            512 tokens are truncated.

    Returns:
        dict mapping ``"라벨 <i>"`` to the probability (a Python float) of
        class ``i``, read from the first row of the model output.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Normalise over the class axis and keep the first (index 0) row.
    first_row = F.softmax(logits, dim=-1)[0]
    return {f"라벨 {idx}": float(score) for idx, score in enumerate(first_row)}

# Read one line of text from the user, classify it, and print the
# per-label probabilities returned by get_prediction_probabilities.
user_input = input("텍스트를 입력하세요: ")
prediction_results = get_prediction_probabilities(user_input)
print(prediction_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


텍스트를 입력하세요: 텍스트를 입력하세요.
{'라벨 0': 0.005742729641497135, '라벨 1': 0.9942572116851807}


In [None]:
# KoBERT Gradio 구현
'''
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import gradio as gr
from google.colab import drive
drive.mount('/content/drive')

# Path to the fine-tuned weights (a state_dict file written with torch.save).
model_path = "/content/drive/MyDrive/finalvoice/kobert_model"

# Load the tokenizer published under the 'monologg/kobert' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')

# Build the classifier on top of the pretrained encoder. num_labels=2 matches
# the training script. The classification head is still freshly initialised
# here; the trained head weights arrive with load_state_dict below.
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)
# map_location="cpu": the training script saves the state_dict while the model
# may be sitting on a GPU; without remapping, torch.load tries to restore the
# tensors onto CUDA and fails on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
# Switch to evaluation mode for inference.
model.eval()

def get_prediction_probabilities(text):
    """Return the classifier's softmax probabilities for ``text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer. Inputs longer than
            512 tokens are truncated.

    Returns:
        dict mapping ``"라벨 <i>"`` to the probability (a Python float) of
        class ``i``, read from the first row of the model output.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Normalise over the class axis and keep the first (index 0) row.
    first_row = F.softmax(logits, dim=-1)[0]
    return {f"라벨 {idx}": float(score) for idx, score in enumerate(first_row)}

# Build the web UI with gr.Interface: a text input wired to
# get_prediction_probabilities, with the returned dict shown in a label output.
# NOTE(review): live=True is expected to re-run the prediction as the input
# changes rather than on an explicit submit — confirm against the Gradio docs.
interface = gr.Interface(fn=get_prediction_probabilities,
                         inputs="text",
                         outputs="label",
                         live=True,
                         title="KoBERT 라벨 예측",
                         description="입력된 텍스트의 라벨 예측 확률을 출력합니다.")
# Start serving the interface.
interface.launch()
'''

In [None]:
# KoBERT 소스 코드
'''
# 필요한 라이브러리
import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score
import pandas as pd
# Load the labelled transcripts into df_93 from a CSV on the mounted drive.
df_93 = pd.read_csv("/content/drive/MyDrive/KorCCViD_v1.3_fullcleansed.csv")
# Split the 'Transcript' texts and 'Label' targets into training and test
# sets: 20% is held out, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_93['Transcript'], df_93['Label'], test_size=0.2, random_state=42)
# Load the KoBERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
# Dataset that tokenizes each example on access
class KoBERTDataset(Dataset):
    """Map-style dataset pairing texts with labels, tokenized per item.

    Args:
        texts: Iterable of input texts; copied into a list.
        labels: Iterable of integer class labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this script).
        max_len: Length every example is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return it as a dict of tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``, each
            a ``torch.long`` tensor.
        """
        text = str(self.texts[index])
        label = self.labels[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # padding='max_length' replaces the deprecated
            # pad_to_max_length=True; every example comes out max_len long
            # so the default collate function can stack a batch.
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# DataLoader configuration: 16 examples per batch, each example padded or
# truncated to 128 tokens by KoBERTDataset.
BATCH_SIZE = 16
MAX_LEN = 128

# Training split, reshuffled on every pass over the loader.
train_data = KoBERTDataset(
    X_train.reset_index(drop=True),
    y_train.reset_index(drop=True),
    tokenizer,
    MAX_LEN,
)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Held-out split, iterated in stored order (no shuffling).
test_data = KoBERTDataset(
    X_test.reset_index(drop=True),
    y_test.reset_index(drop=True),
    tokenizer,
    MAX_LEN,
)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)


# Define the model: the pretrained 'monologg/kobert' checkpoint with a
# 2-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=2)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training setup: AdamW with learning rate 2e-5, 10 epochs.
# NOTE(review): this AdamW is the one imported from transformers, which newer
# transformers releases deprecate in favour of torch.optim.AdamW — confirm
# against the installed version before re-running.
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 10
from tqdm.notebook import tqdm

# Training loop: one full pass over train_loader per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Progress bar over this epoch's batches (leave=False).
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", leave=False)
    for batch_idx, batch in enumerate(progress_bar):
        optimizer.zero_grad()
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        # The first element of the model output is used as the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Show the running mean loss for the epoch so far.
        progress_bar.set_postfix({'loss': total_loss / (batch_idx + 1)})

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(train_loader)}")
import torch

# Evaluation on the held-out split.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest value in each row of outputs[0].
        preds = torch.max(outputs[0], 1)[1]
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Accuracy over the whole test set.
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Save the model. Only the state_dict (the weights) is written, not the model
# object, so a loader has to rebuild the architecture before calling
# load_state_dict.
save_path = "/content/drive/MyDrive/finalvoice/kobert_model"
torch.save(model.state_dict(), save_path)
print(f"모델이 {save_path}에 저장되었습니다.")
'''