### 모듈화 완료

text_classification_module.py

In [9]:
import torch
import torch.nn as nn
from transformers import ElectraTokenizer
from torch.utils.data import DataLoader, TensorDataset

class Attention(nn.Module):
    """Additive attention pooling over a sequence of encoder states.

    Every position is scored as ``v(tanh(W(h)))``. The scores are normalized
    with a softmax along dimension 1 and then used as weights for a sum of
    the inputs along that same dimension.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The attribute names ``W`` and ``v`` become state_dict keys, so they
        # must not be renamed if saved checkpoints are to keep loading.
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs):
        """Return the attention-weighted sum of ``encoder_outputs``.

        The input must be 3-D with a last dimension of ``hidden_size``;
        dimension 1 is the one that is normalized and summed away.
        """
        scores = self.v(torch.tanh(self.W(encoder_outputs))).squeeze(2)
        weights = torch.softmax(scores, dim=1).unsqueeze(1)
        pooled = torch.bmm(weights, encoder_outputs)
        return pooled.squeeze(1)

class BiLSTMWithAttention(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> linear classifier.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width and per-direction LSTM hidden width.
        num_classes: Number of output logits.
        batch_first: Forwarded to ``nn.LSTM``. Defaults to ``False`` so that
            checkpoints trained with the original definition keep producing
            the same outputs.

    NOTE(review): the attention layer normalizes over dimension 1, while
    ``nn.LSTM`` with ``batch_first=False`` recurs over dimension 0. The
    callers in this file pass tokenizer ``input_ids`` (presumably laid out as
    (batch, seq) - confirm), in which case the LSTM recurs across the batch
    rather than across the tokens. Use ``batch_first=True`` when training a
    new model. The parameter shapes do not depend on this flag, so the
    state_dict layout is unchanged either way.
    """

    def __init__(self, input_size, hidden_size, num_classes, batch_first=False):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            bidirectional=True,
            batch_first=batch_first,
        )
        # The bidirectional LSTM concatenates both directions, hence the * 2.
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return the class logits for the integer token-id tensor ``x``."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        context = self.attention(lstm_out)
        return self.fc(context)

class TextClassifier:
    """Load a trained ``BiLSTMWithAttention`` checkpoint and classify texts.

    Args:
        model_save_path: Path of the state_dict file handed to ``torch.load``.
        hidden_size: Hidden width the checkpoint was trained with.
        num_classes: Number of classes the checkpoint was trained with.
        label_names: Display names of the classes, in logit order. Defaults
            to ``"Label 0"``, ``"Label 1"``, ... (one per class).
        tokenizer_name: Identifier passed to
            ``ElectraTokenizer.from_pretrained``.

    Raises:
        ValueError: If ``label_names`` does not have ``num_classes`` entries.
    """

    def __init__(
        self,
        model_save_path,
        hidden_size=256,
        num_classes=2,
        label_names=None,
        tokenizer_name="monologg/koelectra-base-discriminator",
    ):
        if label_names is None:
            label_names = [f"Label {index}" for index in range(num_classes)]
        elif len(label_names) != num_classes:
            # zip() in classify_text would otherwise silently drop entries.
            raise ValueError(
                f"expected {num_classes} label names, got {len(label_names)}"
            )

        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.label_names = list(label_names)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = ElectraTokenizer.from_pretrained(tokenizer_name, use_fast=False)
        self.model = None
        self.load_model(model_save_path)

    def load_model(self, model_save_path):
        """Build the network, load its weights and switch it to eval mode.

        ``self.model`` is only replaced once loading has succeeded, so a
        failed reload leaves the previously loaded model in place.
        """
        model = BiLSTMWithAttention(
            input_size=len(self.tokenizer),
            hidden_size=self.hidden_size,
            num_classes=self.num_classes,
        )
        # NOTE: torch.load unpickles the file; only load checkpoints that
        # come from a trusted source.
        state_dict = torch.load(model_save_path, map_location=self.device)
        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()
        self.model = model

    def classify_text(self, text):
        """Return ``{label name: probability}`` for ``text``.

        Only the first row of the model output is reported, so passing a
        list of texts yields the result for its first element only.
        """
        encoding = self.tokenizer(text, truncation=True, padding=True, return_tensors="pt")
        input_ids = encoding['input_ids'].to(self.device)
        with torch.no_grad():
            logits = self.model(input_ids)
            probabilities = torch.softmax(logits, dim=1)[0].tolist()

        return dict(zip(self.label_names, probabilities))


모듈을 불러와 실행할 파일.py

In [None]:
# text_classification_module.py를 불러온다.
from text_classification_module import TextClassifier

def main():
    """Prompt for one conversation on stdin and print every label's probability."""
    classifier = TextClassifier("/content/drive/MyDrive/AttBiLSTM_2K")
    user_text = input("대화를 입력하세요. (최소 50자/20어절/2문장 이상)")

    # int() truncates, so the printed percentages are rounded toward zero.
    for label, prob in classifier.classify_text(user_text).items():
        print(f"{int(prob * 100)}% 확률로 {label} 라벨로 분류됩니다.")


if __name__ == "__main__":
    main()

### 모듈화 이전 코드

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# 데이터셋 및 경로 설정
data = pd.read_csv('/content/drive/MyDrive/intergrated_unbalan5.csv') #데이터셋
model_save_path = "/content/drive/MyDrive/AttBiLSTM_2K" # 모델

!pip install transformers
!pip install accelerate>=0.20.1
!pip install transformers[torch]
!pip install accelerate transformers[torch]

import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, AdamW


# Define Attention Layer
class Attention(nn.Module):
    """Attention pooling: score each position, softmax, then weighted sum.

    Scores are computed as ``v(tanh(W(h)))`` and normalized along
    dimension 1, which is also the dimension that is summed over.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # ``W`` and ``v`` appear in state_dict keys; keep the names stable.
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs):
        """Collapse dimension 1 of the 3-D ``encoder_outputs`` by attention."""
        projected = torch.tanh(self.W(encoder_outputs))
        raw_scores = self.v(projected).squeeze(2)
        weights = torch.softmax(raw_scores, dim=1)
        # (B, 1, S) x (B, S, H) -> (B, 1, H); the singleton is then dropped.
        return torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)

# Define Bi-LSTM with Attention
class BiLSTMWithAttention(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> linear classifier.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width and per-direction LSTM hidden width.
        num_classes: Number of output logits.
        batch_first: Forwarded to ``nn.LSTM``. Defaults to ``False`` so that
            checkpoints trained with the original definition keep producing
            the same outputs.

    NOTE(review): the attention layer normalizes over dimension 1, while
    ``nn.LSTM`` with ``batch_first=False`` recurs over dimension 0. This
    script feeds tokenizer ``input_ids`` through a ``DataLoader``
    (presumably laid out as (batch, seq) - confirm), in which case the LSTM
    recurs across the batch rather than across the tokens. Use
    ``batch_first=True`` when training a new model. The parameter shapes do
    not depend on this flag, so the state_dict layout is unchanged either way.
    """

    def __init__(self, input_size, hidden_size, num_classes, batch_first=False):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            bidirectional=True,
            batch_first=batch_first,
        )
        # The bidirectional LSTM concatenates both directions, hence the * 2.
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return the class logits for the integer token-id tensor ``x``."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        context = self.attention(lstm_out)
        return self.fc(context)

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Load the Electra tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)

# Tokenize each split, truncating to at most max_length tokens and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Wrap (input_ids, attention_mask, label) triples as PyTorch datasets
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loaders; only the training loader shuffles
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model settings
input_size = len(tokenizer)
hidden_size = 256
num_classes = 2

# Fresh (untrained) model and optimizer; the inference helpers visible in this
# file use ``loaded_model`` instead.
model = BiLSTMWithAttention(input_size, hidden_size, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters())

# Load the trained weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint saved on a GPU also loads on a CPU-only host.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
loaded_model = BiLSTMWithAttention(input_size, hidden_size, num_classes)
loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
loaded_model.to(device)
loaded_model.eval()

# Define the prediction function for the loaded model
def predict_loaded_model(text):
    """Classify ``text`` with the module-level ``loaded_model``.

    Returns:
        A dict mapping each class index, as a string (``"0"``, ``"1"``, ...),
        to its softmax probability, taken from the first row of the output.
    """
    tokens = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = tokens['input_ids'].to(device)

    with torch.no_grad():
        logits = loaded_model(input_ids)
        first_row = torch.softmax(logits, dim=1).tolist()[0]

    scores = {}
    for class_index, probability in enumerate(first_row):
        scores[str(class_index)] = probability
    return scores
# gradio 인터페이스로 구현
'''
!pip install gradio
import gradio as gr
# Define the Gradio interface for the loaded model
loaded_iface = gr.Interface(fn=predict_loaded_model, inputs="text", outputs=gr.outputs.JSON())
loaded_iface.launch()
'''
# 입력
def classify_text_without_gr(text):
    """Classify ``text`` without the Gradio UI and return named probabilities.

    Uses the module-level ``tokenizer``, ``device`` and ``loaded_model``.

    Returns:
        A dict with the keys ``'Label 0'`` and ``'Label 1'`` mapped to the
        softmax probabilities from the first row of the model output.
    """
    label_names = ['Label 0', 'Label 1']  # Modify label names accordingly

    tokens = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = tokens['input_ids'].to(device)

    with torch.no_grad():
        logits = loaded_model(input_ids)
        first_row = torch.softmax(logits, dim=1).tolist()[0]

    # zip() stops at the shorter sequence, so extra classes would be dropped.
    return dict(zip(label_names, first_row))

In [18]:
# Read an example text, call the classifier and print the result.
# Given the nature of the training data, input in question-and-answer form with
# at least 3 sentences and 40 characters is recommended.
# Very short input or a list of context-free content may show a high
# false-positive rate.
# NOTE(review): nothing is printed when the 'Label 1' probability is below
# 0.001, and int() truncates, so any probability under 1% is shown as "0%".
example_text = input ("대화 입력:")
result = classify_text_without_gr(example_text)
if result['Label 1'] >= 0.001:
    print(f"{int(result['Label 1'] * 100)}% 확률로 피싱입니다.")

대화 입력:ㅇㅇ
0% 확률로 피싱입니다.
