### 필요한 라이브러리

In [None]:
!pip install transformers
!pip install accelerate>=0.20.1
!pip install transformers[torch]
!pip install accelerate transformers[torch]
!pip install gradio

In [None]:
import pandas as pd
import torch
import transformers
import gradio as gr
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.utils import shuffle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 데이터 불러오기
label1_data = pd.read_csv("/content/drive/MyDrive/voise_antinoise.csv",encoding='euc-kr')
label0_data_1 = pd.read_csv("/content/drive/MyDrive/normalcall_original.csv")
label0_data_2 = pd.read_csv("/content/drive/MyDrive/finaldataset/nofish.csv")
label1_data["text"] = label1_data["Sentence"]
label1_data.drop(columns=['Sentence'], inplace=True)
label0_data_2["text"] = label0_data_2["refined_text"]
label0_data_2.drop(columns=['refined_text'], inplace=True)

# 라벨 0 데이터 랜덤 추출
label0_data_1_sampled = label0_data_1.sample(n=1204, random_state=42)
label0_data_2_sampled = label0_data_2.sample(n=1200, random_state=42)

# 라벨 부여
label1_data['label'] = 1
label0_data_1_sampled['label'] = 0
label0_data_2_sampled['label'] = 0

# 데이터 병합
intergrated_unbalan5 = pd.concat([label1_data, label0_data_1_sampled, label0_data_2_sampled])
intergrated_unbalan5 = shuffle(intergrated_unbalan5, random_state=42).reset_index(drop=True)

In [None]:
intergrated_unbalan5.to_csv("/content/drive/MyDrive/intergrated_unbalan5.csv", index = False)

### KoBERT 2천건

In [None]:
|# 데이터 불러오기
data = pd.read_csv("/content/drive/MyDrive/intergrated_unbalan5.csv")

# 텍스트와 라벨 컬럼 분리
texts = data['text'].tolist()
labels = data['label'].tolist()

# 훈련 데이터와 검증 데이터 분리
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# KoBERT 토크나이저 및 모델 불러오기
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("monologg/kobert", num_labels=2)

# 토큰화 및 패딩
max_length = 128  # 적절한 시퀀스 길이 설정
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# PyTorch 데이터셋 생성
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# 데이터 로더 생성
batch_size = 16  # 적절한 배치 크기 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# 모델 및 옵티마이저 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# 학습
num_epochs = 5  # 적절한 에폭 수 설정

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # 검증
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")



# 예측 함수 정의
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Gradio 인터페이스 정의
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5 - Training: 100%|██████████| 140/140 [11:54<00:00,  5.11s/it]
Epoch 1/5 - Validation: 100%|██████████| 35/35 [00:57<00:00,  1.63s/it]


Epoch 1/5: Val Loss: 0.0593, Val Acc: 0.9875


Epoch 2/5 - Training: 100%|██████████| 140/140 [11:44<00:00,  5.03s/it]
Epoch 2/5 - Validation: 100%|██████████| 35/35 [00:59<00:00,  1.69s/it]


Epoch 2/5: Val Loss: 0.0648, Val Acc: 0.9804


Epoch 3/5 - Training: 100%|██████████| 140/140 [11:35<00:00,  4.97s/it]
Epoch 3/5 - Validation: 100%|██████████| 35/35 [00:56<00:00,  1.61s/it]


Epoch 3/5: Val Loss: 0.0243, Val Acc: 0.9964


Epoch 4/5 - Training: 100%|██████████| 140/140 [11:33<00:00,  4.96s/it]
Epoch 4/5 - Validation: 100%|██████████| 35/35 [00:54<00:00,  1.57s/it]


Epoch 4/5: Val Loss: 0.0437, Val Acc: 0.9893


Epoch 5/5 - Training: 100%|██████████| 140/140 [11:31<00:00,  4.94s/it]
Epoch 5/5 - Validation: 100%|██████████| 35/35 [00:56<00:00,  1.63s/it]


Epoch 5/5: Val Loss: 0.0377, Val Acc: 0.9875


  iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [None]:
# 학습된 모델 저장
model_save_path = "/content/drive/MyDrive/intergrated_unbalan5"
model.save_pretrained(model_save_path)

### KoELECTRA 2천건 -1
과적합

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
import gradio as gr

# Load data
data = pd.read_csv("/content/drive/MyDrive/intergrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the KoELECTRA tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-discriminator", num_labels=2)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model and optimizer settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Learning
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

# Define the prediction function
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()

### KoELECTRA 2천건 -2
가중치 규제 weight_decay=0.01



In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
import gradio as gr

# Load data
data = pd.read_csv("/content/drive/MyDrive/intergrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the KoELECTRA tokenizer and model
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-discriminator", num_labels=2)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model and optimizer settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning rate scheduling
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)

# Learning
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Learning rate scheduling step
    scheduler.step()

# Define the prediction function (same as before)
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface (same as before)
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 - Training: 100%|██████████| 140/140 [00:11<00:00, 11.74it/s]
Epoch 1/3 - Validation: 100%|██████████| 35/35 [00:00<00:00, 36.21it/s]


Epoch 1/3: Val Loss: 0.0229, Val Acc: 0.9929


Epoch 2/3 - Training: 100%|██████████| 140/140 [00:11<00:00, 11.77it/s]
Epoch 2/3 - Validation: 100%|██████████| 35/35 [00:00<00:00, 36.11it/s]


Epoch 2/3: Val Loss: 0.0024, Val Acc: 1.0000


Epoch 3/3 - Training: 100%|██████████| 140/140 [00:11<00:00, 11.79it/s]
Epoch 3/3 - Validation: 100%|██████████| 35/35 [00:00<00:00, 36.37it/s]
  iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())


Epoch 3/3: Val Loss: 0.0053, Val Acc: 0.9982
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [None]:
# 학습된 모델 저장
model_save_path = "/content/drive/MyDrive/KoELECTRA_2K"
model.save_pretrained(model_save_path)

### KoGPT - FAIL

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import gradio as gr

# Load data
data = pd.read_csv("/content/drive/MyDrive/integrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the KoGPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("skt/kogpt2-base-v2")
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model and optimizer settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the trained model
save_path = "/content/drive/MyDrive/KoGPT_2K"
model.save_pretrained(save_path)

# Define the prediction function
def predict(text):
    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs="text")
iface.launch()


OSError: ignored

### KoAlpaca - FAIL

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import gradio as gr

# Load data
data = pd.read_csv("/content/drive/MyDrive/integrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the KoAlpaca tokenizer and model with pretrained weights
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v2-discriminator", num_labels=2)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model and optimizer settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

# Save the model
model.save_pretrained("/content/drive/MyDrive/KoAlpaca_2K")

# Define the prediction function
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v2-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: ignored

### ATTBILSTM
DROPOUT 적용 (Accuracy가 0.85로 고정되는 문제 발생)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, AdamW
import gradio as gr

# Define Attention Layer
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs):
        energy = torch.tanh(self.W(encoder_outputs))
        attention_scores = self.v(energy).squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context

# Define Bi-LSTM with Attention
class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout_rate):
        super(BiLSTMWithAttention, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        context = self.attention(lstm_out)
        output = self.fc(context)
        return output

# Load data
data = pd.read_csv("/content/drive/MyDrive/intergrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the Electra tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model settings
input_size = len(tokenizer)
hidden_size = 256
num_classes = 2
dropout_rate = 0.5
learning_rate = 2e-5

model = BiLSTMWithAttention(input_size, hidden_size, num_classes, dropout_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids)
            val_loss += loss_fn(outputs, labels).item()

            predicted_labels = outputs.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    # Calculate average loss
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

# Save the trained model
model_save_path = "/content/drive/MyDrive/AttBiLSTM_2K"
torch.save(model.state_dict(), model_save_path)

# Define the prediction function
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoding['input_ids'].to(device)
    with torch.no_grad():
        outputs = model(input_ids)
        probabilities = torch.softmax(outputs, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()

Epoch 1/10 - Training: 100%|██████████| 140/140 [01:12<00:00,  1.93it/s]
Epoch 1/10 - Validation: 100%|██████████| 35/35 [00:04<00:00,  8.06it/s]


Epoch 1/10: Val Loss: 0.4844, Val Acc: 0.8500


Epoch 2/10 - Training: 100%|██████████| 140/140 [01:25<00:00,  1.63it/s]
Epoch 2/10 - Validation: 100%|██████████| 35/35 [00:05<00:00,  5.91it/s]


Epoch 2/10: Val Loss: 0.4629, Val Acc: 0.8500


Epoch 3/10 - Training: 100%|██████████| 140/140 [01:08<00:00,  2.05it/s]
Epoch 3/10 - Validation: 100%|██████████| 35/35 [00:05<00:00,  6.55it/s]


Epoch 3/10: Val Loss: 0.4367, Val Acc: 0.8500


Epoch 4/10 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.14it/s]
Epoch 4/10 - Validation: 100%|██████████| 35/35 [00:04<00:00,  8.09it/s]


Epoch 4/10: Val Loss: 0.4310, Val Acc: 0.8500


Epoch 5/10 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.13it/s]
Epoch 5/10 - Validation: 100%|██████████| 35/35 [00:04<00:00,  8.06it/s]


Epoch 5/10: Val Loss: 0.4200, Val Acc: 0.8500


Epoch 6/10 - Training: 100%|██████████| 140/140 [01:08<00:00,  2.05it/s]
Epoch 6/10 - Validation: 100%|██████████| 35/35 [00:05<00:00,  6.36it/s]


Epoch 6/10: Val Loss: 0.4187, Val Acc: 0.8500


Epoch 7/10 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.12it/s]
Epoch 7/10 - Validation: 100%|██████████| 35/35 [00:05<00:00,  6.05it/s]


Epoch 7/10: Val Loss: 0.4082, Val Acc: 0.8500


Epoch 8/10 - Training: 100%|██████████| 140/140 [01:08<00:00,  2.04it/s]
Epoch 8/10 - Validation: 100%|██████████| 35/35 [00:04<00:00,  8.04it/s]


Epoch 8/10: Val Loss: 0.4088, Val Acc: 0.8500


Epoch 9/10 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.12it/s]
Epoch 9/10 - Validation: 100%|██████████| 35/35 [00:04<00:00,  7.32it/s]


Epoch 9/10: Val Loss: 0.4038, Val Acc: 0.8500


Epoch 10/10 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.13it/s]
Epoch 10/10 - Validation: 100%|██████████| 35/35 [00:06<00:00,  5.30it/s]
  iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())


Epoch 10/10: Val Loss: 0.4039, Val Acc: 0.8500
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



### ATTBILSTM
Dropout 직접 제거 - 여전히 0.85로 됨. 코드상의 문제이거나 런타임에 저장된 변수 문제일 수 있다.


In [25]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, AdamW
import gradio as gr

# Define Attention Layer
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs):
        energy = torch.tanh(self.W(encoder_outputs))
        attention_scores = self.v(energy).squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context

# Define Bi-LSTM with Attention
class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiLSTMWithAttention, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        context = self.attention(lstm_out)
        output = self.fc(context)
        return output

# Load data
data = pd.read_csv("/content/drive/MyDrive/intergrated_unbalan5.csv")

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the Electra tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model settings
input_size = len(tokenizer)
hidden_size = 256
num_classes = 2
learning_rate = 2e-5

model = BiLSTMWithAttention(input_size, hidden_size, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids)
            val_loss += loss_fn(outputs, labels).item()

            predicted_labels = outputs.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    # Calculate average loss
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

# Save the trained model
model_save_path = "/content/drive/MyDrive/AttBiLSTM_2K"
torch.save(model.state_dict(), model_save_path)

# Define the prediction function
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoding['input_ids'].to(device)
    with torch.no_grad():
        outputs = model(input_ids)
        probabilities = torch.softmax(outputs, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()

Epoch 4/5 - Training:  23%|██▎       | 32/140 [00:15<00:50,  2.12it/s]


KeyboardInterrupt: ignored

### ATTBILSTM (KoELECTRA의 토크나이저 사용)
Dropout 삭제 요청 - 문제 해결

가장 성능 좋은 모델 도출

In [27]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, AdamW
import gradio as gr

# Define Attention Layer
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.W = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs):
        energy = torch.tanh(self.W(encoder_outputs))
        attention_scores = self.v(energy).squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context

# Define Bi-LSTM with Attention
class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiLSTMWithAttention, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        context = self.attention(lstm_out)
        output = self.fc(context)
        return output

# Load data
data = pd.read_csv('/content/drive/MyDrive/intergrated_unbalan5.csv')

# Separate text and label columns
texts = data['text'].tolist()
labels = data['label'].tolist()

# Separate training and validation data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the Electra tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", use_fast=False)

# Tokenization and padding
max_length = 128
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Create a PyTorch dataset
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'],
                            val_encodings['attention_mask'],
                            torch.tensor(val_labels))

# Create data loader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model settings
input_size = len(tokenizer)
hidden_size = 256
num_classes = 2

model = BiLSTMWithAttention(input_size, hidden_size, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters())

# Learning
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids)
            val_loss += loss_fn(outputs, labels).item()

            predicted_labels = outputs.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    val_accuracy = correct_predictions / total_predictions
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}: Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

# Save the trained model
model_save_path = "/content/drive/MyDrive/AttBiLSTM_2K"
torch.save(model.state_dict(), model_save_path)

# Define the prediction function
def predict(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoding['input_ids'].to(device)
    with torch.no_grad():
        outputs = model(input_ids)
        probabilities = torch.softmax(outputs, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())
iface.launch()


Epoch 1/3 - Training: 100%|██████████| 140/140 [01:53<00:00,  1.23it/s]
Epoch 1/3 - Validation: 100%|██████████| 35/35 [00:07<00:00,  4.47it/s]


Epoch 1/3: Val Loss: 0.2774, Val Acc: 0.8982


Epoch 2/3 - Training: 100%|██████████| 140/140 [01:05<00:00,  2.14it/s]
Epoch 2/3 - Validation: 100%|██████████| 35/35 [00:04<00:00,  8.23it/s]


Epoch 2/3: Val Loss: 0.1241, Val Acc: 0.9554


Epoch 3/3 - Training: 100%|██████████| 140/140 [01:07<00:00,  2.06it/s]
Epoch 3/3 - Validation: 100%|██████████| 35/35 [00:05<00:00,  6.67it/s]
  iface = gr.Interface(fn=predict, inputs="text", outputs=gr.outputs.JSON())


Epoch 3/3: Val Loss: 0.0631, Val Acc: 0.9804
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [35]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import ElectraTokenizer, AdamW
import gradient as gr


# Load the model
model_save_path = "/content/drive/MyDrive/AttBiLSTM_2K"
loaded_model = BiLSTMWithAttention(input_size, hidden_size, num_classes)
loaded_model.load_state_dict(torch.load(model_save_path))
loaded_model.to(device)
loaded_model.eval()

# Define the prediction function for the loaded model
def predict_loaded_model(text):
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    input_ids = encoding['input_ids'].to(device)
    with torch.no_grad():
        outputs = loaded_model(input_ids)
        probabilities = torch.softmax(outputs, dim=1).tolist()[0]

    return {str(label): prob for label, prob in enumerate(probabilities)}

# Define the Gradio interface for the loaded model
loaded_iface = gr.Interface(fn=predict_loaded_model, inputs="text", outputs=gr.outputs.JSON())
loaded_iface.launch()

  loaded_iface = gr.Interface(fn=predict_loaded_model, inputs="text", outputs=gr.outputs.JSON())


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

