In [None]:
# 필요한 라이브러리 설치
!pip install transformers torch
!pip install --upgrade gluonnlp pandas tqdm
!pip install mxnet
!pip install transformers[torch]
!pip install accelerate -U

# 라이브러리 임포트
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
from transformers import ElectraModel, ElectraTokenizer
import numpy as np
from tqdm.auto import tqdm
from google.colab import drive
import glob
import os
from tqdm.auto import tqdm
from multiprocessing import Pool
import multiprocessing
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Google Drive 마운트
drive.mount('/content/drive')

In [4]:
import os
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import ElectraTokenizer, ElectraModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import pandas as pd
import glob

# KoElectra 토크나이저와 모델 불러오기
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator")

# 분류를 위한 모델 정의
class ElectraForCompanyClassification(nn.Module):
    def __init__(self, model_name, num_labels):
        super(ElectraForCompanyClassification, self).__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.electra.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# 데이터 전처리 함수
def preprocess(data, tokenizer, max_len=256):
    # content 열을 문자열 리스트로 변환
    texts = data["full_text"].astype(str).tolist()
    # esg_score 열을 0부터 10 사이의 정수로 변환
    labels = (data["esg_score"] * 10).astype(int).tolist()

    inputs = tokenizer(
        texts,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    labels = torch.tensor(labels, dtype=torch.long)  # 분류를 위해 long 타입으로 변환

    return inputs, labels

# 데이터 파일 경로 설정
file_paths = glob.glob('/content/drive/MyDrive/Kwargs/esg관련도/*.csv')

# 모델 초기화
model_name = "monologg/koelectra-base-discriminator"
num_labels = 11  # 0부터 10까지의 11개의 클래스
model = ElectraForCompanyClassification(model_name, num_labels)

# 모든 파일에 대해 처리
all_train_datasets = []
all_val_datasets = []

for file_path in file_paths:
    data = pd.read_csv(file_path)

    # 데이터 전처리
    inputs, labels = preprocess(data, tokenizer)

    # TensorDataset 생성
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

    # 데이터셋 분할
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    all_train_datasets.append(train_dataset)
    all_val_datasets.append(val_dataset)

# 모든 데이터를 합침
train_dataset = torch.utils.data.ConcatDataset(all_train_datasets)
val_dataset = torch.utils.data.ConcatDataset(all_val_datasets)

# 데이터 로더 생성
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

# 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# 훈련 함수
def train(model, train_dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# 평가 함수
def evaluate(model, val_dataloader, device):
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            logits = model(input_ids, attention_mask)
            loss = nn.CrossEntropyLoss()(logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(val_dataloader), accuracy

# 훈련 및 평가
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_dataloader, optimizer, device)
    val_loss, val_accuracy = evaluate(model, val_dataloader, device)
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # 각 에포크마다 모델 저장
    epoch_output_dir = f"/content/drive/MyDrive/Kwargs/esg관련도/모델_epoch_{epoch + 1}"

    # 디렉토리가 존재하지 않으면 생성
    if not os.path.exists(epoch_output_dir):
        os.makedirs(epoch_output_dir)

    # 모델 가중치 저장
    torch.save(model.state_dict(), os.path.join(epoch_output_dir, "pytorch_model.bin"))
    tokenizer.save_pretrained(epoch_output_dir)

    print(f"Model and tokenizer saved to {epoch_output_dir} at Epoch {epoch + 1}")


Epoch 1/3


Training: 100%|██████████| 2355/2355 [06:33<00:00,  5.98it/s]
Evaluating: 100%|██████████| 590/590 [00:31<00:00, 18.60it/s]


Training Loss: 1.4277
Validation Loss: 1.3433
Validation Accuracy: 0.4792
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg관련도/모델_epoch_1 at Epoch 1
Epoch 2/3


Training: 100%|██████████| 2355/2355 [06:33<00:00,  5.98it/s]
Evaluating: 100%|██████████| 590/590 [00:31<00:00, 18.56it/s]


Training Loss: 1.6462
Validation Loss: 1.6441
Validation Accuracy: 0.4245
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg관련도/모델_epoch_2 at Epoch 2
Epoch 3/3


Training: 100%|██████████| 2355/2355 [06:33<00:00,  5.98it/s]
Evaluating: 100%|██████████| 590/590 [00:31<00:00, 18.60it/s]


Training Loss: 1.6746
Validation Loss: 1.6454
Validation Accuracy: 0.4245
Model and tokenizer saved to /content/drive/MyDrive/Kwargs/esg관련도/모델_epoch_3 at Epoch 3
