In [None]:
import pandas as pd
# Read the source CSV into a DataFrame.
csv_path = '/content/drive/MyDrive/Kwargs/022. esg관련도 필터/업데이트/A2_remaster_combined_ANONYMIZED.csv'
df = pd.read_csv(csv_path)

# Row count per raw `esg_score` value (last expression -> rendered as cell output).
df['esg_score'].value_counts()

In [None]:
# Initial relabeling based on the updated criteria
def updated_relabel(value):
    """Map a raw ESG relevance score to one of six Korean relevance buckets.

    Buckets (scores are expected on a 0.1 grid):
        0.9 - 1.0 -> '상당히 관련있음'
        0.8       -> '적절히 관련있음'
        0.6 - 0.7 -> '약간 관련있음'
        0.3 - 0.5 -> '거의 관련없음'
        0.2       -> '상당히 관련없음'
        otherwise -> '완전히 관련없음'

    Args:
        value: Numeric score to classify.

    Returns:
        str: The bucket name for ``value``.

    Notes:
        The bucket edges are compared exactly, so a score carrying binary
        floating-point noise (e.g. ``0.7999999999999999`` instead of ``0.8``)
        would otherwise drop into the catch-all bucket. Values lying within
        ``1e-9`` of a 0.1 grid point are therefore snapped onto that grid
        point first. Values that are genuinely off-grid (e.g. ``0.85``), out
        of range, or NaN are left untouched and fall through to the final
        ``else`` branch.
    """
    tolerance = 1e-9
    snapped = round(value, 1)
    # NaN never satisfies this comparison, so NaN is passed through unchanged.
    if abs(value - snapped) <= tolerance:
        value = snapped

    if 1.0 >= value >= 0.9:
        return '상당히 관련있음'
    elif value == 0.8:
        return '적절히 관련있음'
    elif 0.7 >= value >= 0.6:
        return '약간 관련있음'
    elif 0.5 >= value >= 0.3:
        return '거의 관련없음'
    elif value == 0.2:
        return '상당히 관련없음'
    else:
        return '완전히 관련없음'

# Numeric code for each of the six relevance buckets, spaced 0.2 apart.
continuous_mapping = {
    '상당히 관련있음': 1.0,
    '적절히 관련있음': 0.8,
    '약간 관련있음': 0.6,
    '거의 관련없음': 0.4,
    '상당히 관련없음': 0.2,
    '완전히 관련없음': 0.0
}

# Bucket every raw score by name, then translate each bucket name into its
# numeric code and store the result as the `esg_score_6` column.
bucket_names = df['esg_score'].apply(updated_relabel)
df['esg_score_6'] = bucket_names.map(continuous_mapping)

# Row count per encoded bucket (last expression -> rendered as cell output).
df['esg_score_6'].value_counts()

Unnamed: 0_level_0,count
esg_score_6,Unnamed: 1_level_1
0.0,24031
0.4,8022
0.2,6535
0.6,3871
0.8,2591
1.0,559


In [None]:
# 필요한 라이브러리 불러오기
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import ElectraTokenizer, ElectraForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
import torch.optim as optim

# Device selection: use CUDA when a GPU is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Point the Hugging Face model cache at a Google Drive folder.
# NOTE(review): `transformers` is already imported above this line; the library
# appears to resolve its cache location at import time, so this assignment may
# not affect the current session — confirm, or pass `cache_dir=` explicitly
# wherever `from_pretrained` is called.
os.environ['TRANSFORMERS_CACHE'] = '/content/drive/MyDrive/Kwargs/022. esg관련도 필터/업데이트/hf_cache'

# Dataset wrapper around pre-tokenized inputs
class TextDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable sequence of the same length. Every tensor built
    in ``__getitem__`` is moved to the module-level ``device`` before it is
    returned.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx]).to(device)
        sample['labels'] = torch.tensor(self.labels[idx]).to(device)
        return sample

# Class index: one integer id per distinct `esg_score_6` value, in ascending order.
unique_labels = sorted(df['esg_score_6'].unique())
num_classes = len(unique_labels)
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
df['label_indices'] = df['esg_score_6'].map(label_to_index)

# Train/validation split: the first 80% of rows train, the remaining 20% validate.
# NOTE(review): this is a positional split with no shuffling or stratification —
# confirm the row order of the CSV is not correlated with the label or source.
split_idx = int(0.8 * len(df))
train_texts = df['full_text'][:split_idx].tolist()
val_texts = df['full_text'][split_idx:].tolist()
train_labels_indices = df['label_indices'][:split_idx].tolist()
val_labels_indices = df['label_indices'][split_idx:].tolist()

# Hyperparameters
learning_rate = 2e-4
batch_size = 32
epochs = 3
max_length = 512
max_weight = 4.0  # upper bound applied to every class weight

# Class weights from per-class frequency.
# Plain 1/count weights are all <= 1, so the `max_weight` cap could never
# trigger. Use the "balanced" form n_samples / (n_classes * count) instead:
# it is the same inverse-frequency weighting up to a constant factor, but puts
# the weights on a scale (around 1.0) where the cap actually limits how much
# the rarest classes are up-weighted.
label_counts = df['label_indices'].value_counts().sort_index()
class_freq = label_counts.values
class_weights = class_freq.sum() / (num_classes * class_freq)
class_weights = torch.tensor([min(weight, max_weight) for weight in class_weights], dtype=torch.float).to(device)

# Tokenizer and model.
# The cache directory is passed explicitly: an environment variable assigned
# after `transformers` has been imported is not reliably honoured, whereas
# `cache_dir` always is. `None` (variable unset) keeps the library default.
hf_cache_dir = os.environ.get('TRANSFORMERS_CACHE')
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator", cache_dir=hf_cache_dir)
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-discriminator",
    num_labels=num_classes,
    cache_dir=hf_cache_dir,
)
model.to(device)

# Tokenize both splits up front (truncated to `max_length`, padded per split).
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length)

# Datasets
train_dataset = TextDataset(train_encodings, train_labels_indices)
val_dataset = TextDataset(val_encodings, val_labels_indices)

# Data loaders: shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Class-weighted loss, AdamW optimizer and a linear-decay schedule without warmup.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Gradient scaler for mixed-precision training
scaler = GradScaler()

# Training pass
def train_model():
    """Train the module-level ``model`` for one pass over ``train_loader``.

    Each batch is run under mixed-precision autocast. The loss is computed
    with the module-level ``loss_fn`` on the model's logits rather than taken
    from ``outputs.loss``: the model's built-in loss ignores ``loss_fn`` and
    therefore the class weights it was constructed with. Gradients are
    scaled through ``scaler``, and ``scheduler`` is advanced once per batch.

    Returns:
        float: Mean of the per-batch loss values over the pass.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        with autocast():
            # No `labels=` here: the loss is computed below with `loss_fn`,
            # so the model does not need to compute its own.
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            loss = loss_fn(outputs.logits, batch['labels'])
        total_loss += loss.item()

        # Scale the loss before backward, then step and update the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_loader)
    return avg_loss

# Evaluation pass
def evaluate_model():
    """Run the module-level ``model`` over ``val_loader`` without gradients.

    Returns:
        tuple: ``(labels, predictions)`` — two flat lists in loader order,
        holding each example's label from the batch and the index of its
        highest logit.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            with autocast():
                logits = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                ).logits
                # Index of the largest logit along the class dimension.
                preds = torch.max(logits, dim=1).indices
            gold.extend(batch['labels'].cpu().numpy())
            predicted.extend(preds.cpu().numpy())

    torch.cuda.empty_cache()

    return gold, predicted

# Training loop: one training pass followed by one validation pass per epoch.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    train_loss = train_model()
    val_labels_list, val_preds_list = evaluate_model()

    # Accuracy plus weighted-average precision / recall / F1 on the validation split.
    accuracy = accuracy_score(val_labels_list, val_preds_list)
    precision, recall, f1 = (
        metric(val_labels_list, val_preds_list, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Results - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# Save the model weights and the tokenizer files.
save_directory = '/content/drive/MyDrive/Kwargs/022. esg관련도 필터/업데이트'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, replacing the separate exists-check-then-create sequence.
os.makedirs(save_directory, exist_ok=True)

# Only the state_dict is written (to a file named "A2-14_0927" inside the
# folder); the tokenizer is written with its own `save_pretrained`.
model_path = os.path.join(save_directory, "A2-14_0927")
torch.save(model.state_dict(), model_path)
tokenizer.save_pretrained(save_directory)

print(f"모델과 토크나이저가 저장되었습니다: {save_directory}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/279k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1/3


  with autocast():
Training:   7%|▋         | 310/4561 [24:27<5:32:23,  4.69s/it]