In [1]:
from transformers import ElectraForMaskedLM, ElectraTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import os
import random
import torch
import pandas as pd
from datasets import Dataset

# 원본 폴더와 타겟 폴더 경로
source_folder = "./Data/None"
target_folder = "./Data/cMLM"

# KoELECTRA 모델과 토크나이저 로드
model_name = "monologg/koelectra-base-v3-generator"
model = ElectraForMaskedLM.from_pretrained(model_name)
tokenizer = ElectraTokenizer.from_pretrained(model_name)

# 클래스별 데이터 분리 함수
def split_by_class(df, class_column):
    classes = df[class_column].unique()
    class_datasets = {cls: df[df[class_column] == cls] for cls in classes}
    return class_datasets

# 미세 조정 함수
def fine_tune_model(class_df, tokenizer, model, output_dir, epochs=3):
    texts = class_df["comments"].tolist()
    tokenized_data = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    dataset = Dataset.from_dict({
        "input_ids": tokenized_data["input_ids"],
        "attention_mask": tokenized_data["attention_mask"]
    })

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir=f"{output_dir}/logs",
        logging_steps=500,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    trainer.train()
    trainer.save_model(output_dir)

# 데이터 증강 함수
def augment_text(text, model, tokenizer, mask_prob):
    tokens = tokenizer.tokenize(text)
    num_to_mask = max(1, int(len(tokens) * mask_prob))
    mask_indices = random.sample(range(len(tokens)), num_to_mask)

    # 원본 토큰 복사 및 마스크 처리
    original_tokens = tokens.copy()
    for idx in mask_indices:
        tokens[idx] = tokenizer.mask_token

    # 토큰을 입력으로 변환
    inputs = tokenizer.convert_tokens_to_ids(tokens)
    inputs = torch.tensor([inputs])

    # 모델 예측 수행
    with torch.no_grad():
        outputs = model(inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # 마스크된 위치만 대체
    for idx in mask_indices:
        original_tokens[idx] = tokenizer.decode([predictions[0][idx].item()], skip_special_tokens=True)

    # 디코딩하여 증강 텍스트 생성
    augmented_text = tokenizer.convert_tokens_to_string(original_tokens)
    return augmented_text.strip()

# 클래스 조건부 데이터 증강 및 저장 함수
def augment_and_save_by_class(source_folder, target_folder, class_column, mask_prob=0.15, augment_multiplier=1, fine_tune_epochs=3):
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    for file_name in os.listdir(source_folder):
        if file_name.startswith("None_"):
            file_number = file_name.split("_")[1]  # 파일 이름에서 번호 추출
            source_path = os.path.join(source_folder, file_name)

            # 파일 로드
            df = pd.read_csv(source_path, encoding="utf-8")

            # 클래스별 데이터 분리
            class_datasets = split_by_class(df, class_column)

            # 각 클래스별 미세 조정 및 증강
            augmented_dfs = []
            for cls, class_df in class_datasets.items():
                if "comments" in class_df.columns:
                    output_dir = f"./fine_tuned_models/{cls}"
                    fine_tune_model(class_df, tokenizer, model, output_dir, epochs=fine_tune_epochs)

                    # 미세 조정된 모델 로드
                    fine_tuned_model = ElectraForMaskedLM.from_pretrained(output_dir)

                    augmented_comments = []
                    for _ in range(augment_multiplier):
                        augmented_comments.extend(
                            class_df["comments"].apply(
                                lambda x: augment_text(x, fine_tuned_model, tokenizer, mask_prob) if isinstance(x, str) else x
                            )
                        )
                    augmented_df = class_df.copy()
                    augmented_df = augmented_df.loc[augmented_df.index.repeat(augment_multiplier)]
                    augmented_df["comments_ag"] = augmented_comments
                    augmented_dfs.append(augmented_df)

            # 증강된 데이터 합치기
            all_augmented_df = pd.concat(augmented_dfs, ignore_index=True)
            final_df = pd.concat([df, all_augmented_df], ignore_index=True)

            # 저장
            target_file_name = f"cMLM_{file_number}"
            target_path = os.path.join(target_folder, target_file_name)
            final_df.to_csv(target_path, index=False, encoding="utf-8-sig")

            # 로그 출력
            print(f"Processed {file_name}: Original={len(df)}, Augmented={len(all_augmented_df)}")

augment_and_save_by_class(source_folder, target_folder, class_column="hate", mask_prob=0.1, augment_multiplier=1, fine_tune_epochs=3)
print(f"Class-conditional data augmentation completed and saved to {target_folder}")


Step,Training Loss


Step,Training Loss


Step,Training Loss


Processed None_2.csv: Original=1000, Augmented=1000


Step,Training Loss


Step,Training Loss


Step,Training Loss


Processed None_4.csv: Original=1000, Augmented=1000


Step,Training Loss


Step,Training Loss


Step,Training Loss


Processed None_5.csv: Original=1000, Augmented=1000


Step,Training Loss


Step,Training Loss


Step,Training Loss


Processed None_3.csv: Original=1000, Augmented=1000


Step,Training Loss


Step,Training Loss


Step,Training Loss


Processed None_1.csv: Original=1000, Augmented=1000
Class-conditional data augmentation completed and saved to ./Data/cMLM


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import f1_score
import os

# 사용자 정의 데이터셋 클래스
class SimpleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# 경로 설정
train_dir = "./Data/cMLM"
test_file = "./Data/Test.csv"
summary_file = "./Data/Summary.csv"

# 모델 및 토크나이저 초기화
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # 3개 클래스
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 학습 및 평가 결과 저장
results = []

# 테스트 데이터 준비
test_data = pd.read_csv(test_file)
test_texts = test_data["comments"].tolist()
test_labels = test_data["hate"].map({"none": 0, "hate": 1, "offensive": 2}).tolist()
test_dataset = SimpleDataset(test_texts, test_labels, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 학습 루프
for train_file in sorted(os.listdir(train_dir)):
    if train_file.endswith(".csv"):
        train_data = pd.read_csv(os.path.join(train_dir, train_file))
        train_texts = train_data["comments"].tolist()
        train_labels = train_data["hate"].map({"none": 0, "hate": 1, "offensive": 2}).tolist()

        train_dataset = SimpleDataset(train_texts, train_labels, tokenizer, max_len=128)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # 옵티마이저 설정
        optimizer = AdamW(model.parameters(), lr=5e-5)

        # 학습
        model.train()
        for epoch in range(3):  # 3 에포크
            total_loss = 0
            for batch in train_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"{train_file} - Epoch {epoch + 1} - Loss: {total_loss:.4f}")

        # 평가
        model.eval()
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, axis=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        f1_micro = f1_score(all_labels, all_preds, average="micro")
        f1_macro = f1_score(all_labels, all_preds, average="macro")
        print(f"Test Results for {train_file}: F1 Micro = {f1_micro:.4f}, F1 Macro = {f1_macro:.4f}")
        results.append({"File": train_file, "F1 Micro": f1_micro, "F1 Macro": f1_macro})

# 결과 저장 또는 추가
if os.path.exists(summary_file):
    # 기존 파일이 있는 경우: 기존 데이터를 읽어서 새로운 데이터를 추가
    existing_results = pd.read_csv(summary_file)
    results_df = pd.concat([existing_results, pd.DataFrame(results)], ignore_index=True)
else:
    # 기존 파일이 없는 경우: 새로운 데이터만 저장
    results_df = pd.DataFrame(results)

results_df.to_csv(summary_file, index=False)
print(f"Summary updated and saved to {summary_file}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cMLM_1.csv - Epoch 1 - Loss: 67.2733
cMLM_1.csv - Epoch 2 - Loss: 65.4214
cMLM_1.csv - Epoch 3 - Loss: 63.0245
Test Results for cMLM_1.csv: F1 Micro = 0.3694, F1 Macro = 0.2579




cMLM_2.csv - Epoch 1 - Loss: 65.1144
cMLM_2.csv - Epoch 2 - Loss: 60.9672
cMLM_2.csv - Epoch 3 - Loss: 55.0326
Test Results for cMLM_2.csv: F1 Micro = 0.3843, F1 Macro = 0.3094




cMLM_3.csv - Epoch 1 - Loss: 63.9233
cMLM_3.csv - Epoch 2 - Loss: 55.3148
cMLM_3.csv - Epoch 3 - Loss: 46.7814
Test Results for cMLM_3.csv: F1 Micro = 0.3949, F1 Macro = 0.3455




cMLM_4.csv - Epoch 1 - Loss: 60.7660
cMLM_4.csv - Epoch 2 - Loss: 48.4462
cMLM_4.csv - Epoch 3 - Loss: 32.4035
Test Results for cMLM_4.csv: F1 Micro = 0.4183, F1 Macro = 0.4045




cMLM_5.csv - Epoch 1 - Loss: 60.4952
cMLM_5.csv - Epoch 2 - Loss: 42.7284
cMLM_5.csv - Epoch 3 - Loss: 25.4380
Test Results for cMLM_5.csv: F1 Micro = 0.4331, F1 Macro = 0.4276
Summary updated and saved to ./Data/Summary.csv
