In [1]:
import pandas as pd
import os

# 데이터 파일 경로
file_path = "./Data/Train.csv"

# 데이터 로드
data = pd.read_csv(file_path)

# 랜덤 샘플링 100개씩 5번 추출
output_dir = "./Data/None"
os.makedirs(output_dir, exist_ok=True)

for i in range(1, 6):  # 1부터 5까지
    sampled_data = data.sample(n=500, random_state=i)  # 랜덤 시드 고정
    output_file = os.path.join(output_dir, f"None_{i}.csv")
    sampled_data.to_csv(output_file, index=False)
    print(f"Sample {i} saved to {output_file}")

Sample 1 saved to ./Data/None/None_1.csv
Sample 2 saved to ./Data/None/None_2.csv
Sample 3 saved to ./Data/None/None_3.csv
Sample 4 saved to ./Data/None/None_4.csv
Sample 5 saved to ./Data/None/None_5.csv


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import f1_score
import os

# 사용자 정의 데이터셋 클래스
class SimpleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# 경로 설정
train_dir = "./Data/None"
test_file = "./Data/Test.csv"
summary_file = "./Data/Summary.csv"

# 모델 및 토크나이저 초기화
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7)
device = torch.device("cpu")
model.to(device)

# 학습 및 평가 결과 저장
results = []

# 테스트 데이터 준비
test_data = pd.read_csv(test_file)

# `document` 열 문자열로 변환 및 결측치 처리
test_data["document"] = test_data["document"].astype(str).fillna("")

test_texts = test_data["document"].tolist()
test_labels = test_data["label"].tolist()  # 'label' 열 사용
test_dataset = SimpleDataset(test_texts, test_labels, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 학습 루프
for train_file in sorted(os.listdir(train_dir)):
    if train_file.endswith(".csv"):
        train_data = pd.read_csv(os.path.join(train_dir, train_file))
        
        # `document` 열 문자열로 변환 및 결측치 처리
        train_data["document"] = train_data["document"].astype(str).fillna("")

        train_texts = train_data["document"].tolist()
        train_labels = train_data["label"].tolist()  # 'label' 열 사용

        train_dataset = SimpleDataset(train_texts, train_labels, tokenizer, max_len=128)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # 옵티마이저 설정
        optimizer = AdamW(model.parameters(), lr=5e-5)

        # 학습
        model.train()
        for epoch in range(3):  # 3 에포크
            total_loss = 0
            for batch in train_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"{train_file} - Epoch {epoch + 1} - Loss: {total_loss:.4f}")

        # 평가
        model.eval()
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, axis=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        f1_micro = f1_score(all_labels, all_preds, average="micro")
        f1_macro = f1_score(all_labels, all_preds, average="macro")
        print(f"Test Results for {train_file}: F1 Micro = {f1_micro:.4f}, F1 Macro = {f1_macro:.4f}")
        results.append({"File": train_file, "F1 Micro": f1_micro, "F1 Macro": f1_macro})

# 결과 저장
if os.path.exists(summary_file):
    # 기존 파일 불러오기
    existing_df = pd.read_csv(summary_file)
    # 기존 데이터에 새 결과 추가
    results_df = pd.concat([existing_df, pd.DataFrame(results)], ignore_index=True)
else:
    # 기존 파일이 없으면 새 DataFrame 생성
    results_df = pd.DataFrame(results)

# 업데이트된 결과 저장
results_df.to_csv(summary_file, index=False)
print(f"Summary saved to {summary_file}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


None_1.csv - Epoch 1 - Loss: 31.4989
None_1.csv - Epoch 2 - Loss: 30.7459
None_1.csv - Epoch 3 - Loss: 30.7331
Test Results for None_1.csv: F1 Micro = 0.1932, F1 Macro = 0.1362




None_2.csv - Epoch 1 - Loss: 30.9894
None_2.csv - Epoch 2 - Loss: 29.7358
None_2.csv - Epoch 3 - Loss: 28.7873
Test Results for None_2.csv: F1 Micro = 0.2334, F1 Macro = 0.2060




None_3.csv - Epoch 1 - Loss: 30.2929
None_3.csv - Epoch 2 - Loss: 29.0246
None_3.csv - Epoch 3 - Loss: 28.3471
Test Results for None_3.csv: F1 Micro = 0.2358, F1 Macro = 0.2041




None_4.csv - Epoch 1 - Loss: 30.0550
None_4.csv - Epoch 2 - Loss: 28.5622
None_4.csv - Epoch 3 - Loss: 27.2252
Test Results for None_4.csv: F1 Micro = 0.2563, F1 Macro = 0.2190




None_5.csv - Epoch 1 - Loss: 29.5941
None_5.csv - Epoch 2 - Loss: 27.6609
None_5.csv - Epoch 3 - Loss: 26.3435
Test Results for None_5.csv: F1 Micro = 0.2547, F1 Macro = 0.2149
Summary saved to ./Data/Summary.csv
