<a href="https://colab.research.google.com/github/davechang-99/Dementia_Kor/blob/main/D_kor_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 패키지 설치 (Colab에서만 실행)
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 scikit-learn==1.3.2 transformers==4.40.1 librosa==0.10.1 matplotlib==3.7.1

Collecting torch==2.1.2
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.16.2
  Downloading torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.1.2
  Downloading torchaudio-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting transformers==4.40.1
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting librosa==0.10.1
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting matplotlib==3.7.1
  Downloading matplotlib-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.2)
  Downloadi

In [None]:
# Mount Google Drive at /content/drive (Colab only); DATA_DIR below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Required imports
import os
import random

import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTImageProcessor

In [None]:
# Configuration: hyper-parameters, paths and random seeds
SEED = 42
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 1e-4
SAMPLE_RATE = 16000  # Hz
AUDIO_MAX_LEN = SAMPLE_RATE * 10  # clips are padded / truncated to 10 seconds of samples
N_MELS = 128
DATA_DIR = '/content/drive/MyDrive/DATASET_ko'  # change to the actual dataset location
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every random number generator the notebook uses.
random.seed(SEED)
np.random.seed(SEED)
# The trailing semicolon stops the notebook from echoing the returned Generator object.
torch.manual_seed(SEED);

<torch._C.Generator at 0x79c4dfb45e30>

In [None]:
# Data preparation: collect audio file paths and their labels
def get_audio_paths_and_labels(data_dir):
    """Collect ``.wav`` files from the class sub-directories of ``data_dir``.

    Each immediate, non-hidden sub-directory is treated as one class: files in
    a directory named ``normal`` get label 0 and files in any other directory
    get label 1 (dementia). Entries are visited in sorted order so the result,
    and therefore any seeded split built from it, does not depend on the order
    in which the filesystem lists entries.

    Args:
        data_dir: Root directory containing one sub-directory per class.

    Returns:
        ``(paths, labels)``: two lists of equal length holding the file paths
        and the matching integer labels.
    """
    paths, labels = [], []
    for label_dir in sorted(os.listdir(data_dir)):
        # Hidden directories (e.g. ".ipynb_checkpoints") are tooling artefacts, not classes.
        if label_dir.startswith('.'):
            continue
        label_path = os.path.join(data_dir, label_dir)
        if not os.path.isdir(label_path):
            continue
        label = 0 if label_dir == 'normal' else 1  # normal=0, dementia=1
        for fname in sorted(os.listdir(label_path)):
            # Case-insensitive match so files such as "clip.WAV" are not silently dropped.
            if fname.lower().endswith('.wav'):
                paths.append(os.path.join(label_path, fname))
                labels.append(label)
    return paths, labels

# Gather every labelled clip, then hold out 20% of them for validation.
# stratify keeps the class ratio equal in both splits; random_state makes the split repeatable.
audio_paths, audio_labels = get_audio_paths_and_labels(DATA_DIR)
train_paths, val_paths, train_labels, val_labels = train_test_split(
    audio_paths, audio_labels, test_size=0.2, random_state=SEED, stratify=audio_labels
)

class AudioDataset(Dataset):
    """Audio clips served as log-mel spectrograms.

    Every clip is loaded at 16 kHz, zero-padded or truncated to
    ``AUDIO_MAX_LEN`` samples, optionally perturbed with Gaussian noise, and
    converted to an ``N_MELS``-band mel spectrogram in decibels.

    What ``__getitem__`` returns depends on ``mode``:

    * ``'cnn'``: ``(float32 tensor with a leading channel axis, long label tensor)``
      holding the z-scored spectrogram.
    * ``'vit'``: ``(image tensor, long label tensor)``. With a
      ``feature_extractor`` the spectrogram is rendered as an 8-bit,
      3-channel image and passed through it; without one the z-scored
      spectrogram is simply repeated on 3 channels.
    * ``'raw'``: ``(flattened z-scored spectrogram as a 1-D numpy array, label)``.

    Args:
        paths: Sequence of audio file paths.
        labels: Sequence of integer labels, aligned with ``paths``.
        mode: One of ``'cnn'``, ``'vit'`` or ``'raw'``.
        feature_extractor: Optional Hugging Face image processor used in
            ``'vit'`` mode.
        augment: When true, add Gaussian noise (std 0.005) to the waveform.

    Raises:
        ValueError: If ``mode`` is unknown or ``paths`` and ``labels`` differ
            in length.
    """

    _MODES = ('cnn', 'vit', 'raw')

    def __init__(self, paths, labels, mode='cnn', feature_extractor=None, augment=False):
        # Fail at construction time instead of silently yielding None from __getitem__.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        if len(paths) != len(labels):
            raise ValueError(
                f"paths and labels must have the same length, got {len(paths)} and {len(labels)}"
            )
        self.paths = paths
        self.labels = labels
        self.mode = mode
        self.feature_extractor = feature_extractor
        self.augment = augment

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        label = self.labels[idx]
        y, sr = librosa.load(path, sr=16000)
        # Force a fixed length so every spectrogram has the same number of frames.
        if len(y) < AUDIO_MAX_LEN:
            y = np.pad(y, (0, AUDIO_MAX_LEN - len(y)))
        else:
            y = y[:AUDIO_MAX_LEN]
        if self.augment:
            # Cast the noise to the signal's dtype so augmentation does not promote it to float64.
            noise = np.random.normal(0, 0.005, size=y.shape).astype(y.dtype)
            y = y + noise
        # librosa >= 0.10 accepts the signal only as the keyword argument y=.
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        if self.mode == 'vit' and self.feature_extractor is not None:
            # The image processor expects image-like input (uint8 in [0, 255] or floats in
            # [0, 1]). A z-scored spectrogram contains negative values, which its PIL-based
            # resize rejects, so render the dB values as 8-bit grey levels instead.
            lo, hi = mel_db.min(), mel_db.max()
            pixels = ((mel_db - lo) / (hi - lo + 1e-6) * 255.0).astype(np.uint8)
            image = np.stack([pixels] * 3, axis=-1)  # channels-last: (height, width, 3)
            image = self.feature_extractor(images=image, return_tensors='pt')['pixel_values'][0]
            return image, torch.tensor(label, dtype=torch.long)

        # Per-clip standardisation; the epsilon guards against a constant spectrogram.
        mel_db = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-6)
        if self.mode == 'cnn':
            return torch.tensor(mel_db, dtype=torch.float32).unsqueeze(0), torch.tensor(label, dtype=torch.long)
        if self.mode == 'vit':
            image = np.stack([mel_db] * 3, axis=0)
            return torch.tensor(image, dtype=torch.float32), torch.tensor(label, dtype=torch.long)
        # mode == 'raw': plain numpy features for scikit-learn.
        return mel_db.flatten(), label


In [None]:
# Model definition (CNN; the ViT and RandomForest models are created in later cells)
class SimpleCNN(nn.Module):
    """Three strided conv blocks followed by a two-layer classifier head.

    Expects input of shape ``(batch, 1, n_mels, n_frames)``. The size of the
    first fully connected layer is derived from the convolution
    hyper-parameters so that it always matches the flattened feature map.

    Args:
        n_mels: Number of mel bands (input height). ``None`` means ``N_MELS``.
        n_classes: Number of output classes.
        n_frames: Number of spectrogram frames (input width). ``None`` means
            ``1 + AUDIO_MAX_LEN // 512``, the frame count librosa's
            ``melspectrogram`` produces with its defaults (``hop_length=512``,
            ``center=True``) for a clip of ``AUDIO_MAX_LEN`` samples.
    """

    def __init__(self, n_mels=None, n_classes=2, n_frames=None):
        super().__init__()
        if n_mels is None:
            n_mels = N_MELS
        if n_frames is None:
            n_frames = 1 + AUDIO_MAX_LEN // 512
        self.conv1 = nn.Conv2d(1, 16, (5,5), stride=2, padding=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, (3,3), stride=2, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, (3,3), stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        # Each stride-2 conv roughly halves BOTH spatial axes; track the exact sizes
        # rather than assuming n_mels // 8 and an un-downsampled time axis.
        height, width = n_mels, n_frames
        for conv in (self.conv1, self.conv2, self.conv3):
            height = self._conv_out(height, conv, 0)
            width = self._conv_out(width, conv, 1)
        self.fc1 = nn.Linear(conv.out_channels * height * width, 128)
        self.fc2 = nn.Linear(128, n_classes)

    @staticmethod
    def _conv_out(size, conv, dim):
        """Output length along ``dim`` of ``conv`` for an input of length ``size`` (dilation 1)."""
        kernel = conv.kernel_size[dim]
        stride = conv.stride[dim]
        padding = conv.padding[dim]
        return (size + 2 * padding - kernel) // stride + 1

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)``."""
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = x.flatten(1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [None]:
# ViT: load the pretrained image processor and classifier from the Hugging Face Hub
VIT_CHECKPOINT = "google/vit-base-patch16-224-in21k"
# ViTImageProcessor supersedes the deprecated ViTFeatureExtractor; the variable keeps its
# old name because later cells pass it as `feature_extractor=`.
feature_extractor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)
# num_labels=2 attaches a freshly initialised 2-class head that must be fine-tuned.
vit_model = ViTForImageClassification.from_pretrained(VIT_CHECKPOINT, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training, evaluation and visualization helpers
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, device, save_path="best_model.pt"):
    """Train ``model`` and checkpoint the weights of the best validation epoch.

    Works with models that return a logits tensor as well as models that
    return an output object exposing ``.logits`` (e.g. Hugging Face
    classification models).

    Args:
        model: ``nn.Module`` to train; moved to ``device`` in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        device: Device the model and batches are moved to.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        The best validation accuracy seen (0.0 if no epoch improved on it).

    Note:
        The model is left with its last-epoch weights; load ``save_path`` to
        restore the best epoch.
    """
    model.to(device)
    best_acc = 0.0
    for epoch in range(epochs):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            # Unwrap output objects (which carry .logits); plain tensors pass through.
            logits = getattr(outputs, "logits", outputs)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                logits = getattr(outputs, "logits", outputs)
                preds = logits.argmax(1)
                correct += (preds == y).sum().item()
                total += y.size(0)
        # An empty validation loader yields accuracy 0.0 instead of a ZeroDivisionError.
        val_acc = correct / total if total else 0.0
        print(f"Epoch {epoch+1}, Val Accuracy: {val_acc:.4f}")
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), save_path)
    return best_acc

def eval_model(model, loader, device):
    """Run ``model`` over ``loader`` and collect labels, predictions and probabilities.

    Works with models that return a logits tensor as well as models that
    return an output object exposing ``.logits``.

    Args:
        model: ``nn.Module`` to evaluate; moved to ``device`` and put in eval mode.
        loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the model and inputs are moved to.

    Returns:
        ``(y_true, y_pred, y_prob)`` as numpy arrays: the labels, the argmax
        predictions, and the per-class softmax probabilities.
    """
    model.to(device)
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            outputs = model(x)
            # Unwrap output objects (which carry .logits); plain tensors pass through.
            logits = getattr(outputs, "logits", outputs)
            probs = nn.functional.softmax(logits, dim=1)
            preds = logits.argmax(1).cpu().numpy()
            y_true.extend(y.cpu().numpy())
            y_pred.extend(preds)
            y_prob.extend(probs.cpu().numpy())
    return np.array(y_true), np.array(y_pred), np.array(y_prob)

def train_rf(train_dataset, val_dataset):
    """Fit a random forest on ``train_dataset`` and score it on ``val_dataset``.

    Both datasets must yield ``(feature_vector, label)`` pairs when indexed.
    Prints the validation accuracy and ROC-AUC.

    Args:
        train_dataset: Indexable, sized dataset used for fitting.
        val_dataset: Indexable, sized dataset used for evaluation.

    Returns:
        ``(y_val, y_pred, y_prob)``: validation labels, predicted labels, and
        the per-class probabilities from ``predict_proba``.
    """
    def _materialize(dataset):
        # Index each sample exactly once: every access may decode an audio file,
        # so fetching features and labels separately would double the work.
        features, targets = [], []
        for i in range(len(dataset)):
            feature, target = dataset[i]
            features.append(feature)
            targets.append(target)
        return np.array(features), np.array(targets)

    X_train, y_train = _materialize(train_dataset)
    X_val, y_val = _materialize(val_dataset)
    rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    y_prob = rf.predict_proba(X_val)
    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob[:,1])
    print(f"RF ValAcc={acc:.4f}, ROC-AUC={auc:.4f}")
    return y_val, y_pred, y_prob

def plot_results(y_true, y_prob, title):
    """Plot the ROC curve of a binary classifier.

    Args:
        y_true: Array of true labels.
        y_prob: 2-D array of per-class probabilities; column 1 is used as the
            positive-class score.
        title: Figure title, also used as the legend prefix.

    Returns:
        None. Prints a notice and returns early when either input is empty.
    """
    if len(y_true) == 0 or len(y_prob) == 0:
        print("No data for plotting.")
        return
    # Imported here because roc_curve is not among the names imported at the top of the notebook.
    from sklearn.metrics import roc_curve
    scores = y_prob[:, 1]
    fpr, tpr, _ = roc_curve(y_true, scores)
    auc = roc_auc_score(y_true, scores)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'{title} ROC curve (AUC={auc:.2f})')
    # Diagonal = performance of a random classifier.
    ax.plot([0, 1], [0, 1], '--', color='gray')
    ax.set(xlabel='FPR', ylabel='TPR', title=title)
    ax.legend()
    plt.show()

In [None]:
# CNN: train on noise-augmented clips, validate on unaugmented clips, then plot the ROC curve.
train_cnn_ds = AudioDataset(train_paths, train_labels, mode='cnn', augment=True)
val_cnn_ds = AudioDataset(val_paths, val_labels, mode='cnn')
train_cnn_loader = DataLoader(train_cnn_ds, batch_size=BATCH_SIZE, shuffle=True)
val_cnn_loader = DataLoader(val_cnn_ds, batch_size=BATCH_SIZE)
cnn_model = SimpleCNN(n_mels=N_MELS)
# `criterion` is reused by the ViT cell further down.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)
train_model(cnn_model, train_cnn_loader, val_cnn_loader, criterion, optimizer, EPOCHS, DEVICE)
# NOTE(review): this evaluates the weights left after the final epoch; train_model saves the
# best epoch to "best_model.pt" but does not load it back into the model.
y_true_cnn, y_pred_cnn, y_prob_cnn = eval_model(cnn_model, val_cnn_loader, DEVICE)
plot_results(y_true_cnn, y_prob_cnn, "CNN")

TypeError: melspectrogram() takes 0 positional arguments but 1 positional argument (and 1 keyword-only argument) were given

In [None]:
# ViT: fine-tune the pretrained classifier on spectrogram images, then plot the ROC curve.
train_vit_ds = AudioDataset(train_paths, train_labels, mode='vit', feature_extractor=feature_extractor, augment=True)
val_vit_ds = AudioDataset(val_paths, val_labels, mode='vit', feature_extractor=feature_extractor)
train_vit_loader = DataLoader(train_vit_ds, batch_size=BATCH_SIZE, shuffle=True)
val_vit_loader = DataLoader(val_vit_ds, batch_size=BATCH_SIZE)
optimizer_vit = optim.Adam(vit_model.parameters(), lr=LEARNING_RATE)
# Reuses `criterion` (CrossEntropyLoss) created in the CNN cell, so that cell must run first.
# NOTE(review): train_model writes to the same "best_model.pt" as the CNN run, overwriting it.
train_model(vit_model, train_vit_loader, val_vit_loader, criterion, optimizer_vit, EPOCHS, DEVICE)
y_true_vit, y_pred_vit, y_prob_vit = eval_model(vit_model, val_vit_loader, DEVICE)
plot_results(y_true_vit, y_prob_vit, "ViT")

In [None]:
# RandomForest: fit on flattened spectrograms ('raw' mode) and plot the ROC curve.
# Training clips are noise-augmented; validation clips are not.
train_rf_ds = AudioDataset(train_paths, train_labels, mode='raw', augment=True)
val_rf_ds = AudioDataset(val_paths, val_labels, mode='raw')
y_true_rf, y_pred_rf, y_prob_rf = train_rf(train_rf_ds, val_rf_ds)
plot_results(y_true_rf, y_prob_rf, "RandomForest")

In [None]:
# Overall performance comparison report
def print_report(name, y_true, y_pred):
    """Print the classification report and confusion matrix for one model.

    Args:
        name: Model name shown in the report heading.
        y_true: True labels.
        y_pred: Predicted labels.
    """
    heading = f"\n{name} Classification Report:"
    print(heading)
    report_text = classification_report(y_true, y_pred)
    print(report_text)
    matrix = confusion_matrix(y_true, y_pred)
    print(f"Confusion Matrix:\n{matrix}")

# Report each model on the validation labels/predictions gathered by the cells above.
print_report("CNN", y_true_cnn, y_pred_cnn)
print_report("ViT", y_true_vit, y_pred_vit)
print_report("RandomForest", y_true_rf, y_pred_rf)