# 1.config/config.yaml

In [None]:
data:
  root: /Users/taehayeong/Desktop/data/deepvoice_data/data
  sample_rate: 16000
  n_mels: 80
  segment_frames: 300   # in frames (multiplied by hop_length to get samples)
  hop_length: 256

model:
  hidden_dim: 128
  latent_dim: 64
  num_layers: 2

training:
  batch_size: 8
  lr: 0.001
  epochs: 20
  save_path: checkpoints/lstm_ae.pth
  classifier_save_path: checkpoints/classifier.pth


# 2. src/data_split.py

In [None]:
import os
import random

def get_data(model="anomaly", root=None, seed=42):
    """Build speaker-disjoint train/test file lists for one of the two models.

    Speaker directories are the all-digit sub-directories of ``root``. They
    are shuffled with ``seed``; the first 100 become training speakers and the
    next 100 test speakers. Every ``.flac`` found below a speaker directory is
    a real sample (label 0); every ``.wav`` found below ``root/wavs`` is a
    fake sample (label 1).

    Args:
        model: ``"anomaly"`` or ``"classifier"``.
        root: Dataset root directory.
        seed: Seed applied to the global ``random`` generator. The same seed
            and directory contents always produce the same split.

    Returns:
        For ``"anomaly"``: ``(train, test)`` where ``train`` is a list of
        ``.flac`` paths and ``test`` is a shuffled list of ``(path, label)``.
        For ``"classifier"``: ``(train, val, test)``, each a shuffled list of
        ``(path, label)``; ``val`` is the last 20 % of the training pool.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    # Validate first so a typo fails before any directory is scanned.
    if model not in ("anomaly", "classifier"):
        raise ValueError("model must be 'anomaly' or 'classifier'")

    random.seed(seed)

    # os.listdir returns entries in arbitrary order; sorting before the seeded
    # shuffle keeps the split identical across runs, which matters because the
    # train and test scripts call this function independently.
    speaker_dirs = [
        os.path.join(root, d)
        for d in sorted(os.listdir(root))
        if d.isdigit() and os.path.isdir(os.path.join(root, d))
    ]
    random.shuffle(speaker_dirs)

    train_speakers = speaker_dirs[:100]
    test_speakers = speaker_dirs[100:200]

    def collect_files(top_dirs, suffix):
        # Recursively gather files with the given suffix, in sorted order.
        found = []
        for top in top_dirs:
            for r, _, files in os.walk(top):
                found.extend(
                    os.path.join(r, f) for f in files if f.endswith(suffix)
                )
        found.sort()
        return found

    train_flac = collect_files(train_speakers, ".flac")
    test_flac = collect_files(test_speakers, ".flac")

    # The fake samples do not depend on the speaker split, so scan them once.
    wav_root = os.path.join(root, "wavs")
    wavs = collect_files([wav_root], ".wav") if os.path.isdir(wav_root) else []

    # NOTE(review): the same fake files are used for both training and testing
    # of the classifier (this mirrors the original behaviour). That looks like
    # train/test leakage for label 1 -- confirm whether a disjoint split of
    # `wavs` is intended.
    test = [(p, 0) for p in test_flac] + [(p, 1) for p in wavs]

    # ---------------- ANOMALY ----------------
    if model == "anomaly":
        random.shuffle(test)
        return train_flac, test

    # ---------------- CLASSIFIER ----------------
    train = [(p, 0) for p in train_flac] + [(p, 1) for p in wavs]

    random.shuffle(train)
    random.shuffle(test)

    # Split by a positive index: with a negative index, n_val == 0 would turn
    # train[:-0] into an empty list and train[-0:] into the whole pool.
    n_val = int(0.2 * len(train))
    split = len(train) - n_val

    return train[:split], train[split:], test



# 4.src/dataset.py

In [None]:
import torch
from torch.utils.data import Dataset

class VoiceDataset(Dataset):
    """Fixed-length, non-overlapping segments cut from preprocessed waveforms.

    Every file is preprocessed once in the constructor and kept in
    ``wav_cache``. The dataset index enumerates the complete segments of
    ``segment_len`` samples in each waveform; a trailing remainder shorter
    than ``segment_len`` is not indexed.

    Args:
        file_list: Paths when ``mode == "anomaly_train"`` (label 0 is
            assumed), otherwise ``(path, label)`` pairs.
        preprocessor: Object whose ``preprocess(path)`` returns the waveform.
        feature_fn: Callable applied to each waveform segment on access.
        segment_len: Segment length in samples.
        mode: Controls what ``__getitem__`` returns (see there).
    """

    def __init__(self, file_list, preprocessor, feature_fn, segment_len, mode):
        self.feature_fn = feature_fn
        self.segment_len = segment_len
        self.mode = mode
        self.wav_cache = {}
        self.items = []

        unlabeled = mode == "anomaly_train"
        for wid, entry in enumerate(file_list):
            path, label = (entry, 0) if unlabeled else entry

            waveform = preprocessor.preprocess(path)
            self.wav_cache[path] = waveform

            # One index entry per complete segment; wid is the position of
            # the file in file_list and identifies the source recording.
            self.items.extend(
                (path, label, seg_idx, wid)
                for seg_idx in range(len(waveform) // segment_len)
            )

    def __len__(self):
        """Return the total number of indexed segments."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the features of segment ``idx``.

        Returns ``feat`` in ``"anomaly_train"`` mode, ``(feat, label, wid)``
        when the mode name contains ``"test"``, and ``(feat, label)``
        otherwise.
        """
        path, label, seg_idx, wid = self.items[idx]

        start = seg_idx * self.segment_len
        segment = self.wav_cache[path][start:start + self.segment_len]
        feat = self.feature_fn(segment)

        if self.mode == "anomaly_train":
            return feat
        if "test" in self.mode:
            return feat, label, wid
        return feat, label






# 5.src/utils.py

In [None]:
import yaml
import librosa
import numpy as np

def load_config(path):
    """Load a YAML configuration file and return its parsed content.

    The file is opened as UTF-8 explicitly so that non-ASCII text in the
    config (comments included) decodes the same way on every platform instead
    of depending on the locale's default encoding.

    Args:
        path: Path to the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)

def extract_logmel(wav, sr, n_mels, n_fft=1024, hop_length=256):
    """Compute a mel spectrogram of ``wav`` and convert it to decibels.

    Args:
        wav: Audio samples passed to librosa as ``y``.
        sr: Sample rate of ``wav``.
        n_mels: Number of mel bands.
        n_fft: FFT window size. Defaults to the previously hard-coded 1024.
        hop_length: Hop between successive frames in samples. Defaults to the
            previously hard-coded 256, so existing three-argument calls are
            unaffected; pass the configured value to keep it in sync with
            the segment length computation.

    Returns:
        The output of ``librosa.power_to_db`` (default reference and floor)
        applied to the mel spectrogram.
    """
    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    return librosa.power_to_db(mel)



# 6.src/preprocess.py

In [None]:
import librosa
import numpy as np

class AudioPreprocessor:
    """Load audio files at a fixed sample rate and peak-normalise them."""

    def __init__(self, sr):
        """Store the target sample rate ``sr`` used for every load."""
        self.sr = sr

    def preprocess(self, path):
        """Load ``path`` resampled to ``self.sr`` and scale it by its peak.

        Args:
            path: Audio file to load with ``librosa.load``.

        Returns:
            The waveform divided by its maximum absolute value (plus a small
            epsilon so silent audio does not divide by zero). A zero-length
            waveform is returned unchanged.
        """
        wav, _ = librosa.load(path, sr=self.sr)

        # np.max raises on an empty array; an empty recording has nothing to
        # normalise, so hand it back as-is and let the caller skip it.
        if wav.size == 0:
            return wav

        return wav / (np.max(np.abs(wav)) + 1e-9)




# 7.model.py

In [None]:
import torch
import torch.nn as nn

class LSTMAutoEncoder(nn.Module):
    """Sequence autoencoder: LSTM encoder, linear bottleneck, LSTM decoder.

    The encoder's final hidden state of its top layer is projected to a
    ``latent_dim`` vector, projected back to ``hidden_dim``, repeated for
    every timestep and fed to the decoder LSTM, whose hidden size is
    ``n_mels`` so its output is used directly as the reconstruction.

    NOTE(review): an LSTM's output is bounded to (-1, 1), while the training
    script feeds dB-scaled log-mel targets. Confirm whether the inputs should
    be normalised or an output projection added.
    """

    def __init__(self, n_mels=80, hidden_dim=128, latent_dim=64, num_layers=2):
        super().__init__()

        # Encoder: consumes one n_mels-dimensional frame per timestep.
        self.encoder = nn.LSTM(
            n_mels, hidden_dim, num_layers=num_layers, batch_first=True
        )

        # Bottleneck: hidden_dim -> latent_dim -> hidden_dim.
        self.to_latent = nn.Linear(hidden_dim, latent_dim)
        self.from_latent = nn.Linear(latent_dim, hidden_dim)

        # Decoder: emits n_mels values per timestep.
        self.decoder = nn.LSTM(
            hidden_dim, n_mels, num_layers=num_layers, batch_first=True
        )

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (B, n_mels, T).

        Returns:
            Reconstruction of shape (B, n_mels, T).
        """
        # Time-major per sample: (B, n_mels, T) -> (B, T, n_mels).
        frames = x.transpose(1, 2)

        # Only the final hidden states are needed: (num_layers, B, hidden_dim).
        _, (hidden, _) = self.encoder(frames)

        # Compress the top layer's state, then expand it again.
        latent = self.to_latent(hidden[-1])        # (B, latent_dim)
        expanded = self.from_latent(latent)        # (B, hidden_dim)

        # Feed the same vector to the decoder at every timestep.
        steps = frames.size(1)
        decoder_in = expanded.unsqueeze(1).repeat(1, steps, 1)

        decoded, _ = self.decoder(decoder_in)      # (B, T, n_mels)

        # Back to the input layout: (B, n_mels, T).
        return decoded.transpose(1, 2)


# 8.model2.py


In [None]:
# src/model2.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class DeepVoiceClassifier(nn.Module):
    """CNN front-end followed by a bidirectional LSTM and a linear head.

    Two conv/batch-norm/ReLU/max-pool stages halve both the mel and the time
    axis twice; the resulting feature maps are read as a sequence along time
    by a BiLSTM, and the output at the last timestep is classified.

    NOTE(review): at the last timestep the backward direction of a
    bidirectional LSTM has only seen the final frame. Confirm that reading
    ``out[:, -1]`` rather than the final hidden states is intended.
    """

    def __init__(self, n_mels=80, lstm_hidden=128, num_classes=2):
        super().__init__()

        def conv_stage(c_in, c_out):
            # 3x3 convolution that keeps the spatial size, then 2x2 pooling.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]

        # -------- CNN Encoder --------
        self.conv = nn.Sequential(*conv_stage(1, 16), *conv_stage(16, 32))

        # -------- BiLSTM --------
        # Per-timestep feature size: 32 channels x (n_mels // 4) mel bins.
        self.lstm = nn.LSTM(
            input_size=(n_mels // 4) * 32,
            hidden_size=lstm_hidden,
            batch_first=True,
            bidirectional=True,
        )

        # -------- Classifier --------
        # Forward and backward outputs are concatenated, hence the factor 2.
        self.fc = nn.Linear(lstm_hidden * 2, num_classes)

    def forward(self, x):
        """Return class logits.

        Args:
            x: Tensor of shape (B, 1, n_mels, T).

        Returns:
            Logits of shape (B, num_classes).
        """
        # CNN: (B, 1, n_mels, T) -> (B, C, n_mels // 4, T // 4)
        feature_maps = self.conv(x)

        # Time becomes the sequence axis; channels and mel bins are merged
        # into one feature vector per step: (B, T', C * F').
        sequence = feature_maps.permute(0, 3, 1, 2).flatten(start_dim=2)

        # BiLSTM over time, keeping only the output at the last timestep.
        lstm_out, _ = self.lstm(sequence)
        last_step = lstm_out[:, -1]

        return self.fc(last_step)


# 9.train.py

In [None]:
import torch, os
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import numpy as np
from src.data_split import get_data
from src.dataset import VoiceDataset
from src.preprocess import AudioPreprocessor
from src.utils import load_config, extract_logmel
from src.models import LSTMAutoEncoder

# Train the LSTM autoencoder on real speech only, then save its weights so the
# evaluation script can load them from cfg["training"]["save_path"].
cfg = load_config("config/config.yaml")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Anomaly mode returns (train paths, labelled test pairs); only the former is
# needed here.
train_files, _ = get_data("anomaly", cfg["data"]["root"])
prep = AudioPreprocessor(cfg["data"]["sample_rate"])

dataset = VoiceDataset(
    train_files,
    prep,
    # Each waveform segment becomes a float32 log-mel tensor.
    lambda x: torch.tensor(
        extract_logmel(x, cfg["data"]["sample_rate"], cfg["data"]["n_mels"]),
        dtype=torch.float32
    ),
    # Segment length in samples = frames per segment * hop length.
    cfg["data"]["segment_frames"] * cfg["data"]["hop_length"],
    "anomaly_train"
)

loader = DataLoader(dataset, batch_size=cfg["training"]["batch_size"], shuffle=True)

model = LSTMAutoEncoder(
    cfg["data"]["n_mels"],
    cfg["model"]["hidden_dim"],
    cfg["model"]["latent_dim"],
    cfg["model"]["num_layers"]
).to(device)

opt = torch.optim.Adam(model.parameters(), lr=cfg["training"]["lr"])
loss_fn = nn.MSELoss()
epoch_times = []

for e in range(cfg["training"]["epochs"]):
    start = time.time()
    running_loss = 0.0
    n_batches = 0

    for x in loader:
        x = x.to(device)
        # Reconstruction loss: the input is its own target.
        loss = loss_fn(model(x), x)
        opt.zero_grad()
        loss.backward()
        opt.step()

        running_loss += loss.item()
        n_batches += 1

    elapsed = time.time() - start
    epoch_times.append(elapsed)

    # Report the epoch mean rather than whichever batch happened to be last;
    # this also avoids an undefined `loss` when the loader is empty.
    mean_loss = running_loss / max(n_batches, 1)
    print(f"[Epoch {e+1}] loss={mean_loss:.4f}, time={elapsed:.2f}s")

total_train_time = sum(epoch_times)
print("Total AE train time:", total_train_time)

np.save("ae_train_times.npy", np.array(epoch_times))

# Persist the trained weights; the evaluation script loads this exact path.
save_path = cfg["training"]["save_path"]
os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
torch.save(model.state_dict(), save_path)

# src/train2.py

In [None]:
import torch, os
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import numpy as np
from src.data_split import get_data
from src.dataset import VoiceDataset
from src.preprocess import AudioPreprocessor
from src.utils import load_config, extract_logmel
from src.models2 import DeepVoiceClassifier

# Train the CNN + BiLSTM classifier on real (0) vs. fake (1) segments and save
# its weights to cfg["training"]["classifier_save_path"].
cfg = load_config("config/config.yaml")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Classifier mode returns (train, val, test); only the training pairs are
# used in this script.
train, _, _ = get_data("classifier", cfg["data"]["root"])
prep = AudioPreprocessor(cfg["data"]["sample_rate"])

dataset = VoiceDataset(
    train,
    prep,
    # Log-mel tensor with a leading channel axis for the Conv2d front-end.
    lambda x: torch.tensor(
        extract_logmel(x, cfg["data"]["sample_rate"], cfg["data"]["n_mels"]),
        dtype=torch.float32
    ).unsqueeze(0),
    # Segment length in samples = frames per segment * hop length.
    cfg["data"]["segment_frames"] * cfg["data"]["hop_length"],
    "classifier"
)

loader = DataLoader(dataset, batch_size=cfg["training"]["batch_size"], shuffle=True)

model = DeepVoiceClassifier(
    cfg["data"]["n_mels"],
    cfg["model"]["hidden_dim"]
).to(device)

opt = torch.optim.Adam(model.parameters(), lr=cfg["training"]["lr"])
loss_fn = nn.CrossEntropyLoss()

epoch_times = []

for e in range(cfg["training"]["epochs"]):
    start = time.time()
    correct = total = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = loss_fn(logits, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Accumulate training accuracy; without this the division below
        # would always be 0 / 0.
        correct += (logits.argmax(dim=1) == y).sum().item()
        total += y.size(0)

    elapsed = time.time() - start
    epoch_times.append(elapsed)

    acc = correct / total if total else 0.0
    print(f"[Epoch {e+1}] acc={acc:.4f}, time={elapsed:.2f}s")

np.save("clf_train_times.npy", np.array(epoch_times))

# Create the directory of the configured path rather than a fixed name, so a
# changed classifier_save_path still works.
save_path = cfg["training"]["classifier_save_path"]
os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
torch.save(model.state_dict(), save_path)


# 9.test.py

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from collections import defaultdict
import time
from src.data_split import get_data
from src.dataset import VoiceDataset
from src.preprocess import AudioPreprocessor
from src.utils import load_config, extract_logmel
from src.models import LSTMAutoEncoder
from eval import plot_roc

# Evaluate the trained autoencoder: score every test segment by its mean
# squared reconstruction error, aggregate per recording, and report ROC AUC.
cfg = load_config("config/config.yaml")
data_cfg, model_cfg = cfg["data"], cfg["model"]
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Only the labelled test pairs are needed; the training paths are discarded.
_, test = get_data("anomaly", data_cfg["root"])
prep = AudioPreprocessor(data_cfg["sample_rate"])


def to_logmel(segment):
    """Turn one waveform segment into a float32 log-mel tensor."""
    spec = extract_logmel(segment, data_cfg["sample_rate"], data_cfg["n_mels"])
    return torch.tensor(spec, dtype=torch.float32)


dataset = VoiceDataset(
    test,
    prep,
    to_logmel,
    # Segment length in samples = frames per segment * hop length.
    data_cfg["segment_frames"] * data_cfg["hop_length"],
    "anomaly_test",
)

# One segment per step, in order, so each score maps to a single recording.
loader = DataLoader(dataset, batch_size=1, shuffle=False)

model = LSTMAutoEncoder(
    data_cfg["n_mels"],
    model_cfg["hidden_dim"],
    model_cfg["latent_dim"],
    model_cfg["num_layers"],
).to(device)

state = torch.load(cfg["training"]["save_path"], map_location=device)
model.load_state_dict(state)
model.eval()

criterion = nn.MSELoss(reduction="none")

wav_scores = defaultdict(list)   # recording id -> segment scores
wav_labels = {}                  # recording id -> label

start_test = time.time()

with torch.no_grad():
    for x, label, wid in loader:
        x = x.to(device)
        elementwise_err = criterion(model(x), x)

        rec_id = wid.item()
        wav_scores[rec_id].append(elementwise_err.mean().item())
        wav_labels[rec_id] = label.item()

total_test_time = time.time() - start_test
print("Total AE test time:", total_test_time)

# Recording-level score: mean of its (up to) five highest segment scores.
final_scores = [
    np.mean(sorted(seg_scores, reverse=True)[:5])
    for seg_scores in wav_scores.values()
]
final_labels = [wav_labels[rec_id] for rec_id in wav_scores]

auc = plot_roc(np.array(final_scores), np.array(final_labels))
print("AUC:", auc)

np.save("ae_test_time.npy", np.array([total_test_time]))




# 10.test2.py

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from collections import defaultdict
import time
import numpy as np
from src.data_split import get_data
from src.dataset import VoiceDataset
from src.preprocess import AudioPreprocessor
from src.utils import load_config, extract_logmel
from src.models2 import DeepVoiceClassifier
from eval import plot_roc

# Evaluate the trained classifier: score every test segment by its predicted
# probability of class 1, aggregate per recording, and report ROC AUC.
cfg = load_config("config/config.yaml")
data_cfg = cfg["data"]
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Classifier mode returns (train, val, test); only the test pairs are used.
_, _, test = get_data("classifier", data_cfg["root"])
prep = AudioPreprocessor(data_cfg["sample_rate"])


def to_logmel_image(segment):
    """Turn one waveform segment into a float32 log-mel tensor with a leading channel axis."""
    spec = extract_logmel(segment, data_cfg["sample_rate"], data_cfg["n_mels"])
    return torch.tensor(spec, dtype=torch.float32).unsqueeze(0)


dataset = VoiceDataset(
    test,
    prep,
    to_logmel_image,
    # Segment length in samples = frames per segment * hop length.
    data_cfg["segment_frames"] * data_cfg["hop_length"],
    "classifier_test",
)

# One segment per step, in order, so each score maps to a single recording.
loader = DataLoader(dataset, batch_size=1, shuffle=False)

model = DeepVoiceClassifier(
    data_cfg["n_mels"],
    cfg["model"]["hidden_dim"],
).to(device)

state = torch.load(cfg["training"]["classifier_save_path"], map_location=device)
model.load_state_dict(state)
model.eval()

wav_scores = defaultdict(list)   # recording id -> segment probabilities
wav_labels = {}                  # recording id -> label

start_test = time.time()

with torch.no_grad():
    for x, label, wid in loader:
        x = x.to(device)
        class_probs = torch.softmax(model(x), dim=1)

        rec_id = wid.item()
        # Batch size is 1, so row 0 is the only sample; column 1 is class 1.
        wav_scores[rec_id].append(class_probs[0, 1].item())
        wav_labels[rec_id] = label.item()

total_test_time = time.time() - start_test
print("Total classifier test time:", total_test_time)

np.save("clf_test_time.npy", np.array([total_test_time]))

# Recording-level score: mean of its (up to) five highest segment scores.
final_scores = [
    np.mean(sorted(seg_scores, reverse=True)[:5])
    for seg_scores in wav_scores.values()
]
final_labels = [wav_labels[rec_id] for rec_id in wav_scores]

auc = plot_roc(np.array(final_scores), np.array(final_labels))
print("AUC:", auc)



# 10.eval.py / plot_distribution.py

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def plot_roc(scores, labels):
    """Draw the ROC curve for the given scores and return the area under it.

    Args:
        scores: Anomaly scores (higher = more anomalous).
        labels: Ground truth, 0 (normal) or 1 (fake).

    Returns:
        The ROC AUC computed from the curve.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(labels, scores)
    roc_auc = auc(fpr, tpr)

    _, ax = plt.subplots(figsize=(6, 6))

    curve_label = f"ROC curve (AUC = {roc_auc:.3f})"
    ax.plot(fpr, tpr, color="darkorange", lw=2, label=curve_label)
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], color="navy", lw=1, linestyle="--")

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve (DeepVoice Anomaly Detection)")
    ax.legend(loc="lower right")
    ax.grid(True)
    plt.show()

    return roc_auc

import numpy as np
import matplotlib.pyplot as plt

def plot_score_distribution(scores, labels):
    """Plot overlaid, density-normalised histograms of the scores per class.

    Args:
        scores: Array of anomaly scores, indexable by a boolean mask.
        labels: Array of the same length holding 0 (normal) or 1 (fake).
    """
    # Select each class's scores up front, before any figure is created.
    groups = [
        (scores[labels == cls], legend_name)
        for cls, legend_name in ((0, "Normal (Human)"), (1, "DeepVoice (Fake)"))
    ]

    _, ax = plt.subplots(figsize=(8, 5))

    for group_scores, legend_name in groups:
        ax.hist(
            group_scores,
            bins=50,
            alpha=0.6,
            label=legend_name,
            density=True,
        )

    ax.set_xlabel("Anomaly Score (Reconstruction Error)")
    ax.set_ylabel("Density")
    ax.set_title("Anomaly Score Distribution")
    ax.legend()
    ax.grid(True)
    plt.show()
