<a href="https://colab.research.google.com/github/chaeyoooo/capstondesign_voicefishing/blob/main/voicefishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import librosa
import soundfile as sf
import numpy as np
import pandas as pd

# 1) Configuration: input folder, output folder for trimmed audio, feature CSV path
SRC_DIR = 'voicefishing'
TRIM_DIR = os.path.join(SRC_DIR, 'trimmed_samples')
CSV_FEATURE = os.path.join(SRC_DIR, 'audio_features.csv')

# Create the output folder up front; exist_ok=True keeps re-runs of the cell harmless.
os.makedirs(name=TRIM_DIR, exist_ok=True)

# 2) Silence-trimming helper
def trim_silence(y, top_db=20):
    """Return ``y`` with leading and trailing silence removed.

    Thin wrapper around ``librosa.effects.trim``.

    Args:
        y: Audio signal, forwarded unchanged to ``librosa.effects.trim``.
        top_db: Silence threshold in dB, forwarded as ``top_db``.

    Returns:
        The trimmed signal, i.e. the first element of the pair returned by
        ``librosa.effects.trim``; the second element is discarded.
    """
    trimmed, _interval = librosa.effects.trim(y, top_db=top_db)
    return trimmed

# 3) Collect the audio files in SRC_DIR (.mp3 / .wav, extension matched
#    case-insensitively), sorted by name so the processing order is stable.
audio_files = sorted(
    entry
    for entry in os.listdir(SRC_DIR)
    if entry.lower().endswith(('.mp3', '.wav'))
)

features = []
# Output filenames already written during this run. The ASCII-only sanitising
# below maps every non-ASCII character (e.g. Hangul) to '_', so different
# source files can end up with the same "safe" name and would otherwise
# silently overwrite each other's trimmed WAV.
used_out_names = set()

for fname in audio_files:
    path = os.path.join(SRC_DIR, fname)
    # Load at the file's own sampling rate (sr=None disables resampling)
    y, sr = librosa.load(path, sr=None)
    # Strip leading/trailing silence
    y_trim = trim_silence(y)

    # Build a safe output filename: anything other than ASCII letters, digits,
    # '_' and '-' becomes '_', and the stem is capped at 50 characters.
    base = os.path.splitext(fname)[0]
    safe = re.sub(r'[^0-9A-Za-z_-]', '_', base)[:50]
    out_fname = f"{safe}_trimmed.wav"
    # Disambiguate collisions with a numeric suffix (_2, _3, ...). Names that
    # do not collide are left exactly as before. audio_files is sorted, so the
    # suffix assigned to a given source file is stable across runs.
    dup_index = 1
    while out_fname in used_out_names:
        dup_index += 1
        out_fname = f"{safe}_{dup_index}_trimmed.wav"
    used_out_names.add(out_fname)
    out_path  = os.path.join(TRIM_DIR, out_fname)

    # Write the trimmed signal as WAV at the original sampling rate
    sf.write(out_path, y_trim, sr, format='WAV')
    print(f"✔ Saved trimmed: {out_fname}")

    # 4) Feature extraction (all computed on the trimmed signal)
    duration_sec = len(y_trim) / sr
    # 13 MFCCs, each averaged along axis 1 -> one value per coefficient
    mfcc         = librosa.feature.mfcc(y=y_trim, sr=sr, n_mfcc=13).mean(axis=1)
    # Mean over the whole mel spectrogram -> a single scalar
    mel_spec     = librosa.feature.melspectrogram(y=y_trim, sr=sr).mean()
    # Mean RMS energy (rms() takes no sr argument)
    rmse         = librosa.feature.rms(y=y_trim).mean()
    # Pitch estimate: keep only piptrack entries whose magnitude is above the
    # median magnitude, then average them; 0.0 when nothing passes the filter.
    pitches, mags= librosa.piptrack(y=y_trim, sr=sr)
    pitch_vals   = pitches[mags > np.median(mags)]
    pitch_mean   = float(np.mean(pitch_vals)) if pitch_vals.size else 0.0

    feat = {
        'file': fname,
        'sr': sr,
        'duration_sec': duration_sec,
        'rmse': rmse,
        'pitch_mean': pitch_mean,
        'mel_spec': mel_spec
    }
    # Add one column per MFCC coefficient: mfcc_1 .. mfcc_13
    for i, c in enumerate(mfcc, 1):
        feat[f'mfcc_{i}'] = c

    features.append(feat)

# 5) Save the collected per-file features as a CSV (one row per audio file,
#    no index column).
df = pd.DataFrame(data=features)
df.to_csv(path_or_buf=CSV_FEATURE, index=False)
print(f"✔ Features saved to {CSV_FEATURE}")



✔ Saved trimmed: ___________________________________________________trimmed.wav
✔ Saved trimmed: ___________________________________________________trimmed.wav
✔ Saved trimmed: _____________________________________________1__trimmed.wav
✔ Saved trimmed: ____________________________________________trimmed.wav
✔ Saved trimmed: _______________________________OO__________________trimmed.wav
✔ Saved trimmed: _______________________________OO__________________trimmed.wav
✔ Saved trimmed: ___________________________________________________trimmed.wav
✔ Saved trimmed: ___________________________________________________trimmed.wav
✔ Saved trimmed: _______________KB__________________________________trimmed.wav
✔ Saved trimmed: 1__________________________________________________trimmed.wav
✔ Saved trimmed: 1_____________NR0509_______________________________trimmed.wav
✔ Saved trimmed: 10FF__trimmed.wav
✔ Saved trimmed: 11FF__trimmed.wav
✔ Saved trimmed: 12FF__trimmed.wav
✔ Saved trimmed: 13FF__tr