In [1]:
import os
import glob
import time
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, hilbert, find_peaks
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score
import seaborn as sns

def butter_bandpass_filter(data, lowcut=25.0, highcut=400.0, fs=1000, order=4):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    y = filtfilt(b, a, data)
    return y

def get_binary_label(fname):
    fname = fname.lower()
    if "normal" in fname:
        return 0  # Normal
    elif any(x in fname for x in ["murmur", "artifact", "extrahls"]):
        return 1  # Abnormal
    else:
        return None

def extract_mfcc(cycle, sr, n_mfcc=13, max_len=260):
    mfcc = librosa.feature.mfcc(y=cycle.astype(np.float32), sr=sr, n_mfcc=n_mfcc, n_fft=512, hop_length=128)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

# Load and Process Data
base_path = "/kaggle/input/heartbeat-sounds"
data_dirs = [os.path.join(base_path, "set_a"), os.path.join(base_path, "set_b")]
all_files = [file for folder in data_dirs for file in glob.glob(os.path.join(folder, "*.wav"))]

X_features = []
y_labels = []

for file_path in all_files:
    audio, sr = librosa.load(file_path, sr=None)
    if sr != 1000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=1000)
        sr = 1000

    audio = audio / np.max(np.abs(audio))
    filtered_audio = butter_bandpass_filter(audio, fs=sr)
    envelope = np.abs(hilbert(filtered_audio))
    peaks, _ = find_peaks(envelope, distance=int(0.4 * sr), height=np.mean(envelope) * 1.2)

    cardiac_cycles = []
    if len(peaks) >= 3:
        for i in range(len(peaks) - 2):
            start, end = peaks[i], peaks[i + 2]
            if end > start:
                cycle = filtered_audio[start:end]
                cardiac_cycles.append(cycle)

    label = get_binary_label(file_path)
    if label is not None:
        for cycle in cardiac_cycles:
            mfcc = extract_mfcc(cycle, sr)
            X_features.append(mfcc.flatten())
            y_labels.append(label)

X = np.array(X_features)
y = np.array(y_labels)

# Train/Test Split and Scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- RBF Grid ---
rbf_param_grid = {
    'C': list(np.linspace(0.001, 10, 10)),
    'gamma': list(np.linspace(0.001, 10, 10)),
    'kernel': ['rbf']
}

# --- Linear Grid ---
lin_param_grid = {
    'C': list(np.linspace(0.001, 10, 10)),
    'kernel': ['linear']
}

# --- Function to Run GridSearch and Save Results ---
def run_svm_grid(grid_params):
    grid = GridSearchCV(
        estimator=SVC(),
        param_grid=grid_params,
        cv=5,
        n_jobs=-1,
        verbose=1
    )
    grid.fit(X_train, y_train)

    result = []
    for params in grid.cv_results_['params']:
        model = SVC(**params)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        rec = recall_score(y_test, pred)
        result.append((acc, rec, params))

    return result

# --- Run Both Grids ---
rbf_results = run_svm_grid(rbf_param_grid)
lin_results = run_svm_grid(lin_param_grid)

# --- Format for CSV ---
all_svm_results = []

for acc, rec, params in rbf_results + lin_results:
    all_svm_results.append({
        'C': params['C'],
        'gamma': params['gamma'] if 'gamma' in params else None,
        'kernel': params['kernel'],
        'accuracy': acc,
        'recall': rec,
        'model_type': 'SVM'
    })

svm_df = pd.DataFrame(all_svm_results)
svm_df.to_csv("svm_all_results.csv", index=False)
print("\nAll SVM (RBF + Linear) results saved to: svm_all_results.csv")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits

All SVM (RBF + Linear) results saved to: svm_all_results.csv
