In [1]:
import os
import glob
import time
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, hilbert, find_peaks
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score, accuracy_score
import seaborn as sns

def butter_bandpass_filter(data, lowcut=25.0, highcut=400.0, fs=1000, order=4):
    """Band-pass a signal with a Butterworth filter applied via ``filtfilt``.

    Args:
        data: Signal samples to filter; passed unchanged to ``filtfilt``.
        lowcut: Lower cutoff frequency, in the same units as ``fs``.
        highcut: Upper cutoff frequency, in the same units as ``fs``.
        fs: Sampling rate of ``data``.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        The filtered signal returned by ``filtfilt``.
    """
    # butter() takes cutoffs expressed as fractions of the Nyquist rate (fs / 2).
    half_rate = 0.5 * fs
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(order, band_edges, btype='band')
    return filtfilt(numerator, denominator, data)

def get_binary_label(fname):
    """Map a recording's file name to a binary class label.

    Only the final path component is inspected, so keywords that happen to
    appear in a parent directory name (e.g. ``.../abnormal/...``) cannot
    influence the label.

    Args:
        fname: File name or full path of the recording.

    Returns:
        0 if the file name contains "normal"; 1 if it instead contains
        "murmur", "artifact" or "extrahls"; ``None`` when no keyword matches.
    """
    name = os.path.basename(fname).lower()
    # "normal" is tested first, so it wins if a name contains several keywords.
    if "normal" in name:
        return 0  # Normal
    if any(keyword in name for keyword in ("murmur", "artifact", "extrahls")):
        return 1  # Abnormal
    return None

def extract_mfcc(cycle, sr, n_mfcc=13, max_len=260):
    """Compute MFCCs for one segment and force a fixed number of frames.

    Args:
        cycle: Audio samples of the segment; cast to float32 before analysis.
        sr: Sampling rate passed to ``librosa.feature.mfcc``.
        n_mfcc: Number of coefficients requested from librosa.
        max_len: Exact length of the second axis of the returned array.

    Returns:
        The MFCC array with its second axis zero-padded on the right, or
        truncated, to exactly ``max_len`` entries.
    """
    coeffs = librosa.feature.mfcc(
        y=cycle.astype(np.float32),
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=512,
        hop_length=128,
    )
    n_frames = coeffs.shape[1]
    if n_frames >= max_len:
        # Long enough already: keep only the leading max_len frames.
        return coeffs[:, :max_len]
    # Too short: append zero-valued frames at the end.
    missing = max_len - n_frames
    return np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

# Load and Process Data
base_path = "/kaggle/input/heartbeat-sounds"
data_dirs = [os.path.join(base_path, "set_a"), os.path.join(base_path, "set_b")]
# glob returns matches in arbitrary, filesystem-dependent order. Sorting fixes the
# sample order so the seeded train/test split below is reproducible across runs.
all_files = sorted(file for folder in data_dirs for file in glob.glob(os.path.join(folder, "*.wav")))

X_features = []
y_labels = []

for file_path in all_files:
    # Resolve the label first: recordings without one contribute nothing, so
    # there is no point decoding and filtering them.
    label = get_binary_label(file_path)
    if label is None:
        continue

    # Load at the native rate, then bring everything to a common 1 kHz.
    audio, sr = librosa.load(file_path, sr=None)
    if sr != 1000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=1000)
        sr = 1000

    # Three peaks that are each at least `min_peak_distance` samples apart need
    # at least 2 * distance + 1 samples, so shorter clips can never yield a
    # cycle. Skipping them also keeps filtfilt away from inputs shorter than
    # its edge padding, which would raise.
    min_peak_distance = int(0.4 * sr)
    if audio.size < 2 * min_peak_distance + 1:
        continue

    # Peak-normalise; a silent or non-finite clip would otherwise divide by
    # zero and flood the pipeline with NaNs.
    peak_amplitude = np.max(np.abs(audio))
    if not np.isfinite(peak_amplitude) or peak_amplitude == 0:
        continue
    audio = audio / peak_amplitude

    filtered_audio = butter_bandpass_filter(audio, fs=sr)
    # Amplitude envelope = magnitude of the analytic signal.
    envelope = np.abs(hilbert(filtered_audio))
    peaks, _ = find_peaks(envelope, distance=min_peak_distance, height=np.mean(envelope) * 1.2)

    # Each segment runs from one envelope peak to the peak two positions later,
    # so consecutive segments overlap. Fewer than three peaks gives no segments.
    cardiac_cycles = [filtered_audio[start:end] for start, end in zip(peaks[:-2], peaks[2:])]

    for cycle in cardiac_cycles:
        mfcc = extract_mfcc(cycle, sr)
        X_features.append(mfcc.flatten())
        y_labels.append(label)

# One row per extracted segment; labels are aligned row-for-row with X.
X = np.array(X_features)
y = np.array(y_labels)

# Train/Test Split and Scaling
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# NOTE(review): rows are individual segments and one recording yields several
# (overlapping) segments, so segments of the same recording can end up on both
# sides of this split -- consider a group-aware split keyed on the source file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Standardisation statistics come from the training rows only and are then
# reused unchanged for the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Combined SVM Grid Search (Sigmoid + Poly) ---
# C and gamma each take 10 evenly spaced values in [0.001, 10]; combined with
# the two kernels that gives 10 * 10 * 2 = 200 candidate settings.
param_grid = {
    'C': list(np.linspace(0.001, 10, 10)),
    'gamma': list(np.linspace(0.001, 10, 10)),
    'kernel': ['sigmoid', 'poly']
}

svm = SVC()
# 5-fold cross-validation over every candidate, using all available cores.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# --- Training ---
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during a long fit (time.time() can jump).
start_train = time.perf_counter()
grid_search.fit(X_train, y_train)
end_train = time.perf_counter()
print(f"\n GridSearch Training Time: {end_train - start_train:.4f} seconds")

# --- Collect Accuracy & Recall for All Combinations ---
# Each entry is an (accuracy, recall, params) tuple for one candidate setting.
all_results = []
cv_results = grid_search.cv_results_

print("\n Collecting Accuracy & Recall for Each Hyperparameter Combination...")
# Only the parameter dicts are needed here; the cross-validated mean scores in
# cv_results are not used by this evaluation.
# NOTE(review): every candidate is refit on the full training rows and scored
# on the held-out rows, so these are test-set metrics. Picking hyperparameters
# from them would bias the held-out estimate.
for params in cv_results['params']:
    model = SVC(**params)
    model.fit(X_train, y_train)
    y_temp_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_temp_pred)
    rec = recall_score(y_test, y_temp_pred)
    all_results.append((acc, rec, params))

# --- Format & Save Results ---
# Flatten every (accuracy, recall, params) tuple into one table row.
result_rows = []
for test_acc, test_rec, combo in all_results:
    result_rows.append({
        'C': combo['C'],
        'gamma': combo['gamma'],
        'kernel': combo['kernel'],
        'accuracy': test_acc,
        'recall': test_rec,
        'model_type': 'SVM',
    })
df = pd.DataFrame(result_rows)

# Written to the current working directory, without the index column.
df.to_csv("svm_sigmoid_poly_grid_results.csv", index=False)
print("\nGrid search results saved to: svm_sigmoid_poly_grid_results.csv")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits

 GridSearch Training Time: 9124.0871 seconds

 Collecting Accuracy & Recall for Each Hyperparameter Combination...

Grid search results saved to: svm_sigmoid_poly_grid_results.csv
