# Health Classifier Demo
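This notebook uses kagglehub to download the COUGHVID dataset, trains a `HealthClassifier` on healthy and sick cough recordings, and spot-checks its predictions.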


In [10]:
# Install kagglehub if it is not already importable.
# The install runs through `sys.executable -m pip` so the package lands in the
# environment of the running kernel; a bare `!pip` shell call resolves `pip`
# from PATH, which can belong to a different Python installation.
# check_call raises CalledProcessError if pip fails, so a broken install is
# reported here instead of surfacing later as an ImportError.
try:
    import kagglehub
except ImportError:
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "kagglehub"])

from health_classifier import HealthClassifier
from coughvid_dataset import CoughvidDataset
import os
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np

# Initialize the classifier.
# No model_path is passed here; this is the instance that the training cell
# fits via classifier.train(...) and that test_random_sample() predicts with.
classifier = HealthClassifier()


## Download and Prepare Dataset


In [None]:
# Smoke-test the Kaggle download on its own, before the dataset helper runs.
# NOTE(review): the captured output of this cell shows an 8.61 GB download.
# `path` is not used again in the visible cells -- confirm whether
# CoughvidDataset reuses this download or fetches the data separately.
import kagglehub

print("Testing direct kagglehub download...")
path = kagglehub.dataset_download("nasrulhakim86/coughvid-wav")
print("Path to dataset files:", path)

# Build the dataset handler and ask it for the two lists of audio files.
# limit=200 is passed to keep the demonstration small.
dataset = CoughvidDataset(data_dir="coughvid_data")
healthy_files, sick_files = dataset.prepare_data_for_training(limit=200)

print(f"Number of healthy audio files: {len(healthy_files)}")
print(f"Number of sick audio files: {len(sick_files)}")

# Plot the waveform of the first healthy recording, if there is one.
if len(healthy_files) > 0:
    sample_rate, audio = wavfile.read(healthy_files[0])

    # Sample index divided by the sample rate gives the time axis in seconds.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(np.arange(len(audio)) / sample_rate, audio)
    ax.set_title("Sample Healthy Cough Audio")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    fig.tight_layout()
    plt.show()


Testing direct kagglehub download...
Downloading from https://www.kaggle.com/api/v1/datasets/download/nasrulhakim86/coughvid-wav?dataset_version_number=1...


  0%|          | 15.0M/8.61G [00:08<1:47:06, 1.44MB/s]

In [None]:
# Fit the classifier on both classes and pass the path where the fitted model
# should be saved. The return value of train() is reported below as accuracy.
accuracy = classifier.train(
    healthy_audio_files=healthy_files,
    unhealthy_audio_files=sick_files,
    save_path='coughvid_classifier.joblib',
)

# Report the returned score with four decimal places.
print(f"Training complete with accuracy: {accuracy:.4f}")


In [None]:
# Function to test on a random sample
def test_random_sample(file_list, expected_label, rng=None):
    """Predict on one randomly chosen file and report whether it matched.

    The file is read with ``wavfile.read``, passed to the notebook-level
    ``classifier.predict``, and its waveform is plotted with the predicted
    status in the title.

    Args:
        file_list: Sequence of paths to WAV files to choose from.
        expected_label: Label the prediction is compared against. ``1`` is
            printed as "Sick"; any other value is printed as "Healthy".
        rng: Optional ``random.Random`` instance used to pick the file. Pass
            a seeded instance to make the selection reproducible. When
            omitted, the module-level ``random`` functions are used, which
            matches the previous behavior.

    Returns:
        The result of ``result['prediction'] == expected_label``.

    Raises:
        IndexError: If ``file_list`` is empty (raised by ``choice``).
    """
    import random

    # Select a random file, using the caller's generator when one is given
    chooser = random if rng is None else rng
    test_file = chooser.choice(file_list)

    # Load audio
    sample_rate, audio = wavfile.read(test_file)

    # Make prediction
    result = classifier.predict(audio, sample_rate)

    # Print results
    print(f"File: {os.path.basename(test_file)}")
    print(f"Expected: {'Sick' if expected_label == 1 else 'Healthy'}")
    print(f"Prediction: {result['status']}")
    print(f"Confidence: {result['confidence']:.2%}")

    # Visualize audio; sample index / sample rate gives time in seconds
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(np.arange(len(audio)) / sample_rate, audio)
    ax.set_title(f"Audio Sample - Predicted: {result['status']}")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")
    fig.tight_layout()
    plt.show()

    return result['prediction'] == expected_label

# Test on a few samples from each class.
# NOTE(review): healthy_files and sick_files are the same lists that were
# passed to classifier.train(...). Unless train() holds out data internally,
# this is a sanity check rather than a held-out accuracy estimate.
samples_per_class = 3
correct = 0
total = 0  # counted as samples are evaluated, so it cannot drift from the loops

for class_files, class_label in ((healthy_files, 0), (sick_files, 1)):
    if len(class_files) == 0:
        # test_random_sample() would raise IndexError on an empty list
        print(f"No files available for label {class_label}; skipping.")
        continue
    for _ in range(samples_per_class):
        correct += test_random_sample(class_files, class_label)
        total += 1

if total > 0:
    print(f"Accuracy on test samples: {correct/total:.2%}")
else:
    print("No samples were available to test.")


## Using a Pre-trained Model
We can also load the model saved during training (`coughvid_classifier.joblib`) and use it directly for prediction.


In [None]:
# Build a second classifier from the model file named in the training cell
pretrained_classifier = HealthClassifier(model_path='coughvid_classifier.joblib')

# Run a single prediction on the first sick recording, if the list is non-empty
if len(sick_files):
    test_file = sick_files[0]
    sample_rate, audio = wavfile.read(test_file)
    result = pretrained_classifier.predict(audio, sample_rate)

    # Two lines of output: predicted status, then confidence as a percentage
    print(
        f"Prediction using pre-trained model: {result['status']}",
        f"Confidence: {result['confidence']:.2%}",
        sep="\n",
    )

