### Nyoba bikin prediksi aja dulu, tanpa pre-trained model dan preprocessing aneh-aneh

In [8]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

In [9]:
# Dataset locations, relative to the notebook's working directory.
base_dir = "./dataset/Synaptic"
disease_dir, healthy_dir = (
    os.path.join(base_dir, subdir) for subdir in ("Disease", "Healthy")
)

In [7]:
# Collect every .wav path together with its class label.
# os.listdir() returns entries in arbitrary, platform-dependent order, so each
# listing is sorted: the resulting file order (and therefore the seeded
# train/test split computed from it) is the same on every machine.
files, labels = [], []

# Disease classes: one sub-directory per disease; the directory name is the label.
for disease in sorted(os.listdir(disease_dir)):
    class_path = os.path.join(disease_dir, disease)
    if not os.path.isdir(class_path):
        continue  # ignore stray files sitting next to the class folders
    for fname in sorted(os.listdir(class_path)):
        # Case-insensitive match so recordings saved as ".WAV" are not silently dropped.
        if fname.lower().endswith(".wav"):
            files.append(os.path.join(class_path, fname))
            labels.append(disease)

# Healthy class: recordings sit directly inside healthy_dir.
for fname in sorted(os.listdir(healthy_dir)):
    if fname.lower().endswith(".wav"):
        files.append(os.path.join(healthy_dir, fname))
        labels.append("Healthy")

print(f"Total files: {len(files)}, Classes: {set(labels)}")

Total files: 2026, Classes: {'Lung_Fibrosis', 'Pneumonia', 'COPD', 'Plueral_Effusion', 'Bronchiolitis', 'Asthma', 'URTI', 'Bronchiectasis', 'Healthy', 'Bronchitis'}


In [10]:
# Tally how many recordings carry each label (requires `labels` to be built).
class_counts = Counter(labels)

print("Class distribution:")
for name in class_counts:
    print(f"{name}: {class_counts[name]}")

Class distribution:
Asthma: 111
Bronchiectasis: 105
Bronchiolitis: 164
Bronchitis: 53
COPD: 115
Lung_Fibrosis: 114
Plueral_Effusion: 52
Pneumonia: 160
URTI: 101
Healthy: 1051


In [11]:
# Feature extraction function
def extract_features(filepath, n_mfcc=20, sr=None):
    """Summarise one audio file as a fixed-length vector of mean MFCCs.

    Parameters
    ----------
    filepath
        Audio file to read; passed unchanged to ``librosa.load``.
    n_mfcc : int, default 20
        Number of MFCC coefficients to compute.
    sr : int or None, default None
        Sampling rate handed to ``librosa.load``. ``None`` keeps each file's
        native rate (the previous hard-coded behaviour); pass a number to have
        librosa resample every recording to that common rate.

    Returns
    -------
    numpy.ndarray
        The MFCC matrix averaged over its time axis (axis 1), i.e. one value
        per coefficient.

    Raises
    ------
    ValueError
        If the file decodes to zero samples.
    """
    signal, sample_rate = librosa.load(filepath, sr=sr)
    # Fail with a clear message instead of letting an empty signal turn into
    # an obscure downstream error or a NaN feature row.
    if signal.size == 0:
        raise ValueError(f"no audio samples in {filepath}")
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # Take mean over time (so it's a fixed-length vector)
    return np.mean(mfcc, axis=1)

# Build dataset: one feature row per readable file, with labels kept aligned.
feature_rows, kept_labels = [], []
for path, label in zip(files, labels):
    try:
        row = extract_features(path)
    except Exception as e:
        # Best effort: report the unreadable file and move on without its label.
        print(f"Error processing {path}: {e}")
    else:
        feature_rows.append(row)
        kept_labels.append(label)

X = np.array(feature_rows)
y = np.array(kept_labels)

print("Feature matrix shape:", X.shape)

Feature matrix shape: (2026, 20)


In [13]:
# Hold out 20% of the samples for testing; stratify=y is passed so the split
# is stratified on the class labels, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline: a 100-tree random forest on the mean-MFCC features, seeded for repeatability.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification Report:
                  precision    recall  f1-score   support

          Asthma       0.70      0.32      0.44        22
  Bronchiectasis       1.00      1.00      1.00        21
   Bronchiolitis       0.94      1.00      0.97        33
      Bronchitis       0.80      0.73      0.76        11
            COPD       0.93      0.61      0.74        23
         Healthy       0.89      1.00      0.94       211
   Lung_Fibrosis       1.00      0.74      0.85        23
Plueral_Effusion       0.82      0.90      0.86        10
       Pneumonia       0.93      0.88      0.90        32
            URTI       0.95      0.90      0.92        20

        accuracy                           0.90       406
       macro avg       0.90      0.81      0.84       406
    weighted avg       0.90      0.90      0.89       406

Confusion Matrix:
[[  7   0   0   2   1   9   0   1   2   0]
 [  0  21   0   0   0   0   0   0   0   0]
 [  0   0  33   0   0   0   0   0   0   0]
 [  0   0   0  