In [None]:
# ------------------------------------------
# 1) Imports & Setup
# ------------------------------------------

import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# ------------------------------------------
# 2) Parameters
# ------------------------------------------

# Root folder holding one sub-directory per entry in `class_names`.
DATA_DIR = "nepali_dataset"

# Audio is resampled to this rate on load and cut into fixed-length windows.
SAMPLE_RATE = 16_000
CHUNK_DURATION = 1.5  # seconds
CHUNK_SIZE = int(CHUNK_DURATION * SAMPLE_RATE)  # samples per window

# (n_mels, n_frames) of every log-mel image fed to the network.
TARGET_SHAPE = (128, 128)

# Position in this list is the integer label assigned to the class.
class_names = ["fluent", "stutter"]

# ------------------------------------------
# 3) Audio to Log-Mel Conversion
# ------------------------------------------

def extract_log_mel(audio, sr):
    """Turn a 1-D audio signal into a fixed-size log-mel spectrogram.

    The signal is peak-normalised, converted to a mel power spectrogram with
    ``TARGET_SHAPE[0]`` mel bands, mapped to decibels, and finally padded
    (with zeros, at the bottom/right) or cropped so the result is exactly
    ``TARGET_SHAPE``.

    Args:
        audio: 1-D array of audio samples.
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        2-D array of shape ``TARGET_SHAPE`` holding dB values.
    """
    n_mels, n_frames = TARGET_SHAPE

    # The small epsilon keeps an all-zero (silent) signal from dividing by zero.
    peak = np.max(np.abs(audio))
    normalized = audio / (peak + 1e-6)

    mel_power = librosa.feature.melspectrogram(y=normalized, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_power)

    # Grow the image up to the target size where it falls short ...
    rows, cols = mel_db.shape
    missing_rows = max(0, n_mels - rows)
    missing_cols = max(0, n_frames - cols)
    padded = np.pad(mel_db, ((0, missing_rows), (0, missing_cols)), mode='constant')

    # ... and trim whatever exceeds it.
    return padded[:n_mels, :n_frames]

# ------------------------------------------
# 4) Load and Process Dataset
# ------------------------------------------

# Every sample is a log-mel image (X) with its integer class label (y).
# `chunk_ids` / `is_augmented` record which source chunk a sample came from
# and whether it is the untouched chunk or an augmented copy, so the split
# below can keep all variants of a chunk on the same side.
X = []
y = []
chunk_ids = []
is_augmented = []

# Seeded generator: makes the noise augmentation and the training-set
# shuffle reproducible across runs.
rng = np.random.default_rng(42)
next_chunk_id = 0

for label, class_name in enumerate(class_names):
    folder = os.path.join(DATA_DIR, class_name)
    if not os.path.isdir(folder):
        print(f"[WARN] Missing class folder, skipping: {folder}")
        continue

    # Sorted so chunk ids (and therefore the split) do not depend on the
    # arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith(".wav"):
            continue

        file_path = os.path.join(folder, file)
        try:
            audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
            total_samples = len(audio)

            # Non-overlapping windows; a trailing remainder shorter than
            # CHUNK_SIZE is dropped.
            for start in range(0, total_samples - CHUNK_SIZE + 1, CHUNK_SIZE):
                chunk = audio[start:start + CHUNK_SIZE]

                # First entry is always the clean chunk; augmented variants follow.
                features = [extract_log_mel(chunk, sr)]

                # Augment 1: Add noise
                noise = rng.normal(0, 0.005, chunk.shape)
                features.append(extract_log_mel(chunk + noise, sr))

                # Augment 2: Pitch shift
                try:
                    pitched = librosa.effects.pitch_shift(chunk, sr=sr, n_steps=2)
                    features.append(extract_log_mel(pitched, sr))
                except Exception as e:
                    # Best effort: report and keep the other variants.
                    print(f"[WARN] pitch shift failed for {file} @ sample {start}: {e}")

                # Augment 3: Time stretch
                # rate > 1 speeds the audio up, so the result is SHORTER than
                # CHUNK_SIZE; pad it back with silence instead of discarding it
                # (the old `len(stretched) >= CHUNK_SIZE` guard never passed).
                try:
                    stretched = librosa.effects.time_stretch(chunk, rate=1.1)
                    shortfall = CHUNK_SIZE - len(stretched)
                    if shortfall > 0:
                        stretched = np.pad(stretched, (0, shortfall))
                    features.append(extract_log_mel(stretched[:CHUNK_SIZE], sr))
                except Exception as e:
                    print(f"[WARN] time stretch failed for {file} @ sample {start}: {e}")

                # Append all bookkeeping together so the four lists can never
                # get out of step if a feature extraction above raises.
                for position, feature in enumerate(features):
                    # float32 up front keeps the peak memory of np.stack down.
                    X.append(feature.astype(np.float32))
                    y.append(label)
                    chunk_ids.append(next_chunk_id)
                    is_augmented.append(position > 0)
                next_chunk_id += 1

        except Exception as e:
            print(f"[ERROR] {file}: {e}")

if not X:
    raise RuntimeError(
        f"No audio chunks were loaded from {DATA_DIR!r}; expected .wav files of at "
        f"least {CHUNK_DURATION}s under the sub-folders {class_names}."
    )

# Finalize dataset (trailing axis is the single image channel)
X = np.stack(X)[..., np.newaxis]
y = np.array(y)
chunk_ids = np.array(chunk_ids)
is_augmented = np.array(is_augmented, dtype=bool)
print("Total samples:", len(X))

# ------------------------------------------
# 5) Train/Test Split
# ------------------------------------------

# Split on SOURCE CHUNKS, not on individual samples: otherwise augmented copies
# of one chunk end up in both train and test and the validation metrics are
# inflated by leakage. The test set keeps only the clean, un-augmented chunks.
# NOTE(review): chunks cut from the same recording can still fall on both
# sides; consider a per-file / per-speaker split if the dataset allows it.
source_rows = np.flatnonzero(~is_augmented)  # exactly one row per source chunk
train_chunk_ids, test_chunk_ids = train_test_split(
    chunk_ids[source_rows], test_size=0.2, stratify=y[source_rows], random_state=42
)

train_rows = rng.permutation(np.flatnonzero(np.isin(chunk_ids, train_chunk_ids)))
test_rows = np.flatnonzero(np.isin(chunk_ids, test_chunk_ids) & ~is_augmented)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]
print("Train samples:", len(X_train), "| Test samples (clean chunks only):", len(X_test))

# ------------------------------------------
# 6) CNN Model
# ------------------------------------------

# Three Conv -> BatchNorm -> MaxPool -> Dropout stages followed by a small
# dense head; the single sigmoid unit outputs the probability of class 1
# (`class_names[1]`).
model = tf.keras.Sequential([
    # Declare the input with an explicit Input object; passing `input_shape`
    # to the first Conv2D is discouraged for Sequential models and emits a
    # UserWarning on recent Keras versions.
    tf.keras.Input(shape=(TARGET_SHAPE[0], TARGET_SHAPE[1], 1)),

    tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2,2)),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2,2)),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2,2)),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ------------------------------------------
# 7) Train
# ------------------------------------------

# All three callbacks watch the validation loss (spelled out explicitly rather
# than relying on each callback's default).
callbacks = [
    # Stop after 8 epochs without improvement and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True),
    # Halve the learning rate after 4 stagnant epochs.
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=4, factor=0.5),
    # Keras 3 rejects a full-model checkpoint path ending in `.h5`
    # ("The filepath provided must end in `.keras`"), so use the native format.
    tf.keras.callbacks.ModelCheckpoint(
        "best_stutter_model.keras", monitor="val_loss", save_best_only=True
    )
]

# NOTE(review): the held-out split is used both for early stopping / checkpoint
# selection here and for the final report later, so the reported metrics are
# optimistic; carve out a separate validation split if an unbiased test score
# is needed.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=callbacks
)

# ------------------------------------------
# 8) Save and Plot
# ------------------------------------------

# Persist the trained model. The legacy HDF5 file name is kept unchanged
# because other tooling may look for it — TODO confirm whether the native
# `.keras` format could be used instead.
model.save("final_stutter_detection_flutter_cnn.h5")

# Plot training curves
# One explicit figure per metric: relying on pyplot's implicit "current figure"
# would draw the loss curves on top of the accuracy axes on backends where
# plt.show() does not close the figure.
curve_specs = [
    ("Accuracy", "accuracy_plot.png", [("accuracy", "Train Acc"), ("val_accuracy", "Val Acc")]),
    ("Loss", "loss_plot.png", [("loss", "Train Loss"), ("val_loss", "Val Loss")]),
]

for ylabel, out_path, curves in curve_specs:
    fig, ax = plt.subplots()
    for history_key, legend_label in curves:
        ax.plot(history.history[history_key], label=legend_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.set_title(f"Training vs. validation {ylabel.lower()}")
    ax.legend()
    ax.grid(True)
    # Save before showing: some backends clear the figure on show().
    fig.savefig(out_path)
    plt.show()
    plt.close(fig)

print("Training complete. Model and plots saved.")

Total samples: 12906


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: The filepath provided must end in `.keras` (Keras model format). Received: filepath=best_stutter_model.h5

In [None]:
# Sanity check on the training tensor produced by the train/test split.
print(X_train.shape)

In [None]:
# ------------------------------------------
# 11) Confusion Matrix & Classification Report
# ------------------------------------------

from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Hard class labels: sigmoid probability above 0.5 -> class 1, otherwise class 0
probabilities = model.predict(X_test)
y_pred = (probabilities > 0.5).astype(int).ravel()

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

# Render the matrix as a heat map, write it to disk, then display it
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Confusion Matrix")
disp.figure_.savefig("confusion_matrix.png")
plt.show()

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report)

In [None]:
# Repeats the earlier shape check. The trailing 1 is the channel axis that was
# appended with np.newaxis when X was assembled.
print(X_train.shape)  # should show (num_samples, height, width, 1)