In [None]:
import os
import numpy as np
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib




In [None]:
def extract_mfcc_features(file_path, sr=16000, duration=2.5, n_mfcc=13):
    """Return the time-averaged MFCC vector for one audio file.

    The audio is loaded with ``librosa.load`` (limited to ``duration`` seconds
    and resampled to ``sr``), zero-padded at the end when it is shorter than
    the target length, and converted to MFCCs. The coefficients are then
    averaged over the frame axis so each file yields a single vector.

    Args:
        file_path: Path of the audio file to load.
        sr: Target sample rate in Hz passed to ``librosa.load``. ``None``
            keeps the file's native rate.
        duration: Seconds of audio to load and pad to. ``None`` loads the
            whole file and skips padding.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        NumPy array holding the per-coefficient mean over all frames, or
        ``None`` if the file could not be processed (the error is printed).
    """
    try:
        # Keep the rate librosa actually used: when sr is None this is the
        # file's native rate, which the padding and MFCC steps both need.
        y, actual_sr = librosa.load(file_path, sr=sr, duration=duration)

        if duration is not None:
            target_len = int(actual_sr * duration)
            if len(y) < target_len:
                # Right-pad with zeros so short clips cover the full window.
                y = np.pad(y, (0, target_len - len(y)))

        mfcc = librosa.feature.mfcc(y=y, sr=actual_sr, n_mfcc=n_mfcc)
        # Collapse the frame axis: one mean value per coefficient.
        return np.mean(mfcc.T, axis=0)
    except Exception as e:
        # Best-effort by design: report the bad file and let the caller skip it.
        print(f"Error processing {file_path}: {e}")
        return None


In [None]:
def load_dataset(base_path='data'):
    """Build the feature matrix and label vector from the class folders.

    Looks for ``.flac`` files (extension matched case-insensitively) in
    ``<base_path>/fluent`` (label 0) and ``<base_path>/stuttered`` (label 1),
    extracts features for each file with ``extract_mfcc_features`` and skips
    files for which that function returns ``None``.

    Args:
        base_path: Directory that contains the ``fluent`` and ``stuttered``
            sub-folders.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` stacks one feature vector per
        successfully processed file, ``y`` holds the matching integer labels.
        Both are empty when no file could be processed.

    Raises:
        OSError: If a class folder cannot be listed (for example
            ``FileNotFoundError`` when it does not exist).
    """
    labels = {'fluent': 0, 'stuttered': 1}
    X, y = [], []

    for label_name, label_id in labels.items():
        folder = os.path.join(base_path, label_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded split made from it) reproducible.
        for file in sorted(os.listdir(folder)):
            # Case-insensitive match so files named "*.FLAC" are not dropped.
            if not file.lower().endswith('.flac'):
                continue
            features = extract_mfcc_features(os.path.join(folder, file))
            if features is not None:
                X.append(features)
                y.append(label_id)

    # Explicit integer dtype keeps the labels integral even when empty.
    return np.array(X), np.array(y, dtype=int)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x327360 and 16000x64)

In [None]:
# Load one feature vector and one integer label (0 = fluent, 1 = stuttered) per file.
X, y = load_dataset()

# Fail early with a clear message instead of an opaque error from the splitter.
if len(X) == 0:
    raise ValueError("No samples were loaded; check that the dataset folders contain .flac files.")

# Stratify so train and test keep the same class ratio. Stratification needs
# at least two samples of every class, so fall back to a plain split otherwise.
class_counts = np.bincount(y, minlength=2)
stratify_labels = y if class_counts.min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Fixed seed keeps the forest reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Pass the label ids explicitly so the report still lines up with target_names
# when one class happens to be absent from the test split.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=["Fluent", "Stuttered"]))


In [None]:
# Single source of truth for the output file so the saved path and the
# confirmation message cannot drift apart.
MODEL_PATH = "rf_stutter_model.pkl"

# Persist the trained classifier to disk.
joblib.dump(clf, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")
