In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

First dataset

In [2]:
path = "02 Classified data fluent_non fluent\\"
fluent = "fluent\\"
nonfluent = "non fluent\\"
accents = ["ArabMagribMP3", "IraqMP3", "JordanMP3", "SaudiMP3"]

In [3]:
# Function to summarize various MFCC features
def summarize(mfcc):
    summary = np.concatenate([
        np.mean(mfcc, axis=1),
        np.std(mfcc, axis=1),
        np.min(mfcc, axis=1),
        np.max(mfcc, axis=1)
    ])
    return summary

# Function to extract MFCC features, including deltas and delta-deltas
def extract_features(file_path, n_mfcc=13, sr=16000):
    try:
        y, sr = librosa.load(file_path, sr=sr)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_delta = librosa.feature.delta(mfcc, order=1)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
        feature = summarize(mfcc)
        feature_delta = summarize(mfcc_delta)
        feature_delta2 = summarize(mfcc_delta2)
        features = np.concatenate([feature, feature_delta, feature_delta2])
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [4]:
# Parameters for MFCC feature extraction
n_mfcc = 20
sr = 16000

In [5]:
# Extract features from mp3 files
records = []
for accent in accents:
    for filename in tqdm(os.listdir(path + fluent + accent)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(path, fluent, accent, filename)
            feats = extract_features(file_path, n_mfcc)
            if feats is not None:
                records.append([filename, "1"] + feats.tolist())
    for filename in tqdm(os.listdir(path + nonfluent + accent)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(path, nonfluent, accent, filename)
            feats = extract_features(file_path, n_mfcc)
            if feats is not None:
                records.append([filename, "0"] + feats.tolist())

100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:13<00:00,  2.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [00:07<00:00, 10.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [00:02<00:00, 14.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [00:04<00:00, 11.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:04<00:00, 12.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:06<00:00, 10.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [00:03<00:00, 15.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 103/103 [00:09<00:00, 10.79it/s]


In [6]:
# Summarize F0 features
def extract_f0(file_path, sr=16000):
    try:
        y, sr = librosa.load(file_path, sr=sr)
        f0, voiced_flag, _ = librosa.pyin(y=y, sr=sr,
                                          fmin=librosa.note_to_hz('C2'),
                                          fmax=librosa.note_to_hz('C7'))
        f0_voiced = f0[~np.isnan(f0)]
        if len(f0_voiced) > 0:
            f0_mean  = np.mean(f0_voiced)
            f0_std   = np.std(f0_voiced)
            f0_min   = np.min(f0_voiced)
            f0_max   = np.max(f0_voiced)
            f0_range = f0_max - f0_min
        else:
            f0_mean = f0_std = f0_min = f0_max = f0_range = 0.0
        voicing_ratio = np.mean(voiced_flag)
        features = np.array([
            f0_mean,
            f0_std,
            f0_min,
            f0_max,
            f0_range,
            voicing_ratio
        ])
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [7]:
# Summarize RMS features
def extract_rms(file_path):
    try:
        y, sr = librosa.load(file_path)
        rms = librosa.feature.rms(y=y).flatten()
        rms_mean = np.mean(rms)
        rms_std  = np.std(rms)
        rms_min  = np.min(rms)
        rms_max  = np.max(rms)
        silence_threshold = 0.1 * rms_max
        silence_ratio = np.mean(rms < silence_threshold)
        features = np.array([
            rms_mean,
            rms_std,
            rms_min,
            rms_max,
            silence_ratio
        ])
        return features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [8]:
# Extract F0 and RMS features
f0_records = []
rms_records = []
for accent in accents:
    for filename in tqdm(os.listdir(path + fluent + accent)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(path, fluent, accent, filename)
            f0_feats = extract_f0(file_path)
            rms_feats = extract_rms(file_path)
            if f0_feats is not None:
                f0_records.append([filename, "1"] + f0_feats.tolist())
            if rms_feats is not None:
                rms_records.append([filename, "1"] + rms_feats.tolist())
    for filename in tqdm(os.listdir(path + nonfluent + accent)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(path, nonfluent, accent, filename)
            f0_feats = extract_f0(file_path)
            rms_feats = extract_rms(file_path)
            if f0_feats is not None:
                f0_records.append([filename, "0"] + f0_feats.tolist())
            if rms_feats is not None:
                rms_records.append([filename, "0"] + rms_feats.tolist())

100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [04:24<00:00,  6.97s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [10:15<00:00,  8.32s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [03:48<00:00,  5.87s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [06:39<00:00,  7.68s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [05:55<00:00,  6.71s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [09:54<00:00,  9.43s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [05:14<00:00,  6.41s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 103/103 [16:14<00:00,  9.46s/it]


In [9]:
# Construct dataframe for F0 features
f0_features = ["f0_mean", "f0_std", "f0_min", "f0_max", "f0_range", "voicing_ratio"]
f0_columns = ['filename', 'fluent'] + f0_features
df_f0 = pd.DataFrame(f0_records, columns=f0_columns)
df_f0.head()

Unnamed: 0,filename,fluent,f0_mean,f0_std,f0_min,f0_max,f0_range,voicing_ratio
0,alg2 (kabyle1).mp3,1,249.655258,51.402558,65.406391,401.157895,335.751504,0.729487
1,mor1 (arabic16).mp3,1,123.014564,13.720698,84.821595,160.12185,75.300255,0.664557
2,mor13 (arabic199).mp3,1,254.295179,44.157383,175.625645,369.994423,194.368778,0.594268
3,mor15 (amazigh2).mp3,1,125.542651,19.172727,65.406391,210.065153,144.658762,0.704986
4,mor2 (arabic17).mp3,1,127.699402,11.174934,90.909535,176.643034,85.733499,0.69


In [10]:
# Construct dataframe for RMS features
rms_features = ["rms_mean", "rms_std", "rms_min", "rms_max", "rms_silence_ratio"]
rms_columns = ['filename', 'fluent'] + rms_features
df_rms = pd.DataFrame(rms_records, columns=rms_columns)
df_rms.head()

Unnamed: 0,filename,fluent,rms_mean,rms_std,rms_min,rms_max,rms_silence_ratio
0,alg2 (kabyle1).mp3,1,0.057814,0.04848,2.230768e-05,0.250718,0.340465
1,mor1 (arabic16).mp3,1,0.043067,0.026,5.113224e-07,0.143564,0.13908
2,mor13 (arabic199).mp3,1,0.03339,0.026803,1.820597e-07,0.144091,0.274918
3,mor15 (amazigh2).mp3,1,0.052135,0.039502,8.08093e-06,0.204652,0.261307
4,mor2 (arabic17).mp3,1,0.075353,0.061941,4.152006e-07,0.235493,0.280829


In [11]:
# Constract dataframe for MFCC features
stats = ["mean", "std", "min", "max"]
blocks = ["mfcc", "mfcc_delta", "mfcc_delta2"]
feature_columns = []
for block in blocks:
    for i in range(n_mfcc):
        for stat in stats:
            feature_columns.append(f"{block}_{i+1}_{stat}")
columns = ["filename", "fluent"] + feature_columns
df_mfcc = pd.DataFrame(records, columns=columns)
df_mfcc.head()

Unnamed: 0,filename,fluent,mfcc_1_mean,mfcc_1_std,mfcc_1_min,mfcc_1_max,mfcc_2_mean,mfcc_2_std,mfcc_2_min,mfcc_2_max,...,mfcc_delta2_18_min,mfcc_delta2_18_max,mfcc_delta2_19_mean,mfcc_delta2_19_std,mfcc_delta2_19_min,mfcc_delta2_19_max,mfcc_delta2_20_mean,mfcc_delta2_20_std,mfcc_delta2_20_min,mfcc_delta2_20_max
0,alg2 (kabyle1).mp3,1,-249.161224,82.340042,-0.009177,11.734279,-5.877758,3.598008,-13.824122,7.582335,...,4.158702,4.278359,4.318278,4.421778,3.077568,3.625714,4.804006,3.497211,5.426595,6.177319
1,mor1 (arabic16).mp3,1,-335.557739,36.862637,56.727058,32.562382,7.290244,-13.6953,-5.23685,3.017725,...,8.43731,8.465884,8.034334,7.336616,5.096448,5.487182,7.805944,5.404134,5.190119,4.98966
2,mor13 (arabic199).mp3,1,-346.90509,59.868484,20.586031,10.38027,-9.471201,10.675188,-5.354011,-19.770229,...,4.268238,6.112103,3.445806,4.803235,5.751132,4.804028,3.685207,4.457691,5.011315,5.532834
3,mor15 (amazigh2).mp3,1,-301.734253,113.287285,19.388758,22.271446,15.049973,-11.677243,5.139485,-3.861141,...,4.234007,3.85443,4.418768,3.523063,3.390488,3.573723,3.159323,3.018483,2.657592,2.592747
4,mor2 (arabic17).mp3,1,-324.137604,62.490353,50.893974,21.06472,3.662904,-5.984343,-2.097588,7.498139,...,5.912486,5.189554,5.346256,4.310682,4.553317,6.034519,5.878642,5.742945,6.040529,5.033487


In [12]:
df_tmp = pd.merge(df_mfcc, df_f0, on=['filename', 'fluent'], how='inner')
df_final = pd.merge(df_tmp, df_rms, on=['filename', 'fluent'], how='inner')

In [13]:
df_final.head()

Unnamed: 0,filename,fluent,mfcc_1_mean,mfcc_1_std,mfcc_1_min,mfcc_1_max,mfcc_2_mean,mfcc_2_std,mfcc_2_min,mfcc_2_max,...,f0_std,f0_min,f0_max,f0_range,voicing_ratio,rms_mean,rms_std,rms_min,rms_max,rms_silence_ratio
0,alg2 (kabyle1).mp3,1,-249.161224,82.340042,-0.009177,11.734279,-5.877758,3.598008,-13.824122,7.582335,...,51.402558,65.406391,401.157895,335.751504,0.729487,0.057814,0.04848,2.230768e-05,0.250718,0.340465
1,mor1 (arabic16).mp3,1,-335.557739,36.862637,56.727058,32.562382,7.290244,-13.6953,-5.23685,3.017725,...,13.720698,84.821595,160.12185,75.300255,0.664557,0.043067,0.026,5.113224e-07,0.143564,0.13908
2,mor13 (arabic199).mp3,1,-346.90509,59.868484,20.586031,10.38027,-9.471201,10.675188,-5.354011,-19.770229,...,44.157383,175.625645,369.994423,194.368778,0.594268,0.03339,0.026803,1.820597e-07,0.144091,0.274918
3,mor15 (amazigh2).mp3,1,-301.734253,113.287285,19.388758,22.271446,15.049973,-11.677243,5.139485,-3.861141,...,19.172727,65.406391,210.065153,144.658762,0.704986,0.052135,0.039502,8.08093e-06,0.204652,0.261307
4,mor2 (arabic17).mp3,1,-324.137604,62.490353,50.893974,21.06472,3.662904,-5.984343,-2.097588,7.498139,...,11.174934,90.909535,176.643034,85.733499,0.69,0.075353,0.061941,4.152006e-07,0.235493,0.280829


In [14]:
df_mfcc.to_csv('audio_mfcc_features.csv', index=False)
df_final.to_csv('audio_features.csv', index=False)

In [None]:
# Sample speech wavelengths
file_path1 = "02 Classified data fluent_non fluent\\fluent\\JordanMP3\\jor11.mp3"
file_path2 = "02 Classified data fluent_non fluent\\non fluent\\SaudiMP3\\ksa127.mp3"

y1, sr1 = librosa.load(file_path1, sr=sr)
t1 = np.arange(len(y1)) / sr1
y2, sr2 = librosa.load(file_path2, sr=sr)
t2 = np.arange(len(y2)) / sr2
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 4))
fig.suptitle("Speech Wavelengths", fontsize=16)

axes[0].plot(t1, y1)
axes[0].set_xlabel("Time (s)")
axes[0].set_ylabel("Amplitude")

axes[1].plot(t2, y2)
axes[1].set_xlabel("Time (s)")
axes[1].set_ylabel("Amplitude")

plt.tight_layout()
plt.show()

Second dataset

In [None]:
# Get select accents
path =  "01 All Accent detection data\\"
accents = ["iraq (irq)", "jordan (jor)", "saudi arabia (ksa)", "tunisia (tun)"]

In [None]:
# Preprocess all mp3 files
records = []
for accent in accents:
    for filename in tqdm(os.listdir(path + accent)):
        if filename.endswith(".mp3"):
            file_path = os.path.join(path, accent, filename)
            feats = extract_features(file_path, n_mfcc)
            if feats is not None:
                records.append([filename, ] + [accent] + feats.tolist())

In [None]:
# Constract dataframe
stats = ["mean", "std", "min", "max"]
blocks = ["mfcc", "mfcc_delta", "mfcc_delta2"]
feature_columns = []
for block in blocks:
    for i in range(n_mfcc):
        for stat in stats:
            feature_columns.append(f"{block}_{i+1}_{stat}")
columns = ["filename", "accent"] + feature_columns
df = pd.DataFrame(records, columns=columns)

In [None]:
df.head()

In [None]:
df.to_csv('accent_features.csv', index=False)