# **Installation**

In [1]:
!pip install audb




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# !pip install shap

In [2]:
!pip install opensmile




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# !pip install alibi

In [3]:
!pip install pandas




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!pip install numpy




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
!pip install seaborn




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install matplotlib




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import audb
import audiofile
import opensmile
from sklearn.utils import shuffle
import random
# import shap
# from alibi.explainers import ALE
# from alibi.explainers import plot_ale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline

# **Documentation**


mRMR explained (introductory article): https://freedium.cfd/https://towardsdatascience.com/mrmr-explained-exactly-how-you-wished-someone-explained-to-you-9cf4ed27458b

mRMR documentation (feature-engine): user guide at https://feature-engine.trainindata.com/en/1.8.x/user_guide/selection/MRMR.html and API reference at https://feature-engine.trainindata.com/en/1.8.x/api_doc/selection/MRMR.html#feature_engine.selection.MRMR

# **Functionals**

In [8]:
# Root folder holding the "HS" and "PD" sub-folders of .wav files.
# Set the PD_DATASET_DIR environment variable to run the notebook on another
# machine; the default is the original author's local path.
DATASET_DIR = os.environ.get(
    "PD_DATASET_DIR",
    r"C:\Users\Hp\Desktop\Lecture Files\DH604\mRMR\Italian a\Dataset_1s",
)
hs_path = os.path.join(DATASET_DIR, "HS")
pd_path = os.path.join(DATASET_DIR, "PD")

# openSMILE extractor shared by process_audio_folder: ComParE 2016 feature set,
# computed at the "Functionals" feature level.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)
# smile.feature_names

In [9]:
def process_audio_folder(folder_path, y_value, duration=10):
    """Extract openSMILE features for every ``.wav`` file in a folder.

    Each file is read with ``audiofile.read`` and passed to the notebook-level
    ``smile`` extractor. The resulting frame gets a leading ``file_name``
    column and a trailing ``Y`` column holding ``y_value``.

    Parameters
    ----------
    folder_path : str
        Directory that is scanned (non-recursively) for ``.wav`` files.
    y_value
        Label written to the ``Y`` column of every extracted row.
    duration : float, optional
        Forwarded to ``audiofile.read`` as its ``duration`` argument.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file feature frames with a fresh integer
        index. If no file could be processed, an empty frame with only the
        ``file_name`` and ``Y`` columns is returned instead of raising.
    """
    results = []
    # Sort the listing so the row order does not depend on the file system.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".wav"):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            signal, sampling_rate = audiofile.read(
                file_path,
                duration=duration,
                always_2d=True
            )
            features = smile.process_signal(signal, sampling_rate)
            features.insert(0, 'file_name', file_name)
            features['Y'] = y_value
            results.append(features)
        except Exception as e:
            # Best effort: report the failing file and keep going with the rest.
            print(f"Error processing {file_name}: {e}")

    if not results:
        # pd.concat([]) raises "No objects to concatenate"; return an empty
        # frame so the caller gets a usable (if empty) result.
        print(f"No .wav files could be processed in {folder_path}")
        print("Done")
        return pd.DataFrame(columns=['file_name', 'Y'])

    df_results = pd.concat(results, ignore_index=True)
    print("Done")
    return df_results

In [10]:
def concatenate_and_shuffle(df1, df2, random_state=None):
    """Stack two DataFrames and return their rows in random order.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to concatenate row-wise.
    random_state : int or numpy random generator, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        original unseeded behaviour; pass a fixed value for a reproducible
        row order.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df1`` and ``df2``, shuffled, with a fresh 0..n-1 index.
    """
    concatenated_df = pd.concat([df1, df2], ignore_index=True)
    # sample(frac=1) draws every row exactly once, i.e. a full permutation.
    shuffled_df = concatenated_df.sample(frac=1, random_state=random_state)
    return shuffled_df.reset_index(drop=True)

In [33]:
# Extract features for every .wav file under hs_path and label the rows Y = 0.
hs_df = process_audio_folder(hs_path, 0, duration=10)

Done


In [34]:
# Sanity check: (rows, columns) of the Y = 0 feature table.
hs_df.shape

(77, 6375)

In [35]:
# Extract features for every .wav file under pd_path and label the rows Y = 1.
pd_df = process_audio_folder(pd_path, 1, duration=10)

Done


In [40]:
# Sanity check: (rows, columns) of the Y = 1 feature table.
pd_df.shape

(77, 6375)

In [37]:
def balance_dataframes(hs_df, pd_df):
    """Down-sample the larger of two DataFrames to the size of the smaller.

    A frame that is already at the common (minimum) length is returned
    untouched; a longer one is replaced by a random subset of that length,
    drawn with ``random_state=42`` and re-indexed from 0.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(hs_df, pd_df)`` with equal numbers of rows.
    """
    target_rows = min(len(hs_df), len(pd_df))

    def _trim(frame):
        # Only frames longer than the target are sampled.
        if len(frame) <= target_rows:
            return frame
        subset = frame.sample(n=target_rows, random_state=42)
        return subset.reset_index(drop=True)

    return _trim(hs_df), _trim(pd_df)

In [38]:
# Equalise the two class sizes; both names are rebound to the balanced frames.
hs_df, pd_df = balance_dataframes(hs_df, pd_df)

In [39]:
# Merge both classes into one table with the rows in random order.
# NOTE(review): the shuffle is not seeded, so the row order (and the head()
# previews below) will differ between runs.
df_functionals = concatenate_and_shuffle(hs_df, pd_df)

In [41]:
# Preview the combined table (file_name first, label Y last).
df_functionals.head()

Unnamed: 0,file_name,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,...,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,Y
0,VA1cdaopmoe67M2605161906_1.wav,1.737388,0.316062,0.0,1.298277,1.387176,1.559847,0.088899,0.172671,0.26157,...,0.681077,0.831712,0.80122,15.750338,0.292815,50.18417,27.437109,37.83535,15.782556,1
1,VA1ACNUTCOC40M230320171125_1.wav,0.865035,0.031088,0.979275,0.72467,0.793456,0.934875,0.068786,0.141419,0.210205,...,0.610465,1.302754,1.316529,-19.28879,0.578008,68.996712,32.219372,63.904922,27.51194,0
2,VA1GBIAORVI48M230320171236_4.wav,0.345587,0.300518,0.725389,0.67284,0.728825,0.773324,0.055985,0.0445,0.100485,...,0.518003,1.119048,1.067203,14.69551,0.526193,46.71077,19.187614,44.479095,25.896019,0
3,VA1APNOTROC49M230320170928_1.wav,0.300496,0.07772,0.53886,0.2983,0.335848,0.397874,0.037548,0.062026,0.099575,...,0.576485,1.211085,1.200336,19.504444,0.514629,59.023403,27.411112,60.256046,33.831272,0
4,VA2MBAUROIN45M100220171008_1.wav,0.22808,0.0,0.658031,0.676202,0.703438,0.730181,0.027236,0.026742,0.053978,...,0.468196,1.001835,1.027505,-17.143257,0.489243,44.406948,21.239164,43.583591,21.970304,1


In [42]:
# Remove the identifier column so only features and the label Y remain.
# Reassigning with errors='ignore' keeps the cell safe to re-run; the in-place
# drop raised KeyError on a second execution.
df_functionals = df_functionals.drop(columns=['file_name'], errors='ignore')

# **Data Preprocessing for Functionals**

In [43]:
# Check row count, column count and dtypes after dropping file_name.
df_functionals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Columns: 6374 entries, audspec_lengthL1norm_sma_range to Y
dtypes: float32(6373), int64(1)
memory usage: 3.7 MB


In [44]:
# Preview two rows to confirm that file_name is gone.
df_functionals.head(2)

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,Y
0,1.737388,0.316062,0.0,1.298277,1.387176,1.559847,0.088899,0.172671,0.26157,0.032025,...,0.681077,0.831712,0.80122,15.750338,0.292815,50.18417,27.437109,37.83535,15.782556,1
1,0.865035,0.031088,0.979275,0.72467,0.793456,0.934875,0.068786,0.141419,0.210205,0.547266,...,0.610465,1.302754,1.316529,-19.28879,0.578008,68.996712,32.219372,63.904922,27.51194,0


In [23]:
# def remove_outliers_iqr(df, lower_bound=0.25, upper_bound=0.75):
#     filtered_df = df.copy()
#     for column in filtered_df.select_dtypes(include=['number']).columns:
#         Q1 = filtered_df[column].quantile(lower_bound)
#         Q3 = filtered_df[column].quantile(upper_bound)
#         IQR = Q3 - Q1
#         lower_limit = Q1 - 1.5* IQR
#         upper_limit = Q3 + 1.5 * IQR
#         filtered_df = filtered_df[(filtered_df[column] >= lower_limit) & (filtered_df[column] <= upper_limit)]
#     return filtered_df

In [24]:
# df_functionals = remove_outliers_iqr(df_functionals, lower_bound=0.1, upper_bound=0.9)

In [25]:
# df_functionals.head(2)

In [45]:
def normalize_dataframe(df):
    """Min-max scale every numeric column of ``df`` to the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each numeric column ``c`` is replaced by
        ``(c - c.min()) / (c.max() - c.min())``. Non-numeric columns are left
        unchanged. A constant column (max == min) becomes all zeros instead
        of the NaN that 0/0 would produce; NaN entries stay NaN.
    """
    normalized_df = df.copy()
    for column in normalized_df.select_dtypes(include=['number']).columns:
        values = normalized_df[column]
        min_val = values.min()
        span = values.max() - min_val
        # Guard against 0/0 for constant columns. An undefined span (all-NaN
        # column) is left as is so the column stays NaN.
        if pd.notna(span) and span == 0:
            span = 1
        normalized_df[column] = (values - min_val) / span
    return normalized_df


In [None]:
# def standardize_dataframe(df):
#     standardized_df = df.copy()
#     for column in standardized_df.select_dtypes(include=['number']).columns:
#         mean = standardized_df[column].mean()
#         std = standardized_df[column].std()
#         standardized_df[column] = (standardized_df[column] - mean) / std
#     return standardized_df

In [46]:
# Separate the class label (y) from the feature matrix (X).
y = df_functionals['Y']
X = df_functionals.drop(columns='Y')

In [47]:
# Min-max scale the features; y was split off above, so the label is not scaled.
# NOTE(review): min and max are computed over all rows, before any split.
X = normalize_dataframe(X)

In [48]:
# Preview the scaled features.
X.head()

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakRangeAbs,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope
0,0.440548,0.319372,0.0,0.548101,0.570526,0.540249,0.070401,0.145827,0.144089,0.006603,...,0.30101,0.946065,0.103556,0.088195,0.893773,0.154152,0.288522,0.309565,0.162983,0.086997
1,0.212391,0.031414,0.984375,0.259486,0.271727,0.279107,0.052885,0.11842,0.113944,0.2478,...,0.257187,0.841606,0.25588,0.246402,0.017781,0.689393,0.463041,0.376347,0.394275,0.222215
2,0.076534,0.303665,0.729167,0.233407,0.239201,0.211604,0.041737,0.033425,0.049552,0.284268,...,0.217799,0.704825,0.196474,0.169856,0.867402,0.592148,0.2563,0.194365,0.221927,0.203587
3,0.064741,0.078534,0.541667,0.044955,0.041429,0.054724,0.02568,0.048795,0.049018,0.114085,...,0.273941,0.791339,0.226237,0.210729,0.987628,0.570446,0.370521,0.309202,0.361902,0.295066
4,0.045802,0.0,0.661458,0.235099,0.226424,0.193577,0.016699,0.017853,0.022259,0.284491,...,0.173728,0.631144,0.15857,0.157667,0.07142,0.522802,0.234928,0.223014,0.213982,0.158331


In [49]:
# Keep only the feature columns that contain no NaN values.
X_na = X.dropna(axis="columns")

In [50]:
# Check how many feature columns survived the NaN filter.
X_na.shape

(154, 6373)

In [42]:
# X_na_ = X_na[np.random.choice(X_na.columns, 1000, replace=False)]

In [51]:
# Preview the NaN-free feature matrix that is passed to the selector.
X_na.head()

Unnamed: 0,audspec_lengthL1norm_sma_range,audspec_lengthL1norm_sma_maxPos,audspec_lengthL1norm_sma_minPos,audspec_lengthL1norm_sma_quartile1,audspec_lengthL1norm_sma_quartile2,audspec_lengthL1norm_sma_quartile3,audspec_lengthL1norm_sma_iqr1-2,audspec_lengthL1norm_sma_iqr2-3,audspec_lengthL1norm_sma_iqr1-3,audspec_lengthL1norm_sma_percentile1.0,...,mfcc_sma_de[14]_peakRangeAbs,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope
0,0.440548,0.319372,0.0,0.548101,0.570526,0.540249,0.070401,0.145827,0.144089,0.006603,...,0.30101,0.946065,0.103556,0.088195,0.893773,0.154152,0.288522,0.309565,0.162983,0.086997
1,0.212391,0.031414,0.984375,0.259486,0.271727,0.279107,0.052885,0.11842,0.113944,0.2478,...,0.257187,0.841606,0.25588,0.246402,0.017781,0.689393,0.463041,0.376347,0.394275,0.222215
2,0.076534,0.303665,0.729167,0.233407,0.239201,0.211604,0.041737,0.033425,0.049552,0.284268,...,0.217799,0.704825,0.196474,0.169856,0.867402,0.592148,0.2563,0.194365,0.221927,0.203587
3,0.064741,0.078534,0.541667,0.044955,0.041429,0.054724,0.02568,0.048795,0.049018,0.114085,...,0.273941,0.791339,0.226237,0.210729,0.987628,0.570446,0.370521,0.309202,0.361902,0.295066
4,0.045802,0.0,0.661458,0.235099,0.226424,0.193577,0.016699,0.017853,0.022259,0.284491,...,0.173728,0.631144,0.15857,0.157667,0.07142,0.522802,0.234928,0.223014,0.213982,0.158331


In [52]:
# Preview the labels; they share the row index of X_na.
y.head()

0    1
1    0
2    0
3    0
4    1
Name: Y, dtype: int64

In [53]:
# Confirm the class balance before feature selection.
y.value_counts()

Y
1    77
0    77
Name: count, dtype: int64

# **mRMR**

In [36]:
# !pip install feature_engine

In [54]:
from feature_engine.selection import MRMR

In [55]:
# mRMR selector: "MID" scoring, keep 30 features, classification target.
# random_state is fixed because the mutual-information estimate is stochastic;
# without it the selected features can change from run to run.
sel = MRMR(method="MID", max_features=30, regression=False, random_state=42)

In [56]:
# Fit the selector on the NaN-free, scaled features and the class labels.
sel.fit(X_na, y)

In [57]:
# Relevance score of every candidate feature, highest first.
mutual_information = (
    pd.DataFrame(
        {
            "Variable": sel.variables_,
            "mutual information": sel.relevance_,
        }
    )
    .sort_values(by="mutual information", ascending=False)
    .reset_index(drop=True)
)

In [58]:
# Show the 30 features with the highest relevance score.
mutual_information.head(30)

Unnamed: 0,Variable,mutual information
0,pcm_fftMag_spectralRollOff25.0_sma_quartile1,0.46791
1,pcm_fftMag_spectralRollOff25.0_sma_quartile2,0.457249
2,pcm_zcr_sma_peakMeanAbs,0.43931
3,pcm_fftMag_spectralRollOff25.0_sma_percentile99.0,0.421391
4,pcm_zcr_sma_percentile99.0,0.413409
5,pcm_zcr_sma_quartile3,0.4036
6,pcm_fftMag_spectralRollOff25.0_sma_amean,0.400594
7,pcm_fftMag_spectralRollOff25.0_sma_quartile3,0.398346
8,pcm_fftMag_spectralRollOff25.0_sma_lpgain,0.391357
9,pcm_fftMag_spectralRollOff25.0_sma_rqmean,0.388337


In [59]:
# Save the full relevance ranking to the working directory (row index omitted).
mutual_information.to_csv('mutual information.csv', index=False)

In [60]:
# sel.variables_: the variables considered by the feature selection procedure.
# Expected: 6373, i.e. every column of X_na.
len(sel.variables_)

6373

In [61]:
# sel.feature_names_in_: names of the features seen during fit.
# Expected: 6373.
len(sel.feature_names_in_)

6373

In [62]:
# sel.features_to_drop_: the features the selector discards.
# Expected: 6373 - 30 = 6343.
len(sel.features_to_drop_)

6343

In [63]:
# Names of the features kept by the selector.
selected_features = sel.get_feature_names_out()

In [64]:
# Expected: 30, matching max_features of the selector.
len(selected_features)

30

In [65]:
# List the selected feature names.
selected_features

['pcm_zcr_sma_percentile1.0',
 'pcm_zcr_sma_percentile99.0',
 'audspec_lengthL1norm_sma_de_lpc3',
 'pcm_zcr_sma_de_quartile1',
 'pcm_zcr_sma_de_iqr1-3',
 'audSpec_Rfilt_sma[8]_quartile2',
 'pcm_fftMag_spectralRollOff25.0_sma_quartile1',
 'pcm_fftMag_spectralRollOff25.0_sma_quartile2',
 'pcm_fftMag_spectralRollOff25.0_sma_skewness',
 'pcm_fftMag_spectralRollOff25.0_sma_segLenStddev',
 'pcm_fftMag_spectralRollOff90.0_sma_iqr1-2',
 'pcm_fftMag_spectralEntropy_sma_iqr1-2',
 'mfcc_sma[1]_percentile99.0',
 'mfcc_sma[3]_lpgain',
 'audSpec_Rfilt_sma_de[10]_segLenStddev',
 'audSpec_Rfilt_sma_de[11]_quartile3',
 'audSpec_Rfilt_sma_de[12]_skewness',
 'audSpec_Rfilt_sma_de[13]_lpc2',
 'pcm_fftMag_spectralRollOff25.0_sma_de_lpc0',
 'pcm_fftMag_spectralFlux_sma_de_segLenStddev',
 'pcm_fftMag_spectralEntropy_sma_de_iqr1-2',
 'pcm_fftMag_spectralEntropy_sma_de_iqr2-3',
 'pcm_fftMag_spectralSkewness_sma_de_lpgain',
 'logHNR_sma_quartile1',
 'pcm_zcr_sma_peakMeanAbs',
 'audSpec_Rfilt_sma[1]_stddevRising

In [66]:
# Write the selected feature names to a one-column CSV in the working directory.
selected_features_df = pd.DataFrame({"Selected Features": selected_features})
selected_features_df.to_csv("selected_features.csv", index=False)