# Feature Extraction with eGeMAPS (openSMILE)

In [None]:
import sox
import opensmile
import pandas as pd
import moviepy.editor as mp

### Extract audio from video recordings

In [None]:
# Open the recording inside a context manager so the clip's underlying
# reader resources are released once the audio track has been written.
with mp.VideoFileClip(r"file_name.mp4") as subject_id:
    # A video without an audio track exposes `audio` as None; fail with a
    # clear message instead of an AttributeError on None.
    if subject_id.audio is None:
        raise ValueError("file_name.mp4 has no audio track to extract")
    subject_id.audio.write_audiofile(r"file_name_result.mp3")
# NOTE(review): the feature-extraction function further down opens
# '<name>.wav' files, while this step writes an .mp3 - presumably a separate
# conversion step exists; confirm.

### Feature Extraction

For more info, go to https://github.com/audeering/opensmile-python

In [None]:
# Define a function to extract the acoustic features with openSMILE
def feature_extraction(files, csv_name=None, features=opensmile.FeatureSet.eGeMAPSv02):
    """Extract openSMILE functionals for a collection of audio files.

    Parameters
    ----------
    files : iterable of str
        File names WITHOUT extension; ``.wav`` is appended to each one
        before it is handed to openSMILE.
    csv_name : str, optional
        When given, the resulting table is also written to this path as
        tab-separated text.
    features : opensmile.FeatureSet, optional
        Feature set used to configure the extractor (eGeMAPSv02 by default).

    Returns
    -------
    pandas.DataFrame
        The per-file results concatenated into a single dataframe.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    # Materialise once so emptiness can be checked even for generators
    files = list(files)
    if not files:
        raise ValueError("feature_extraction() needs at least one file name")

    # Initialize the opensmile environment with the requested feature set
    # (previously the argument was ignored and eGeMAPSv02 was always used)
    smile = opensmile.Smile(
        feature_set=features,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

    # Extract the features of every file
    data = [smile.process_file(f"{file}.wav") for file in files]

    # Generate a single dataframe for all of the features
    df = pd.concat(data)

    # If given an output filename, save the dataframe as tab-separated text
    if csv_name is not None:
        df.to_csv(csv_name, sep="\t")

    return df

In [None]:
# Build, for each group, the list of audio file names (without extension):
# subjects 01-14 form group 1 and subjects 15-28 form group 2.
group1_files = [f"subj_{number:02d}" for number in range(1, 15)]
group2_files = [f"subj_{number:02d}" for number in range(15, 29)]

# Feature set passed to feature_extraction() through its `features` argument
featureset = opensmile.FeatureSet.eGeMAPSv02

In [None]:
# Run feature_extraction() on both groups of files and save each result as a
# tab-separated file.
# NOTE(review): the two outputs are written to different directories
# ("../files/" and "../Audio_analisi/") - confirm that this is intended.
df_group1 = feature_extraction(group1_files, "../files/ASDchildren.tsv", features=featureset)
df_group2 = feature_extraction(group2_files, "../Audio_analisi/TDchildren.tsv", features=featureset)

# The selection and visualisation cells refer to the two groups as
# df_ASD / df_TD (group 1 is saved as ASDchildren, group 2 as TDchildren),
# so expose the results under those names as well.
df_ASD = df_group1
df_TD = df_group2

# Selection

### Mann-Whitney U Test

In [None]:
from scipy.stats import mannwhitneyu

In [None]:
from scipy.stats import mannwhitneyu

# Carry out the Wilcoxon–Mann–Whitney test feature by feature: with
# axis=0 the two groups are compared separately for every column.
results = mannwhitneyu(df_ASD, df_TD, axis=0)

# Build a table with one row per feature and the columns
# "statistic" and "p-value"
results_df = pd.DataFrame(
    {"statistic": results.statistic, "p-value": results.pvalue},
    index=df_ASD.columns,
)

# Save the table as tab-separated text. The file is given a .tsv extension
# because the content is plain text, not an Excel workbook.
results_df.to_csv("../Audio_analysis/wilkoxon-mann-whitney.tsv", sep="\t")

# Visualisation

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Create a new dataframe with only the selected features for each group.
# Here, we report the features selected during our study as an example.
# The list is defined once so both groups are guaranteed to use exactly
# the same columns, in the same order.
selected_features = [
    'spectralFluxUV_sma3nz_amean',
    'shimmerLocaldB_sma3nz_stddevNorm',
    'HNRdBACF_sma3nz_amean',
    'shimmerLocaldB_sma3nz_amean',
    'HNRdBACF_sma3nz_stddevNorm',
    'slopeUV500-1500_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'loudness_sma3_percentile20.0',
    'jitterLocal_sma3nz_amean',
    'jitterLocal_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_stddevNorm',
    'spectralFluxV_sma3nz_amean',
    'spectralFlux_sma3_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'loudness_sma3_stddevRisingSlope',
    'slopeUV0-500_sma3nz_amean',
    'loudness_sma3_amean',
]
df_ASD_sel = df_ASD[selected_features]
df_TD_sel = df_TD[selected_features]

In [None]:
# Print the mean and the standard deviation of one feature for each group,
# e.g. 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope'
feature_name = 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope'
for mean_label, sd_label, group_frame in (
    ('ASD mean', 'ASD sd', df_ASD_sel),
    ('TD mean', 'TD sd', df_TD_sel),
):
    feature_values = group_frame[feature_name]
    print(mean_label, feature_values.mean())
    print(sd_label, feature_values.std())

In [None]:
# Plot the data distribution of one selected feature as a pair of boxplots
# (one box per group),
# e.g. 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope'
import seaborn as sns

# Take the feature column from each group and tag every row with its group
cf_ASD_pitch = df_ASD[['F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope']].assign(Subjects='ASD')
cf_TD_pitch = df_TD[['F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope']].assign(Subjects='TD')

# Concatenate the two dataframes
cdf_cf_pitch = pd.concat([cf_ASD_pitch, cf_TD_pitch])

# Rename the feature column to a readable label for the plot
newcols_cf_pitch = {'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope':
                    'Pitch falling slope (sd)'}
cdf_cf_pitch = cdf_cf_pitch.rename(columns=newcols_cf_pitch)

# Reshape to long format for the boxplot. `var_name` is passed as a single
# label (the columns are not a MultiIndex, so a list is not valid here) and
# `value_name` names the value column directly; the resulting columns are
# 'Subjects', 'Acoustic feature', 'Measure feature'.
mdf_pitch = pd.melt(cdf_cf_pitch, id_vars=['Subjects'],
                    var_name='Acoustic feature',
                    value_name='Measure feature')

boxplot_cf_pitch = sns.boxplot(x="Acoustic feature", y="Measure feature",
                               hue="Subjects", data=mdf_pitch, linewidth=2.0)
# Save the boxplot
plt.savefig('Pitch falling slope (sd).png')

### Outliers Visualisation

In [None]:
# Columns of the features that showed outliers; the same selection is taken
# from both group dataframes.
outlier_features = [
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'jitterLocal_sma3nz_stddevNorm',
    'F2frequency_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_stddevNorm',
    'shimmerLocaldB_sma3nz_stddevNorm',
    'slopeUV500-1500_sma3nz_amean',
]
df_ASD_outliers = df_ASD[outlier_features]
df_TD_outliers = df_TD[outlier_features]

# Stack the two selections row-wise into a single outliers dataframe
df_outliers = pd.concat([df_ASD_outliers, df_TD_outliers])

# Give each column a readable name for the visualisation
# (e.g., 'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope' becomes
# 'Pitch falling slope (sd)')
newcols_outliers = {
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope': 'Pitch falling slope (sd)',
}
df_outliers = df_outliers.rename(columns=newcols_outliers)

In [None]:
# Plot the data distribution curve for one outlier feature,
# then calculate the skew and kurtosis of the distribution and print them.

# Pick the feature to inspect from the combined outliers dataframe
# (this name was previously used without ever being defined).
df_outliers_pitch = df_outliers['Pitch falling slope (sd)']

df_outliers_pitch.plot(kind='density')
print('This distribution has skew', df_outliers_pitch.skew())
print('This distribution has kurtosis', df_outliers_pitch.kurt())