### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from algorithms import label_encode_columns, svm_model, accuracy_calculator, random_forest_model, array_column_spread, one_hot_encode_columns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Loading the dataset using Pandas
The data is found in development.csv (development set): a comma-separated values file containing the records from the development set. This portion does have the action and object columns, which you should use to obtain the labels to train and validate your models.
The dataset consists in a collection of audio file in a WAV format. 

Each record is characterized by several attributes. The following is a short description for each of them.
- path: the path of the audio file.
- speakerId: the id of the speaker.
- action: the type of action required through the intent.
- object: the device involved by intent.
- Self-reported fluency level: the speaking fluency of the speaker.
- First Language spoken: the first language spoken by the speaker.
- Current language used for work/school: the main language spoken by the speaker during daily activities.
- gender: the gender of the speaker.
- ageRange: the age range of the speaker.

In [2]:
df = pd.read_csv("dsl_data/development.csv")
df_eval = pd.read_csv("dsl_data/evaluation.csv")

In [None]:
df.columns

In [3]:
df2 = pd.read_csv("dsl_data/development.csv")
from sklearn.preprocessing import OneHotEncoder
y = df2.values[:,3:5].sum(axis=1)
enc = OneHotEncoder()
y = enc.fit_transform(y.reshape(-1, 1))
y_decoded = enc.inverse_transform(y)

In [4]:
columns = ["Self-reported fluency level ","First Language spoken", "Current language used for work/school", 'gender']


In [None]:
one_hot_encode_columns(df, columns)

In [5]:
from sklearn.preprocessing import OneHotEncoder
for column in columns:
    encoder = OneHotEncoder()
     # Fit and transform the data
    encoded_data = encoder.fit_transform(df[[column]])

    # Get the feature names
    feature_names = encoder.get_feature_names_out([column])

    # Create a new DataFrame with the encoded data
    encoded_df = pd.DataFrame(encoded_data.toarray(), columns=feature_names)

    # Drop the original column
    df.drop(column, axis=1, inplace=True)

    #update the dataframe with the encoded dataframe
    df = pd.concat([df, encoded_df], axis=1)

In [6]:
def audio_feature_extraction(df):
        data_array= []
        rate_array = []
        for audio in df['path']:
                data, rate = librosa.load(audio)
                data_array.append(data)
                rate_array.append(rate)

        df['data'] = data_array
        df['rate'] = rate_array

In [7]:
audio_feature_extraction(df)
audio_feature_extraction(df_eval)

In [8]:
from scipy.stats import skew, kurtosis

def time_domain(df, column):
    df[f'{column}_mean'] = df[column].apply(lambda x: np.mean(x))
    df[f'{column}_min'] = df[column].apply(lambda x: np.min(x))
    df[f'{column}_max'] = df[column].apply(lambda x: np.max(x))
    df[f'{column}_skew'] = df[column].apply(lambda x: skew(x))
    df[f'{column}_kurtosis'] = df[column].apply(lambda x: kurtosis(x))
    df[f'{column}_std'] = df[column].apply(lambda x: np.std(x))

### Chroma feature

In [9]:
def chroma_feature(df):
    chroma_array = []
    for data, rate in zip(df['data'], df['rate']):
        chroma = librosa.feature.chroma_stft(y=data, sr=rate)
        chroma_mean = np.mean(chroma, axis=1)
        chroma_array.append(chroma_mean)

    df['chroma'] = chroma_array

In [10]:
chroma_feature(df)
chroma_feature(df_eval)

In [11]:
time_domain(df, 'chroma')
time_domain(df_eval, 'chroma')

### Tonnetz feature

In [12]:
def tonnetz_feature(df):
    tonnetz_array = []
    for data, rate in zip(df['data'], df['rate']):
        tonnetz = librosa.feature.tonnetz(y=data, sr=rate)
        tonnetz_mean = np.mean(tonnetz, axis=1)
        tonnetz_array.append(tonnetz_mean)

    df['tonnetz'] = tonnetz_array

In [None]:
tonnetz_feature(df)
tonnetz_feature(df_eval)

In [14]:
time_domain(df, 'tonnetz')
time_domain(df_eval, 'tonnetz')

### Spectral Contrast

In [15]:
def spectral_contrast(df):
    spectral_contrast_array = []
    for data, rate in zip(df['data'], df['rate']):
        spectral_contrast = librosa.feature.spectral_contrast(y=data, sr=rate)
        spectral_contrast_mean = np.mean(spectral_contrast, axis=1)
        spectral_contrast_array.append(spectral_contrast_mean)
    df['spectral_contrast'] = spectral_contrast_array    

In [16]:
spectral_contrast(df)
spectral_contrast(df_eval)


In [17]:
time_domain(df,'spectral_contrast')
time_domain(df_eval,'spectral_contrast')

### Root Mean Square Energy(RMSE)

In [18]:
def rmse_feature(df):
    # create an empty list to store the RMSE values
    rmse_list = []

    # iterate through the audio files in the dataset
    for data, rate in zip(df['data'], df['rate']):
        # calculate the root mean square energy
        rmse = librosa.feature.rms(y=data)
        # append the rmse mean to the rmse_list
        rmse_list.append(rmse[0])

    # add the rmse_list as a new column to the dataframe
    df['rmse'] = rmse_list

In [19]:
rmse_feature(df)
rmse_feature(df_eval)

In [20]:
time_domain(df, 'rmse')
time_domain(df_eval, 'rmse')

### Spectral Flatness (SF)

In [21]:
def spectral_flatness(df):
    # create an empty list to store the SF values
    sf_list = []

    # iterate through the audio files in the dataset
    for data, rate in zip(df['data'], df['rate']):
        # calculate the spectral flatness
        sf = librosa.feature.spectral_flatness(y=data)
        # append the SF mean to the sf_list
        sf_list.append(sf[0])

    # add the sf_list as a new column to the dataframe
    df['sf'] = sf_list

In [22]:
spectral_flatness(df)
spectral_flatness(df_eval)

In [23]:
time_domain(df,'sf')
time_domain(df_eval,'sf')

### Spectral Roll-off (SRO)

In [24]:
def sro_feature(df):
    # Create an empty list to store the spectral roll-off values
    spectral_rolloff_array = []

    for data, rate in zip(df['data'], df['rate']):
        spectral_rolloff = librosa.feature.spectral_rolloff(y=data, sr=rate)
        spectral_rolloff_array.append(spectral_rolloff[0])

    # Add the spectral roll-off values to the dataframe as a new column
    df['spectral_rolloff'] = spectral_rolloff_array

In [25]:
sro_feature(df)
sro_feature(df_eval)

In [27]:
time_domain(df,'spectral_rolloff')
time_domain(df_eval,'spectral_rolloff')

### Zero-Crossing Rate

In [28]:
def zcr_feature(df):
    # Create arrays to store the zero-crossing rate values
    zero_crossing_rate_array = []

    for data in df['data']:
        
        # Compute the zero-crossing rate for the current audio file
        zero_crossing_rate = sum(librosa.zero_crossings(data))
        # Append the zero-crossing rate to the zero_crossing_rate_array
        zero_crossing_rate_array.append(zero_crossing_rate)

    # Add the zero-crossing rate arrays as new columns in the dataframe
    df['zero_crossing_rate'] = zero_crossing_rate_array

In [29]:
zcr_feature(df)
zcr_feature(df_eval)

### Mel-frequency cepstral coefficients (MFCC)

In [30]:
def mfcc_feature(df):
    # Create arrays to store the mfcc rate values
    mfcc_array = []

    for data, rate in zip(df['data'], df['rate']):
        
        # Compute the mfccs for the current audio file
        mfcc = librosa.feature.mfcc(y=data, sr=rate, n_mfcc=50)
        # Compute the mean of the mfccs and append it to the mfcc_array
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_array.append(mfcc_mean)

    # Add the mfcc as a new column in the dataframe
    df['mfcc'] = mfcc_array

In [31]:
mfcc_feature(df)
mfcc_feature(df_eval)

In [32]:
time_domain(df,'mfcc')
time_domain(df_eval,'mfcc')

In [None]:
df.columns

In [37]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
 
 # Using .fit_transform function to fit label
# encoder and return encoded label
label = le.fit_transform(df['ageRange'])
# removing the column 'Purchased' from df
# as it is of no use now.
df.drop('ageRange', axis=1, inplace=True)
    
# Appending the array to our dataFrame
# with column name 'Purchased'
df['ageRange'] = label

In [39]:

features = [ 'ageRange',
       'Self-reported fluency level _advanced',
       'Self-reported fluency level _basic',
       'Self-reported fluency level _intermediate',
       'Self-reported fluency level _native',
       'First Language spoken_English (Canada)',
       'First Language spoken_English (United States)',
       'First Language spoken_French (Canada)',
       'First Language spoken_Spanish (Venezuela)',
       'First Language spoken_Telugu',
       'Current language used for work/school_English (Australia)',
       'Current language used for work/school_English (Canada)',
       'Current language used for work/school_English (United States)',
       'Current language used for work/school_Spanish (Venezuela)',
       'gender_female', 'gender_male', 'chroma_mean',
       'chroma_min', 'chroma_max', 'chroma_skew', 'chroma_kurtosis',
       'chroma_std','tonnetz_mean', 'tonnetz_min', 'tonnetz_max',
       'tonnetz_skew', 'tonnetz_kurtosis', 'tonnetz_std',
       'spectral_contrast_mean', 'spectral_contrast_min',
       'spectral_contrast_max', 'spectral_contrast_skew',
       'spectral_contrast_kurtosis', 'spectral_contrast_std',
       'rmse_mean', 'rmse_min', 'rmse_max', 'rmse_skew', 'rmse_kurtosis',
       'rmse_std','sf_mean', 'sf_min', 'sf_max', 'sf_skew',
       'sf_kurtosis', 'sf_std','spectral_rolloff_mean',
       'spectral_rolloff_min', 'spectral_rolloff_max', 'spectral_rolloff_skew',
       'spectral_rolloff_kurtosis', 'spectral_rolloff_std',
       'zero_crossing_rate', 'mfcc_mean', 'mfcc_min', 'mfcc_max',
       'mfcc_skew', 'mfcc_kurtosis', 'mfcc_std']


len(features)

       

59

In [None]:
X_eval = df_eval[features].copy()
X_eval

X_eval.to_csv('x_eval.csv')

In [134]:
X = df[features].copy()
X
X.to_csv('x.csv')

In [65]:
y_encoded = pd.get_dummies(y_decoded.flatten())
y_encoded = y_encoded.values.ravel()

### SelectKBest:

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Set the number of features to select
k = 10

# Create a SelectKBest object
selector = SelectKBest(score_func=chi2, k=k)

# Fit the selector to the data and get the selected features
X_new = selector.fit_transform(X, y)

### SelectFromModel

In [79]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier object
clf = RandomForestClassifier()

# Create a SelectFromModel object and specify the threshold
selector = SelectFromModel(clf, threshold=0.25)

# Fit the selector to the data and get the selected features
X_new = selector.fit_transform(X, y_encoded)



In [78]:
X_new

array([], shape=(9854, 0), dtype=float64)

### Recursive Feature Elimination (RFE)

In [156]:
rom sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Set the number of features to select
n_features = 25

# Create an SVR object
estimator = SVR(kernel="linear")

# Create an RFE object
selector = RFE(estimator, n_features_to_select=n_features)

# Fit the RFE object to the data and get the selected features
X_new = selector.fit_transform(X, y)

SyntaxError: invalid syntax (1299462321.py, line 1)

### Recursive Feature Elimination (RFE) Random forest

In [166]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Set the number of features to select
n_features = 20

# Create an RandomForestClassifier object
estimator = RandomForestClassifier()

# Create an RFE object
selector = RFE(estimator, n_features_to_select=n_features)

# Fit the RFE object to the data and get the selected features
X_new = selector.fit_transform(X, y_labeled)

In [None]:
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Initialize the feature matrix and target variable

# Initialize the SVM model
svm = SVC(kernel="linear")

# Initialize RFE 
rfe = RFE(svm)

# Define the grid of values for the number of features to select and the accuracy required
param_grid = {'n_features_to_select':[10,20,30], 
              'estimator__C':[0.1, 1, 10], 
              'estimator__kernel':['linear', 'rbf', 'poly'], 
              'estimator__gamma': [0.1, 1, 10]}

# Define the scoring function
acc_scorer = make_scorer(accuracy_score)

# Initialize GridSearchCV
grid_search = GridSearchCV(rfe, param_grid, scoring=acc_scorer)

# Fit the grid_search to the data
grid_search.fit(X, y_labeled)

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Print the selected features
print(grid_search.best_estimator_.support_)


In [169]:
X_new.shape

(9854, 20)

### VarianceThreshold

In [158]:
from sklearn.feature_selection import VarianceThreshold

# Set the threshold
threshold = 0.8

# Create a VarianceThreshold object
selector = VarianceThreshold(threshold)

# Fit the selector to the data and get the selected features
X_new = selector.fit_transform(X)

### PCA

In [144]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Scale the data
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Set the number of components
n_components = 20

# Create a PCA object
pca = PCA(n_components=n_components)

# Fit the PCA to the data and get the selected features
X_pca = pca.fit_transform(df_scaled)

# Get the column names of the selected features
original_cols = df_scaled.columns[pca.components_[:n_components].argmax(axis=1)]

# Create a new dataframe with the top n features
top_n_features = pd.DataFrame(X_pca, columns=original_cols)


In [151]:
top_n_features.columns

top_features = ['ageRange', 'gender_male', 'gender_female',
       'First Language spoken_English (Canada)',
       'First Language spoken_English (United States)',
       'First Language spoken_French (Canada)',
       'First Language spoken_Spanish (Venezuela)',
       'First Language spoken_Telugu',
       'Self-reported fluency level _advanced',
       'Self-reported fluency level _basic',
       'Self-reported fluency level _intermediate',
       'Self-reported fluency level _native', 'ageRange',
       'chroma_kurtosis', 'mfcc_skew', 'mfcc_std', 'sf_kurtosis',
       'sf_mean', 'sf_min', 'spectral_rolloff_min', 'tonnetz_kurtosis',
       'tonnetz_max', 'tonnetz_std']
top_n_features = df[top_features].copy()
top_n_features

Unnamed: 0,ageRange,gender_male,gender_female,First Language spoken_English (Canada),First Language spoken_English (United States),First Language spoken_French (Canada),First Language spoken_Spanish (Venezuela),First Language spoken_Telugu,Self-reported fluency level _advanced,Self-reported fluency level _basic,...,chroma_kurtosis,mfcc_skew,mfcc_std,sf_kurtosis,sf_mean,sf_min,spectral_rolloff_min,tonnetz_kurtosis,tonnetz_max,tonnetz_std
0,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-1.558790,-6.438284,72.498337,0.490249,0.005090,1.581386e-06,764.428711,-1.518961,0.091127,0.052848
1,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.931596,-6.571524,84.531403,6.866546,0.008828,2.605675e-06,602.929688,-0.586217,-0.006839,0.016361
2,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.842212,-6.420152,90.399910,1.309730,0.005855,9.617157e-07,430.664062,0.890048,0.018604,0.038942
3,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.312001,-6.430761,75.115974,37.705633,0.011705,2.732195e-06,527.563477,-1.138267,0.035558,0.016468
4,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-1.498014,-6.503889,95.182053,1.129594,0.011597,5.307173e-07,279.931641,-0.680903,0.098910,0.039231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9849,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.945737,-6.434830,66.559837,79.509039,0.000036,8.637427e-07,10.766602,-0.487377,0.040010,0.019216
9850,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.003275,-6.570049,66.754639,21.130139,0.000121,1.060718e-06,10.766602,-0.474305,0.039316,0.022992
9851,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-1.531671,-6.387176,62.616745,4.251839,0.000120,5.094823e-07,10.766602,-1.581338,0.035522,0.018345
9852,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.197633,-6.636601,69.866089,13.304608,0.000139,5.860047e-07,10.766602,0.084694,0.011484,0.013880


In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR


# One-hot encode the y variable
y_encoded = pd.get_dummies(y_decoded)

# Check the shape of y_encoded
print(y_encoded.shape)

# Check the shape of X
print(X.shape)

# Flatten y_encoded to 1D array with the same dimensionality as X
y_encoded = y_encoded.values.ravel()

# check the shape of y_encoded after flatten
print(y_encoded.shape)

# Create an SVR object
estimator = SVR(kernel="linear")

# Create an RFE object and specify the number of features to select
selector = RFE(estimator, n_features_to_select=20)

# Fit the RFE object to the data and get the selected features
X_new = selector.fit_transform(X, y_encoded)

# Get the selected feature names
feature_names = X.columns[selector.get_support()]




In [None]:
feature_names

In [118]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
y_labeled = label_encoder.fit_transform(y)
np.unique(y_labeled)

  y = column_or_1d(y, warn=True)


array([0, 1, 2, 3, 4, 5, 6])

In [119]:
X_train, X_test, y_train, y_test = train_test_split(top_n_features, y_labeled, test_size=0.2, random_state=42)

In [None]:
y_pred_eval = svm_model(X, y, X_eval)

predictions = pd.DataFrame(y_pred_eval,columns=["Predicted"])
predictions.to_csv('my_data.csv', index=True)

In [None]:
from sklearn.svm import SVC


def svm_model2(X_train, y_train, X_test):
    clf = SVC(C=0.1, kernel='linear', gamma=0.1)
    # train the model on the training data
    clf.fit(X_train, y_train)
    # predict the target values for the test data
    # returning the y_predict
    return clf.predict(X_test)
y_pred = svm_model2(X_train, y_train, X_test)
svm_accuracy = accuracy_calculator(y_test, y_pred)

In [123]:
y_labeled

array([6, 4, 3, ..., 5, 1, 0])

In [None]:
y_pred_eval = random_forest_model(X , X_eval, y)

predictions = pd.DataFrame(y_pred_eval,columns=["Predicted"])
predictions.to_csv('predictions.csv', index=True)


In [None]:
y_pred = random_forest_model(X_train, X_test, y_train)
random_forest_accuracy = accuracy_calculator(y_test, y_pred)
random_forest_accuracy

In [126]:
y_labeled

array([6, 4, 3, ..., 5, 1, 0])

In [153]:
y_labeled

array([6, 4, 3, ..., 5, 1, 0])

In [154]:
## Implementation using k-fold
from sklearn.model_selection import KFold, cross_val_score
# define the number of folds and whether to shuffle the data
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# create an instance of the SVM model
clf = SVC()

# use cross_val_score function to perform k-fold cross-validation
scores = cross_val_score(clf, top_n_features, y_labeled, cv=kf, scoring='accuracy')

# print the mean accuracy and standard deviation
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy: 0.26 (+/- 0.01)


In [155]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# shuffle the data before performing k-fold cross validation
X_shuffled, y_shuffled = shuffle(top_n_features, y_labeled)

# perform k-fold cross validation with 5 folds
scores = cross_val_score(clf, X_shuffled, y_shuffled, cv=5)

# calculate the mean accuracy of the model across all folds
accuracy = np.mean(scores)
print("Accuracy:", accuracy)


Accuracy: 0.24649813153672412
