### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from algorithms import encode_columns, svm_model, accuracy_calculator, random_forest_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Loading the dataset using Pandas
The data is found in development.csv (development set): a comma-separated values file containing the records from the development set. This portion does have the action and object columns, which you should use to obtain the labels to train and validate your models.
The dataset consists of a collection of audio files in WAV format.

Each record is characterized by several attributes. The following is a short description for each of them.
- path: the path of the audio file.
- speakerId: the id of the speaker.
- action: the type of action required through the intent.
- object: the device involved in the intent.
- Self-reported fluency level: the speaking fluency of the speaker.
- First Language spoken: the first language spoken by the speaker.
- Current language used for work/school: the main language spoken by the speaker during daily activities.
- gender: the gender of the speaker.
- ageRange: the age range of the speaker.

In [2]:
# Read the development split (the CSV that carries the label columns) into a
# DataFrame; the path is relative to the notebook's working directory.
development_csv = "dsl_data/development.csv"
df = pd.read_csv(development_csv)

In [3]:
# Build one target label per record by concatenating the `action` and
# `object` strings (e.g. "activate" + "music" -> "activatemusic").
# The columns are selected by name instead of by position, so the labels
# stay correct even if the column order of the CSV changes.
y = (df["action"] + df["object"]).to_numpy(dtype=object)
y

array(['change languagenone', 'activatemusic', 'deactivatelights', ...,
       'deactivatelights', 'deactivatelights', 'increasevolume'],
      dtype=object)

In [4]:
# Speaker-metadata columns handed to the project helper `encode_columns`.
# The trailing space in "Self-reported fluency level " is kept verbatim; the
# feature-selection cell further down uses the same spelling.
column_names = [
    "Self-reported fluency level ",
    "First Language spoken",
    "Current language used for work/school",
    "speakerId",
    "gender",
    "ageRange",
]

# NOTE(review): the return value is not used, so the helper presumably
# encodes these columns of `df` in place — confirm in algorithms.py.
encode_columns(df, column_names)

In [5]:
# Location of the cached feature table; when present it is loaded instead of
# recomputing the audio features.
path = Path('./out.csv')


def _mfcc_for_file(audio_path):
    """Load one audio file and return its MFCC matrix.

    The file is decoded exactly once; the samples and the sampling rate
    returned by ``librosa.load`` are both passed on to the MFCC extraction.
    """
    signal, sample_rate = librosa.load(audio_path)
    return librosa.feature.mfcc(y=signal, sr=sample_rate)


if path.is_file():
    # Cache hit: reuse the previously saved table.
    df = pd.read_csv(path)
else:
    # Cache miss: extract the MFCCs for every recording listed in `path`.
    df["mfccs"] = df["path"].apply(_mfcc_for_file)
    # Collapse each MFCC matrix to a single scalar (mean over all entries).
    df["mfccs_mean"] = df["mfccs"].apply(lambda x: np.mean(x))
    # NOTE(review): the `mfccs` arrays are written to CSV as text; only
    # `mfccs_mean` is selected by the feature cell that follows.
    df.to_csv(path)

In [6]:
# Feature matrix: the speaker-metadata columns plus the scalar MFCC summary.
X = df.loc[
    :,
    [
        "speakerId",
        "Self-reported fluency level ",
        "First Language spoken",
        "Current language used for work/school",
        "gender",
        "ageRange",
        "mfccs_mean",
    ],
]
X

Unnamed: 0,speakerId,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange,mfccs_mean
0,0,0,1,2,0,0,-26.103281
1,0,0,1,2,0,0,-27.985989
2,0,0,1,2,0,0,-27.983374
3,0,0,1,2,0,0,-26.391945
4,0,0,1,2,0,0,-28.545034
...,...,...,...,...,...,...,...
9849,76,3,1,2,1,0,-16.623365
9850,76,3,1,2,1,0,-18.091902
9851,76,3,1,2,1,0,-16.031946
9852,76,3,1,2,1,0,-18.633825


In [7]:
# Hold out 20% of the records for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Get predictions for the held-out split from the project helper `svm_model`
# and score them against the true labels with `accuracy_calculator`.
# NOTE(review): the argument order here (X_train, y_train, X_test) differs
# from the `random_forest_model` call (X_train, X_test, y_train) — confirm
# both against the signatures in algorithms.py.
y_pred = svm_model(X_train, y_train, X_test)
svm_accuracy = accuracy_calculator(y_test, y_pred)

In [11]:
# Get predictions for the held-out split from the project helper
# `random_forest_model` and score them with `accuracy_calculator`.
# `y_pred` is rebound here, replacing the SVM predictions.
# NOTE(review): the argument order here (X_train, X_test, y_train) differs
# from the `svm_model` call (X_train, y_train, X_test) — confirm both
# against the signatures in algorithms.py.
y_pred = random_forest_model(X_train, X_test, y_train)
random_forest_accuracy = accuracy_calculator(y_test, y_pred)
# Display the resulting score as the cell output.
random_forest_accuracy

0.2085235920852359

In [None]:
## Implementation using k-fold
from sklearn.model_selection import KFold, cross_val_score

# Three folds, shuffled with a fixed seed so the partition is repeatable.
kf = KFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

# Support-vector classifier with scikit-learn's default settings.
clf = SVC()

# Evaluate the classifier on every fold; yields one accuracy value per fold.
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring='accuracy',
    cv=kf,
)

# Report the mean accuracy and its standard deviation across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [None]:
# Random forest with 100 trees; the seed makes the fitted forest repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Shuffle features and labels together before cross-validation.
# The shuffle is seeded so the score is reproducible, and the result is bound
# to new names so that `X` and `y` stay untouched and re-running this cell
# always starts from the same data.
X_shuffled, y_shuffled = shuffle(X, y, random_state=42)

# Cross-validate with 5 folds. With an integer `cv` and a classifier,
# scikit-learn uses stratified folds.
scores = cross_val_score(clf, X_shuffled, y_shuffled, cv=5)

# Mean accuracy of the model across all folds.
accuracy = np.mean(scores)
print("Accuracy:", accuracy)


Accuracy: 0.2007317782980115
