### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
# Importing LabelEncoder from Sklearn
# library from preprocessing Module.
from sklearn.preprocessing import LabelEncoder
import librosa

### Loading the dataset using Pandas
The data comes from development.csv (the development set): a comma-separated values file containing the records of the development set. This file includes the action and object columns, which you should use to obtain the labels to train and validate your models.
The dataset consists of a collection of audio files in WAV format.

Each record is characterized by several attributes. The following is a short description for each of them.
- path: the path of the audio file.
- speakerId: the id of the speaker.
- action: the type of action requested by the intent.
- object: the device involved in the intent.
- Self-reported fluency level: the speaking fluency of the speaker.
- First Language spoken: the first language spoken by the speaker.
- Current language used for work/school: the main language spoken by the speaker during daily activities.
- gender: the gender of the speaker.
- ageRange: the age range of the speaker.

In [2]:
# Read the development-set records (relative path, resolved against the
# notebook's working directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="dsl_data/development.csv")

In [3]:
# Build the target label of each record by concatenating its "action" and
# "object" strings (e.g. "activate" + "music" -> "activatemusic").
#
# The two columns are selected by name rather than by position:
# label_encoder() drops and re-appends the columns it encodes, which changes
# the column order of df, so a positional slice would pick up different
# columns if this cell were re-run after the encoding cell.
y = (df["action"] + df["object"]).to_numpy()
y

array(['change languagenone', 'activatemusic', 'deactivatelights', ...,
       'deactivatelights', 'deactivatelights', 'increasevolume'],
      dtype=object)

In [4]:
def label_encoder(column_name):
    """Replace one column of the notebook-level ``df`` with integer codes.

    A new ``LabelEncoder`` is fitted on ``df[column_name]`` and the resulting
    codes are stored back under the same column name. The original column is
    removed before the codes are assigned, so the encoded column ends up as
    the last column of ``df``.

    Parameters
    ----------
    column_name
        Label of the column in ``df`` to encode.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    encoded_values = LabelEncoder().fit_transform(df[column_name])

    # Delete the raw column, then re-create it from the encoded values; the
    # re-created column is appended at the end of the frame.
    del df[column_name]
    df[column_name] = encoded_values
    

In [5]:
from collections import Counter
# Integer-encode each of these columns in turn. The order is significant:
# label_encoder() re-appends every encoded column at the end of df, so the
# call order determines the final column order.
for categorical_column in (
    "Self-reported fluency level ",  # the trailing space is part of the name
    "First Language spoken",
    "Current language used for work/school",
    "speakerId",
    "gender",
    "ageRange",
):
    label_encoder(categorical_column)

In [None]:
def _extract_mfcc(audio_path):
    """Load one audio file and return its MFCC array.

    The file is loaded a single time; the samples and the sampling rate
    returned by that one ``librosa.load`` call are both passed on to
    ``librosa.feature.mfcc``, so each file is not decoded twice.
    """
    samples, sample_rate = librosa.load(audio_path)
    return librosa.feature.mfcc(y=samples, sr=sample_rate)


# One MFCC array per record, computed from the file referenced in "path".
df["mfccs"] = df["path"].apply(_extract_mfcc)

In [None]:
# Collapse each record's MFCC array into a single scalar: np.mean without an
# axis argument averages over every element of the array.
df["mfccs_mean"] = df["mfccs"].map(lambda coefficients: np.mean(coefficients))

In [None]:
# Feature matrix: the label-encoded metadata columns plus the scalar MFCC mean.
X = df.loc[
    :,
    [
        "speakerId",
        "Self-reported fluency level ",
        "First Language spoken",
        "Current language used for work/school",
        "gender",
        "ageRange",
        "mfccs_mean",
    ],
]
X

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics


# Hold out 20% of the records for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create a support-vector classifier with scikit-learn's default settings and
# fit it on the training portion (fit() returns the estimator itself).
clf = SVC().fit(X_train, y_train)

# Predict the labels of the held-out records.
y_pred = clf.predict(X_test)

# Fraction of held-out records whose predicted label matches the true one.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Persist the processed frame to a CSV file in the working directory.
# NOTE(review): df contains the array-valued "mfccs" column at this point; a
# CSV can only hold the text form of those arrays — confirm that is intended.
df.to_csv(path_or_buf='out.csv')