# Load and test Model

Use this notebook to load a Spanish Accent classifier

Here is the list of available models. Set `model_name` in the first code cell to one of these names.

Binary models:

ES_ANDINO

ANDINO_CARIBE

ES_MX

ES_MX2

MX_ANDINO

MX_CARIBE

Multiclass models (SVM with an OVO classifier):

MULTI

MULTI_OVO

ES_MX_ANDINO_OVO

In [65]:
import python_speech_features as mfcc
import os
import numpy as np
import pandas as pd
import librosa
import pickle

from IPython.display import Audio 

# load model

# Binary Models

# ES_ANDINO
# ANDINO_CARIBE
# ES_MX
# ES_MX2
# MX_ANDINO
# MX_CARIBE

# Multiclass model using OVO Classifier SVM

# MULTI
# MULTI_OVO
# ES_MX_ANDINO_OVO

# Name of the model folder to load; see the list of options above.
model_name = 'MULTI_OVO'

model_path = './models/'+model_name+'/'


def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``model_path`` and close the file afterwards.

    NOTE(security): unpickling can execute arbitrary code, so only load
    artifacts that come from a trusted source.
    """
    # The context manager guarantees the handle is closed even if
    # unpickling raises.
    with open(model_path + file_name, 'rb') as handle:
        return pickle.load(handle)


model = _load_pickle('model.pkl')
encoder = _load_pickle('encoder.pkl')
scaler = _load_pickle('scaler.pkl')
f_selector = _load_pickle('f_selector.pkl')


df_features = pd.read_csv('mfcc_features.csv')
df_features = df_features.drop(['Unnamed: 0'],axis=1)
# Not the last expression of the cell, so this count is computed but not shown.
df_features.accent.value_counts()

df_new_data = pd.read_csv('./preprocessed/train.csv')

# Class names taken from the loaded encoder; used to turn predictions into labels.
classes = encoder.classes_


In [8]:
# Directory prefix that feature_extraction() prepends to clip file names.
ds_path = "./clips/"

# Metadata table that feature_extraction() queries for a clip's accent label.
train_csv = df_new_data

def feature_extraction(filename, sampling_rate=48000, mfcc_num=20):
    """Build the feature row for one clip listed in ``train_csv``.

    Parameters
    ----------
    filename : str
        Clip file name; it is appended to ``ds_path`` to locate the audio and
        matched against the ``path`` column of ``train_csv`` to find the label.
    sampling_rate : int
        Rate passed to ``librosa.load`` and to every feature extractor.
    mfcc_num : int
        Number of MFCCs requested from ``librosa.feature.mfcc``.

    Returns
    -------
    list
        ``[accent, mean spectral centroid, mean spectral bandwidth,
        mean spectral rolloff, mean of MFCC 1, ..., mean of MFCC mfcc_num]``.
    """
    clip_path = "{}{}".format(ds_path, filename)
    signal, _ = librosa.load(clip_path, sr=sampling_rate)

    # Label of the first metadata row whose path matches this clip.
    label = train_csv[train_csv['path'] == filename].accents.values[0]

    spectral_extractors = (
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_bandwidth,
        librosa.feature.spectral_rolloff,
    )
    spectral_means = [
        np.mean(extract(y=signal, sr=sampling_rate))
        for extract in spectral_extractors
    ]

    # One row per coefficient; each row is collapsed to its mean.
    coefficients = librosa.feature.mfcc(y=signal, sr=sampling_rate, n_mfcc=mfcc_num)

    row = [label]
    row.extend(spectral_means)
    row.extend(np.mean(coefficient) for coefficient in coefficients)
    return row

In [9]:
# Row position (used with .iloc) of the sample clip inspected in the next cells.
inx = 18765
df_new_data.iloc[inx]

Unnamed: 0                                                44613
client_id     0b7ae4a12ec82a5964a5e1bb271ea0106b65e34dc36c65...
path                               common_voice_es_19747165.wav
sentence      De nuevo, Crátero inició el asedio y Alejandro...
up_votes                                                      2
down_votes                                                    0
age                                                    fourties
gender                                                   female
accents                                                  México
locale                                                       es
segment                                                     NaN
Name: 18765, dtype: object

In [10]:

# Build the sample clip's path from its file name and play it inline.
path = './clips/'+ df_new_data.iloc[inx]['path']

Audio(path, autoplay=True)

In [11]:
file_path = df_new_data.iloc[inx]['path']

# 13 MFCCs are requested here instead of the function's default of 20.
file_features = feature_extraction(file_path,mfcc_num = 13)
print("features: ", file_features)

# These are raw values: they still have to be scaled with scaler.transform
# and then reduced with f_selector.transform before they reach the model.


features:  ['México', 1884.622756923559, 1742.0640012319645, 3505.8549259681095, -353.76996, 116.289505, -11.551197, 8.730063, 2.543114, -8.989427, -3.0980225, -29.225952, -15.900945, 4.048154, -14.510168, -2.7871382, 3.3598535]


In [12]:

# Drop the leading accent label, then scale the numeric features.
# The scaled row gets its own name instead of rebinding `file_features`, so
# this cell can be re-run without slicing an already-scaled array.
file_features_scaled = scaler.transform([file_features[1:]])

# Reduce the scaled row with the loaded feature selector.
X_new_features = f_selector.transform(file_features_scaled)

X_new_features

array([[ 0.40235425,  0.31483296, -0.63673173, -0.28356908, -0.68603444,
        -0.55988915, -0.92406295, -3.53359861, -1.2531482 , -1.29361711,
         0.01495023,  0.60038402]])

In [13]:
print(classes)

# Only one row was passed in, so the first prediction is the one of interest.
prediction = model.predict(X_new_features)

# The predicted value is used as an index into the encoder's class names.
print(classes[prediction[0]])

['Andino' 'Caribe' 'España' 'México' 'Rioplatense']
México


In [14]:
# File name of the sample clip used in the cells above.
df_new_data.iloc[inx]['path']

'common_voice_es_19747165.wav'

# Test with Recording

Read a phrase or just say something.

In [15]:
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

In [16]:
def getMFCC(filename, accent, sampling_rate=48000, mfcc_num=20):
    """Build a feature row for an arbitrary audio file.

    Parameters
    ----------
    filename : str
        Path of the audio file, passed to ``librosa.load`` as is.
    accent : object
        Value stored in the first slot of the returned row.
    sampling_rate : int
        Rate passed to ``librosa.load`` and to every feature extractor.
    mfcc_num : int
        Number of MFCCs requested from ``librosa.feature.mfcc``.

    Returns
    -------
    list
        ``[accent, mean spectral centroid, mean spectral bandwidth,
        mean spectral rolloff, mean of MFCC 1, ..., mean of MFCC mfcc_num]``.
    """
    signal, _ = librosa.load(filename, sr=sampling_rate)

    centroid_mean = np.mean(librosa.feature.spectral_centroid(y=signal, sr=sampling_rate))
    bandwidth_mean = np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=sampling_rate))
    rolloff_mean = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=sampling_rate))

    # One row per coefficient; each row is collapsed to its mean.
    coefficients = librosa.feature.mfcc(y=signal, sr=sampling_rate, n_mfcc=mfcc_num)
    coefficient_means = [np.mean(coefficient) for coefficient in coefficients]

    return [accent, centroid_mean, bandwidth_mean, rolloff_mean] + coefficient_means

In [55]:
def recordAudio(freq=44100, duration=5, channels=2, out_dir="./recordings/"):
    """Record audio with sounddevice and save the take as two WAV files.

    Parameters
    ----------
    freq : int
        Sampling frequency used for the recording and for both output files.
    duration : int or float
        Recording length in seconds.
    channels : int
        Number of channels to record.
    out_dir : str
        Folder that receives ``recording0.wav`` (written with
        ``scipy.io.wavfile.write``) and ``recording1.wav`` (written with
        ``wavio`` using ``sampwidth=2``). It is created if it does not exist.

    Returns
    -------
    None
    """
    # Create the target folder *before* recording, so a missing folder cannot
    # make the writes fail after the take has already been captured.
    os.makedirs(out_dir, exist_ok=True)

    # Start the recorder with the given duration and sample frequency.
    recording = sd.rec(int(duration * freq),
                       samplerate=freq, channels=channels)

    # Block until the requested number of seconds has been recorded.
    sd.wait()

    # Write the NumPy array as an audio file at the given sampling frequency.
    write(os.path.join(out_dir, "recording0.wav"), freq, recording)

    # Write the same array again through wavio.
    wv.write(os.path.join(out_dir, "recording1.wav"), recording, freq, sampwidth=2)

recordAudio()

In [67]:
# load the audio file

# Pre recorded files

# caribe
# caribe2
# homero
# homero2
# betty


# Pick one of the pre-recorded files listed above and play it inline.
path = './recordings/homero2.wav'

Audio(path, autoplay=True)

In [68]:
# '_' fills the label slot of the row; 13 MFCCs are requested instead of the default 20.
test_features = getMFCC(path,'_',mfcc_num=13)
print("features: ", test_features)

features:  ['_', 2312.5406827367206, 2308.5433866485523, 4970.149253731343, -310.47733, 156.2968, -79.37292, 45.262936, 11.481743, -31.064074, 18.476171, -34.53757, -21.497284, 12.535708, -28.35438, 0.5465615, 4.8835387]


In [69]:
# Skip the label slot at index 0 and scale the numeric features.
file_features = scaler.transform([test_features[1:]])

# Reduce the scaled row with the loaded feature selector.
X_new_features = f_selector.transform(file_features)

X_new_features

array([[ 1.4460859 ,  2.3113424 ,  0.99019158, -3.58830987,  2.07150158,
        -2.76321295,  1.28852196, -4.25762897, -2.04408124, -3.61779604,
         0.61988571,  0.87512649]])

In [70]:
print(classes)
# Only one row was passed in, so the first prediction is the one of interest.
prediction = model.predict(X_new_features)

# The predicted value is used as an index into the encoder's class names.
print(classes[prediction[0]])


['Andino' 'Caribe' 'España' 'México' 'Rioplatense']
México
