In [5]:
import numpy as np
import pandas as pd
import librosa
# from librosa.feature import mfcc
import scipy.io.wavfile as wav
from scipy.io.wavfile import write as wav_write
import os

In [6]:
from python_speech_features.base import mfcc
from python_speech_features import logfbank

In [7]:
# read in wav file, get out signal (np array) and sampling rate (int)
def read_in_audio(filename):
    (rate, sig) = wav.read(filename)
    return sig, rate


In [8]:
# read in signal, take absolute value and slice seconds 1-3 from beginning
def get_two_secs(filename):
    sig, rate = read_in_audio(filename)
    abs_sig = np.abs(sig)
    two_secs = abs_sig[rate:3*rate]
    return two_secs



In [9]:
# calculates moving average for a specified window (number of samples)
def take_moving_average(sig, window_width):
    cumsum_vec = np.cumsum(np.insert(sig, 0, 0))
    ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width])/float(window_width)
    return ma_vec

In [69]:
# read in signal, change sample rate to outrate (samples/sec), use write_wav=True to save wav file to disk
def downsample(filename, outrate=8000, write_wav = False):
    (rate, sig) = wav.read(filename)
    rate = float(rate)
    sig = sig.astype(float)
    down_sig = librosa.core.resample(sig, rate, outrate, scale=True)
    if not write_wav:
        return down_sig, outrate
    if write_wav:
        wav_write('{}_down_{}.wav'.format(filename, outrate), outrate, down_sig)


In [70]:
# change total number of samps for downsampled file to n_samps by trimming or zero-padding and standardize them
def make_standard_length(filename, n_samps=240000):
    down_sig, rate = downsample(filename)
    normed_sig = librosa.util.fix_length(down_sig, n_samps)
    normed_sig = (normed_sig - np.mean(normed_sig))/np.std(normed_sig)
    return normed_sig

In [71]:
# from a folder containing wav files, normalize each, divide into num_splits-1 chunks and write the resulting np.arrays to a single matrix
def make_split_audio_array(folder, num_splits = 5):
    lst = []
    for filename in os.listdir(folder):
        if filename.endswith('wav'):
            normed_sig = make_standard_length(filename)
            chunk = normed_sig.shape[0]/num_splits
            for i in range(num_splits - 1):
                lst.append(normed_sig[i*chunk:(i+2)*chunk])
    lst = np.array(lst)
    lst = lst.reshape(lst.shape[0], -1)
    return lst

In [72]:
# for input wav file outputs (13, 2999) mfcc np array
def make_normed_mfcc(filename, outrate=8000):
    normed_sig = make_standard_length(filename)
    normed_mfcc_feat = mfcc(normed_sig, outrate)
    normed_mfcc_feat = normed_mfcc_feat.T
    return normed_mfcc_feat

In [73]:
# make mfcc np array from wav file using librosa package
def make_librosa_mfcc(filename):
     y, sr = librosa.load(filename)
     mfcc_feat = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
     return mfcc_feat


In [74]:
# make mfcc np array from wav file using speech features package
def make_mfcc(filename):
    (rate, sig) = wav.read(filename)
    mfcc_feat = mfcc(sig, rate)
    mfcc_feat = mfcc_feat.T
    return mfcc_feat


In [90]:
# for folder containing wav files, output numpy array of normed mfcc
def make_class_array(folder):
    lst = []
    for filename in os.listdir(folder):
        try:
            lst.append(make_normed_mfcc(folder + '/' + filename))
        except:
            continue
    class_array = np.array(lst)
    class_array = np.reshape(class_array, (class_array.shape[0], class_array.shape[2], class_array.shape[1]))
    return class_array

In [91]:
# read in wav file, output (1,13) numpy array of mean mfccs for each of 13 features
def make_mean_mfcc(filename):
    try:
        (rate, sig) = wav.read(filename)
        mfcc_feat = mfcc(sig, rate)
        avg_mfcc = np.mean(mfcc_feat, axis = 0)
        return avg_mfcc
    except:
        pass

In [77]:
# write new csv corresponding to dataframe of given language and gender
def make_df_language_gender(df, language, gender):
    newdf = df.query("native_language == @language").query("sex == @gender")
    newdf.to_csv('df_{}_{}.csv'.format(language, gender))

In [78]:
# write new directories to disk containing the male and female speakers from the most common languages
def make_folders_from_csv():
    top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese']
    for lang in top_15_langs:
        os.makedirs('{}/{}_male'.format(lang, lang))
        os.makedirs('{}/{}_female'.format(lang, lang))

In [79]:
# copy files to the corresponding directories
def copy_files_from_csv():
    top_15_langs = ['english', 'spanish', 'arabic', 'mandarin', 'french', 'german', 'korean', 'russian', 'portuguese', 'dutch', 'turkish', 'italian', 'polish', 'japanese', 'vietnamese']
    for lang in top_15_langs:
        df_male = pd.read_csv('df_{}_male.csv'.format(lang))
        df_female = pd.read_csv('df_{}_female.csv'.format(lang))
        m_list = df_male['filename'].values
        f_list = df_female['filename'].values
        for filename in f_list:
            shutil.copy2('big_langs/{}/{}.wav'.format(lang, filename), 'big_langs/{}/{}_female/{}.wav'.format(lang, lang, filename))

In [80]:
# input folder of wav files, output pandas dataframe of mean mfcc values
def make_mean_mfcc_df(folder):
    norms = []
    for filename in os.listdir(folder):
        try:
            (rate, sig) = wav.read(folder + '/' + filename)
            mfcc_feat = mfcc(sig, rate)
            mean_mfcc = np.mean(mfcc_feat, axis = 0)
            #mean_mfcc = np.reshape(mean_mfcc, (1,13))
            norms.append(mean_mfcc)
        except:
            continue
    flat = [a.ravel() for a in norms]
    stacked = np.vstack(flat)
    df = pd.DataFrame(stacked)
    return df

In [81]:
df = make_mean_mfcc_df('arabic')





In [84]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,15.462919,-7.173271,-5.480319,5.298063,-2.952756,8.531075,-14.137056,-1.940234,-18.510789,-3.796445,-16.447797,3.267091,-16.264742
1,14.906111,-10.804429,-11.308154,-8.722498,-6.342787,-13.751986,-20.363653,-8.949669,-14.942745,-6.159790,-8.601578,2.305138,-6.575853
2,16.143533,7.666176,-24.640385,34.110321,-12.880481,8.912144,-9.567223,-26.935310,11.133043,-19.775000,-1.845973,-6.397466,-12.064251
3,14.852614,-0.253819,-0.080831,8.560063,2.710978,16.163058,-10.562176,-3.238067,-18.535487,1.555757,-11.328219,0.416907,-11.374253
4,13.293418,-9.654012,-16.634627,-2.865135,-5.719453,9.355622,-26.460160,0.033995,-19.177963,2.004586,-6.207538,-3.936715,-7.062127
5,14.182783,-0.226640,-2.653217,6.779579,-14.406145,5.111942,-13.917446,-11.185381,-16.236315,-6.797180,-9.969224,2.875632,-10.850188
6,15.055802,16.010252,-30.491827,11.383468,4.436257,-17.703273,9.534037,-25.115267,-14.831134,11.180805,-18.070256,1.245088,0.508111
7,13.430258,-2.287963,-21.820838,6.164107,22.183203,-26.028401,1.700625,0.461472,-19.192944,-1.835067,-1.491692,-4.792838,-4.036096
8,14.388315,-10.598031,-7.593693,12.054368,-22.228430,-3.004672,-9.562168,-6.821188,-17.940583,-8.612288,-9.826096,3.921764,-9.778781
9,16.080817,-17.938211,5.208343,2.039648,-20.194115,-1.198062,-10.642324,-17.959311,-14.007863,-6.736632,-8.195681,5.737094,-9.044448


In [85]:
type(df[0])

pandas.core.series.Series

In [86]:
file = 'Audio_wav/arabic70.wav'

In [87]:
nparr = make_mfcc(file)



In [88]:
type(nparr)

numpy.ndarray

In [92]:
arr = make_class_array('arabic')

In [93]:
arr

array([[[ -1.34867823,  -0.18089621,   0.04456709, ...,  -0.27211229,
           0.35351059,   0.46386848],
        [ -0.30270777,  -0.71587098,  -1.29197734, ...,  -1.21592306,
          -0.9179681 ,  -0.89666179],
        [ -1.01050111,  -1.13836412,  -0.88422643, ...,   1.64884394,
           1.96162447,   2.46315045],
        ...,
        [ -5.01595571,  -2.31226681,   2.8281573 , ...,  -7.69572805,
          -7.80834525, -12.28052623],
        [ -9.16423415, -16.9222069 ,  -6.8396188 , ...,  20.43530953,
          13.26054674,   1.55079883],
        [ -4.25485807, -22.06631425, -23.54941212, ...,  -7.38808281,
          -5.3084077 ,  -3.56917898]],

       [[ -6.0052277 ,  -6.25395734,  -6.42224457, ...,  -6.44663898,
          -6.43745057,  -6.2685451 ],
        [ -5.98278334,  -5.46271036,  -4.93146553, ...,  -2.41866614,
          -2.58453937,  -2.38234746],
        [ -2.41265795,  -2.48688517,  -1.23341577, ...,  -1.53689864,
          -1.70164995,  -1.69090547],
        ...,


In [106]:
def saveNpy(language, arr): 
    count = 1
    folder = language + '_npy'
    for item in arr:
        filename = folder + '/' + language + str(count)
        np.save(filename, item)
        count += 1

In [107]:
saveNpy('arabic', arr)

In [109]:
languages = ['dutch', 'english']

for language in languages:
    saveNpy(language, make_class_array(language))