In [1]:
import sounddevice as sd
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import librosa
import hmmlearn.hmm as hmm
from math import exp
import os
import pickle

In [51]:
def record_sound(filename, duration=1, fs=44100, play=False):
    """Play a cue tone, record one mono take, and write it to `filename`.

    Args:
        filename: Path of the audio file to write.
        duration: Length of the recording in seconds. Fractional values are
            accepted; the sample count is rounded to the nearest integer.
        fs: Sample rate in Hz, used for the cue tone, the recording and the
            saved file.
        play: If True, play the take back before it is saved.

    Returns:
        None. The recording is only written to disk.
    """
    # fs samples (one second) of a 940 Hz sine, played before recording starts;
    # blocking=True means recording begins only after the tone has finished.
    sd.play( np.sin( 2*np.pi*940*np.arange(fs)/fs )  , samplerate=fs, blocking=True)
    # Disabled alternative cue (0.2 s of silence):
    #     sd.play( np.zeros( int(fs*0.2) ), samplerate=fs, blocking=True)

    # A fractional duration would otherwise hand a float frame count to
    # sd.rec; the number of frames has to be an integer.
    n_frames = int(round(duration * fs))
    data = sd.rec(frames=n_frames, samplerate=fs, channels=1, blocking=True)
    if play:
        sd.play(data, samplerate=fs, blocking=True)
    sf.write(filename, data=data, samplerate=fs)

In [52]:
def record_data(prefix, n=5, start=0, duration=1):
    """Record `n` takes of one word into the folder named `prefix`.

    Files are written as ``<prefix>/<prefix>_<k>.wav`` with ``k`` running from
    `start` to ``start + n - 1``. After every fifth take the function waits
    for Enter so the speaker can pause.

    Args:
        prefix: Word being recorded; used as both folder name and file stem.
        n: Number of takes to record.
        start: Index of the first take, so a later session can append to an
            existing set without overwriting it.
        duration: Length of each take in seconds (passed to record_sound).
    """
    # Make sure the target folder exists up front; otherwise the first take is
    # recorded and only then lost when writing the file fails.
    os.makedirs(prefix, exist_ok=True)

    print('Recording {} {} times'.format(prefix, n))
    for i in range(n):
        take_name = '{}_{}.wav'.format(prefix, i+start)
        print(take_name)
        record_sound(prefix + '/' + take_name, duration=duration)
        # Pause after takes 5, 10, 15, ... of this call.
        if i % 5 == 4:
            input("Press Enter to continue...")

In [53]:
def get_mfcc(filename):
    """Load an audio file and return its MFCC features, one row per frame.

    Args:
        filename: Path of the audio file to read.

    Returns:
        The transposed output of librosa.feature.mfcc, computed with a
        1024-sample FFT window and a hop of 128 samples.
    """
    # sr=None asks librosa not to resample, so fs is the file's own rate.
    data, fs = librosa.load(filename, sr=None)
    # The signal must be passed by keyword: recent librosa releases made the
    # audio arguments keyword-only, and y=... also works on older versions.
    mfcc = librosa.feature.mfcc(y=data, sr=fs, n_fft=1024, hop_length=128)
    # Transpose so that time runs along the first axis, as the HMM code expects
    # (it stacks these matrices row-wise and passes shape[0] as the length).
    return mfcc.T

In [20]:
# Vocabulary of spoken commands. The position of a word in this list is also
# the index of its trained model in `model`.
# NOTE(review): these look like romanised Vietnamese movement commands - confirm.
mapping = [
    'tien',
    'lui',
    'len',
    'xuong',
    'trai',
    'phai',
    'quay',
    'dung',
    'thoat',
]

# Number of recordings collected (and later used for training) per command.
n_sample = 20

In [21]:
# Collect the training set: n_sample takes of every command, each command in
# a folder named after it. This cell is interactive and re-records everything.
for item in mapping:
    folder_exists = os.path.exists(item)
    if not folder_exists:
        os.mkdir(item)
    record_data(item, n=n_sample, start=0)

Recording tien 20 times
tien_0.wav
tien_1.wav
tien_2.wav
tien_3.wav
tien_4.wav
Press Enter to continue...
tien_5.wav
tien_6.wav
tien_7.wav
tien_8.wav
tien_9.wav
Press Enter to continue...
tien_10.wav
tien_11.wav
tien_12.wav
tien_13.wav
tien_14.wav
Press Enter to continue...
tien_15.wav
tien_16.wav
tien_17.wav
tien_18.wav
tien_19.wav
Press Enter to continue...
Recording lui 20 times
lui_0.wav
lui_1.wav
lui_2.wav
lui_3.wav
lui_4.wav
Press Enter to continue...
lui_5.wav
lui_6.wav
lui_7.wav
lui_8.wav
lui_9.wav
Press Enter to continue...
lui_10.wav
lui_11.wav
lui_12.wav
lui_13.wav
lui_14.wav
Press Enter to continue...
lui_15.wav
lui_16.wav
lui_17.wav
lui_18.wav
lui_19.wav
Press Enter to continue...
Recording len 20 times
len_0.wav
len_1.wav
len_2.wav
len_3.wav
len_4.wav
Press Enter to continue...
len_5.wav
len_6.wav
len_7.wav
len_8.wav
len_9.wav
Press Enter to continue...
len_10.wav
len_11.wav
len_12.wav
len_13.wav
len_14.wav
Press Enter to continue...
len_15.wav
len_16.wav
len_17.wav
len_1

In [22]:
# Train one Gaussian HMM per command; model[i] belongs to mapping[i].
model = []
for item in mapping:
    print('Training {}'.format(item))
    # One feature matrix per recording of this command (takes 0 .. n_sample-1).
    data = [get_mfcc('{}/{}_{}.wav'.format(item, item, idx)) for idx in range(n_sample)]
    # Fixed random_state: without it every run of this cell starts from a
    # different random initialisation and yields different models and scores.
    model_temp = hmm.GaussianHMM(n_components=50, verbose=True, n_iter=200, random_state=0)
    # All recordings are stacked row-wise into one array; `lengths` tells fit()
    # how many rows belong to each recording so sequences are kept separate.
    model_temp.fit(X=np.vstack(data), lengths=[x.shape[0] for x in data])
    model.append(model_temp)

Training tien


         1     -506929.0413             +nan
         2     -458324.2793      +48604.7620
         3     -446197.2591      +12127.0203
         4     -443563.4809       +2633.7782
         5     -442717.6953        +845.7856
         6     -442239.1387        +478.5566
         7     -441966.4470        +272.6917
         8     -441824.9528        +141.4942
         9     -441741.6677         +83.2851
        10     -441697.4870         +44.1808
        11     -441659.2229         +38.2641
        12     -441621.7643         +37.4586
        13     -441599.7130         +22.0513
        14     -441571.4860         +28.2270
        15     -441534.0632         +37.4228
        16     -441504.4055         +29.6577
        17     -441486.5171         +17.8884
        18     -441473.0325         +13.4847
        19     -441461.0227         +12.0097
        20     -441448.6869         +12.3359
        21     -441440.0511          +8.6358
        22     -441434.5033          +5.5478
        23

Training lui


         1     -502964.9755             +nan
         2     -455371.8940      +47593.0815
         3     -444565.4749      +10806.4191
         4     -442356.0126       +2209.4622
         5     -441644.7116        +711.3010
         6     -441320.3015        +324.4101
         7     -441141.9853        +178.3162
         8     -441028.3855        +113.5998
         9     -440961.0505         +67.3350
        10     -440915.5670         +45.4835
        11     -440876.0762         +39.4908
        12     -440835.1288         +40.9474
        13     -440787.1710         +47.9577
        14     -440738.9034         +48.2676
        15     -440697.9995         +40.9039
        16     -440673.1804         +24.8191
        17     -440653.5094         +19.6710
        18     -440632.1938         +21.3156
        19     -440615.9319         +16.2619
        20     -440604.8938         +11.0381
        21     -440593.9692         +10.9245
        22     -440581.7066         +12.2626
        23

Training len


         1     -503823.6591             +nan
         2     -458729.9458      +45093.7133
         3     -447151.9861      +11577.9597
         4     -444096.1179       +3055.8682
         5     -443123.4818        +972.6362
         6     -442653.2849        +470.1969
         7     -442394.0421        +259.2428
         8     -442165.3972        +228.6449
         9     -441988.2594        +177.1378
        10     -441859.2442        +129.0152
        11     -441747.0472        +112.1971
        12     -441600.9341        +146.1131
        13     -441451.8027        +149.1314
        14     -441335.7782        +116.0245
        15     -441272.0993         +63.6789
        16     -441172.7077         +99.3916
        17     -441127.3165         +45.3912
        18     -441109.9889         +17.3276
        19     -441102.0990          +7.8899
        20     -441095.4163          +6.6827
        21     -441082.0464         +13.3699
        22     -441063.9676         +18.0788
        23

Training xuong


         1     -516799.7681             +nan
         2     -468560.9086      +48238.8595
         3     -456709.5316      +11851.3770
         4     -453711.8932       +2997.6383
         5     -452586.5977       +1125.2955
         6     -452193.4461        +393.1516
         7     -451785.3321        +408.1141
         8     -451549.9447        +235.3874
         9     -451395.6267        +154.3179
        10     -451225.9868        +169.6399
        11     -451174.0225         +51.9644
        12     -451119.7092         +54.3132
        13     -451075.1676         +44.5416
        14     -451034.1512         +41.0164
        15     -451004.7507         +29.4005
        16     -450962.6481         +42.1026
        17     -450935.6346         +27.0134
        18     -450913.9307         +21.7040
        19     -450900.4673         +13.4633
        20     -450897.3200          +3.1473
        21     -450889.8106          +7.5095
        22     -450874.4451         +15.3655
        23

Training trai


         1     -509426.8409             +nan
         2     -453692.2942      +55734.5467
         3     -441300.1569      +12392.1373
         4     -438272.1051       +3028.0518
         5     -437432.5520        +839.5532
         6     -437059.1285        +373.4235
         7     -436786.3934        +272.7351
         8     -436585.2028        +201.1906
         9     -436464.5149        +120.6879
        10     -436402.9425         +61.5724
        11     -436358.2628         +44.6797
        12     -436320.9264         +37.3363
        13     -436294.4061         +26.5204
        14     -436274.1607         +20.2454
        15     -436249.6779         +24.4828
        16     -436231.6509         +18.0270
        17     -436215.7425         +15.9085
        18     -436204.2845         +11.4579
        19     -436195.2443          +9.0402
        20     -436183.9130         +11.3313
        21     -436173.9435          +9.9695
        22     -436167.7741          +6.1694
        23

       184     -435585.2213          +0.2586
       185     -435584.9974          +0.2239
       186     -435584.7988          +0.1986
       187     -435584.4944          +0.3044
       188     -435583.7243          +0.7701
       189     -435582.5225          +1.2018
       190     -435581.3259          +1.1966
       191     -435579.8393          +1.4866
       192     -435578.1854          +1.6539
       193     -435576.6778          +1.5077
       194     -435575.0098          +1.6679
       195     -435572.9010          +2.1089
       196     -435571.1530          +1.7480
       197     -435569.7809          +1.3721
       198     -435566.7981          +2.9828
       199     -435563.4641          +3.3340
       200     -435562.2474          +1.2168


Training phai


         1     -511141.6666             +nan
         2     -459012.5989      +52129.0677
         3     -446988.6817      +12023.9172
         4     -444180.7885       +2807.8931
         5     -443204.9253        +975.8632
         6     -442752.5491        +452.3762
         7     -442479.9024        +272.6468
         8     -442332.3080        +147.5943
         9     -442218.9062        +113.4018
        10     -442165.2061         +53.7001
        11     -442130.1205         +35.0856
        12     -442079.9334         +50.1871
        13     -442040.5013         +39.4321
        14     -442001.5230         +38.9783
        15     -441944.2420         +57.2810
        16     -441884.0499         +60.1921
        17     -441845.0692         +38.9808
        18     -441795.9971         +49.0721
        19     -441737.0024         +58.9947
        20     -441675.3643         +61.6381
        21     -441632.8202         +42.5441
        22     -441612.7193         +20.1009
        23

Training quay


         1     -513698.1669             +nan
         2     -469996.8274      +43701.3395
         3     -458357.7516      +11639.0758
         4     -454200.4259       +4157.3257
         5     -452795.2236       +1405.2023
         6     -452326.5631        +468.6606
         7     -452016.6095        +309.9536
         8     -451687.2842        +329.3253
         9     -451498.9017        +188.3825
        10     -451354.9242        +143.9775
        11     -451292.5929         +62.3313
        12     -451248.7822         +43.8107
        13     -451214.4414         +34.3408
        14     -451185.6121         +28.8293
        15     -451154.3428         +31.2693
        16     -451144.3873          +9.9555
        17     -451134.9320          +9.4554
        18     -451121.2263         +13.7056
        19     -451110.0474         +11.1789
        20     -451104.6220          +5.4253
        21     -451099.1557          +5.4663
        22     -451090.9749          +8.1808
        23

Training dung


         1     -503493.7715             +nan
         2     -459642.7781      +43850.9933
         3     -450318.9018       +9323.8763
         4     -447575.9508       +2742.9510
         5     -446591.0450        +984.9057
         6     -446048.0251        +543.0199
         7     -445693.5027        +354.5225
         8     -445496.9724        +196.5302
         9     -445372.5109        +124.4615
        10     -445270.5524        +101.9586
        11     -445191.5668         +78.9855
        12     -445132.2080         +59.3588
        13     -445078.1514         +54.0566
        14     -445027.8087         +50.3427
        15     -445001.1742         +26.6345
        16     -444972.3899         +28.7843
        17     -444952.2580         +20.1320
        18     -444941.3400         +10.9180
        19     -444934.4578          +6.8822
        20     -444928.9291          +5.5287
        21     -444922.6220          +6.3071
        22     -444915.5142          +7.1078
        23

Training thoat


         1     -504289.7536             +nan
         2     -460085.3647      +44204.3889
         3     -450514.7827       +9570.5820
         4     -447799.8029       +2714.9799
         5     -446824.1210        +975.6818
         6     -446408.4844        +415.6366
         7     -446219.5878        +188.8966
         8     -446102.5889        +116.9988
         9     -446018.5464         +84.0425
        10     -445958.4448         +60.1016
        11     -445900.8453         +57.5995
        12     -445850.9774         +49.8678
        13     -445808.4074         +42.5700
        14     -445768.4943         +39.9131
        15     -445728.6487         +39.8457
        16     -445708.5119         +20.1368
        17     -445696.6987         +11.8132
        18     -445683.6064         +13.0922
        19     -445669.7281         +13.8784
        20     -445657.1513         +12.5767
        21     -445643.7615         +13.3899
        22     -445629.3074         +14.4541
        23

       184     -445468.8531          +0.0361
       185     -445468.8099          +0.0433
       186     -445468.7499          +0.0600
       187     -445468.6710          +0.0789
       188     -445468.5858          +0.0852
       189     -445468.5144          +0.0714
       190     -445468.4649          +0.0495
       191     -445468.4325          +0.0323
       192     -445468.4111          +0.0215
       193     -445468.3962          +0.0149
       194     -445468.3854          +0.0108
       195     -445468.3774          +0.0080


In [24]:
# Live check: for every command, record a fresh take (saved as '<word>.wav' in
# the working directory), score it with every model and report the best match.
for item in mapping:
    print('actual {}'.format(item))
    wav_path = '{}.wav'.format(item)
    record_sound(wav_path)
    mfcc = get_mfcc(wav_path)
    # One score per command model, in the same order as `mapping`.
    score = [model[k].score(mfcc) for k in range(len(mapping))]

    best_score = max(score)
    print('predict: {}'.format(mapping[score.index(best_score)]))
    print(score)

actual tien
predict: tien
[-23661.59705113398, -25483.131725546147, -25430.865227327773, -26205.08036419946, -27398.6505633732, -25961.70274722885, -25930.62330377547, -25861.18970658604, -29725.181062962718]
actual lui
predict: lui
[-27213.413383840976, -25133.772528894693, -26596.809862101825, -26595.366266794073, -29948.8440153576, -26900.827452859652, -26793.895027829756, -28128.401419412872, -26947.59262476596]
actual len
predict: len
[-27006.360228694168, -28429.725409252627, -24932.514253066725, -28160.205531392578, -31447.57437200141, -28397.2597798471, -26157.071542879065, -26459.419317007454, -29978.35877671495]
actual xuong
predict: xuong
[-26684.849486671115, -25219.605411676595, -26920.790363530003, -24347.121294614128, -27368.5488773054, -26257.803806103937, -27280.39180621681, -27275.560123063522, -26648.963423323858]
actual trai
predict: thoat
[-29018.07019291123, -29682.185303690356, -28347.2227258761, -27565.418439323676, -26886.31389295178, -27999.975080519227, -2830

In [25]:
# Persist the trained models. The context manager guarantees the file is
# flushed and closed, which a bare open() passed to pickle.dump does not.
with open('hmm.pk', 'wb') as model_file:
    pickle.dump(model, model_file)

In [26]:
# Reload the models saved above; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load an 'hmm.pk' that was
# produced by this notebook or comes from a trusted source.
with open('hmm.pk', 'rb') as model_file:
    model = pickle.load(model_file)

In [48]:
# def record_sound(filename, duration=1, fs=44100, play=False):
#     sd.play( np.sin( 2*np.pi*940*np.arange(fs)/fs )  , samplerate=fs, blocking=True)
# #     sd.play( np.zeros( int(fs*0.2) ), samplerate=fs, blocking=True)
#     data = sd.rec(frames=duration*fs, samplerate=fs, channels=1, blocking=True)
#     mfcc = librosa.feature.mfcc(data, sr=fs, n_fft=1024, hop_length=128)
#     return mfcc.T

In [54]:
def test():
    """Record one take to 'test.wav' and print the best-matching command.

    The take's MFCC features are scored by every model in `model`; the word
    from `mapping` at the highest-scoring position is printed, followed by the
    full list of scores. Nothing is returned.
    """
    # record_sound only writes the file and returns nothing, so its result is
    # not bound to a name.
    record_sound('test.wav')
    mfcc = get_mfcc('test.wav')
    # One score per command model, in the same order as `mapping`.
    score = [model[i].score(mfcc) for i in range(len(mapping))]
    print('predict: {} \n'.format(mapping[score.index(max(score))]))
    print(score)

In [56]:
# Interactive: records a new take from the microphone and prints the prediction.
test()

predict: xuong 

[-30308.512699671923, -27557.859525158416, -29135.373331544608, -25618.60626228744, -29509.30575680952, -28111.57414858887, -28418.410526302763, -28982.141270008415, -32759.8278367663]
