In [2]:
from hmmlearn import hmm
class ModelHMM(object):
    """Wrapper around a single diagonal-covariance ``hmm.GaussianHMM``.

    Parameters
    ----------
    num_components : int
        Number of hidden states of the HMM.
    num_iter : int
        Maximum number of training iterations passed to the HMM.
    """

    def __init__(self, num_components=8, num_iter=1000):
        self.n_components = num_components
        self.n_iter = num_iter
        # Define the covariance type and the type of HMM:
        self.cov_type = 'diag'
        self.model_name = 'GaussianHMM'
        # Every object returned by a call to train() is appended here:
        self.models = []
        # Define the model using the specified parameters:
        self.model = hmm.GaussianHMM(n_components=self.n_components,
                                     covariance_type=self.cov_type,
                                     n_iter=self.n_iter)

    def train(self, training_data):
        """Fit the HMM on ``training_data`` and record the fitted model.

        ``training_data`` is a 2D numpy array where each row has length equal
        to the number of MFCC coefficients.
        """
        # Floating-point warnings are silenced only for the duration of the
        # fit; np.seterr() would change the setting for the whole process and
        # never restore it.
        with np.errstate(all='ignore'):
            cur_model = self.model.fit(training_data)
        self.models.append(cur_model)

    def compute_score(self, input_data):
        """Return the log likelihood of ``input_data`` under the fitted model."""
        # Same scoped suppression as in train(), so scoring stays quiet without
        # relying on a global numpy setting left behind by an earlier call.
        with np.errstate(all='ignore'):
            return self.model.score(input_data)

In [3]:
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    Measure how much silence precedes the first audible chunk.

    sound is a pydub.AudioSegment
    silence_threshold in dB: a chunk quieter than this counts as silence
    chunk_size in ms: width of the window that is stepped over the sound

    Returns the offset in ms of the first chunk that is not silent. When no
    such chunk exists the result is the first multiple of chunk_size that is
    greater than or equal to len(sound).
    '''
    assert chunk_size > 0 # to avoid infinite loop

    offset_ms = 0 # ms
    total_ms = len(sound)
    while offset_ms < total_ms:
        window = sound[offset_ms:offset_ms + chunk_size]
        if not window.dBFS < silence_threshold:
            # First chunk with sound: stop scanning here.
            break
        offset_ms += chunk_size

    return offset_ms

In [4]:
import os
import numpy as np
from scipy.io import wavfile
from pydub import AudioSegment
import warnings
from python_speech_features import mfcc

def build_one_model(input_folder, num_states, num_cep_coeff):
    """Train one HMM on every ``.wav`` file found in ``input_folder``.

    Parameters
    ----------
    input_folder : str
        Path to the folder containing training wav files with the word.
    num_states : int
        Number of hidden states in the HMM.
    num_cep_coeff : int
        Number of MFCC features extracted from each time window.

    Returns
    -------
    tuple
        ``(model, model_score, num_cep_coeff)`` where ``model`` is the trained
        ``ModelHMM`` and ``model_score`` is the log likelihood of the training
        features under it.

    Raises
    ------
    ValueError
        If no features could be extracted (no usable wav file in the folder).
    """
    per_file_features = []
    training_files = [x for x in os.listdir(input_folder) if x.endswith('.wav')]
    for filename in training_files:
        filepath = os.path.join(input_folder, filename)
        sound = AudioSegment.from_file(filepath, format="wav")
        duration = len(sound)

        # Strip leading and trailing silence.
        start_trim = detect_leading_silence(sound)
        end_trim = detect_leading_silence(sound.reverse())
        end_ms = duration - end_trim
        if start_trim >= end_ms:
            # Nothing but silence: there is no speech to learn from.
            warnings.warn("Skipping %s: no audio left after trimming silence" % filepath)
            continue
        trimmed_sound = sound[start_trim:end_ms]
        trimmed_sound.export("tmp.wav", format="wav")

        # Read the *trimmed* audio back. Previously the features were computed
        # from the untrimmed file, so the trimming above had no effect.
        sampling_freq, signal = wavfile.read("tmp.wav")

        # Extract features
        # Default values:
        # winlen=0.025, winstep=0.01, nfft=512,
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            features_mfcc = mfcc(signal, sampling_freq, numcep=num_cep_coeff)

        per_file_features.append(features_mfcc)

    if not per_file_features:
        raise ValueError("No usable .wav training files found in %r" % (input_folder,))

    # Stack all per-file feature matrices once instead of re-allocating the
    # whole array with np.append on every iteration.
    # NOTE(review): the files are handed to the HMM as one long observation
    # sequence; hmmlearn can also take per-sequence lengths, but
    # ModelHMM.train does not expose them.
    X = np.concatenate(per_file_features, axis=0)

    # Create and train the HMM, then score the training sample with it.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        model = ModelHMM(num_components=num_states)
        model.train(X)
        model_score = model.compute_score(X)

    return model, model_score, num_cep_coeff



In [6]:
# Number of hidden states used for every per-word HMM (passed as num_states below).
num_components = 22
# Number of MFCC coefficients extracted per frame (passed as num_cep_coeff below).
num_cepstr = 10
def build_models(input_folder):
    """Fit one model per sub-folder of ``input_folder``.

    ``input_folder`` contains subfolders with samples of words in wav files;
    the name of each subfolder is used as the label. Entries that are not
    directories are ignored.

    Returns a list of ``(model, label)`` pairs, where ``model`` is whatever
    ``build_one_model`` returned for that subfolder.
    """
    speech_models = []

    # Sorted so the order of the result does not depend on the file system.
    for dirname in sorted(os.listdir(input_folder)):
        subfolder = os.path.join(input_folder, dirname)
        if not os.path.isdir(subfolder):
            continue

        # The directory name *is* the label. Slicing the joined path after the
        # last '/' breaks when os.path.join uses a different separator.
        label = dirname

        model = build_one_model(subfolder, num_states=num_components, num_cep_coeff=num_cepstr)
        speech_models.append((model, label))
        print("Fitted "+dirname)
    return speech_models

In [7]:
# Root folder holding one sub-folder of training recordings per label.
input_folder = "./MNIST_9-25"
# Fit one model per sub-folder; the result is a list of (model, label) pairs.
digit_models = build_models(input_folder)

Fitted num0
Fitted num1
Fitted num2
Fitted num3
Fitted num4
Fitted num5
Fitted num6
Fitted num7
Fitted num8
Fitted num9


In [8]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# the standalone joblib package is the supported replacement. The old location
# is kept as a fallback for environments that only have the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted models so later cells can reload them without retraining.
joblib.dump(digit_models, 'saved.pkl')



['saved.pkl']

In [9]:
# Reload the fitted models from disk. joblib.load unpickles the file, so only
# load a 'saved.pkl' you produced yourself: unpickling untrusted data can run code.
speech_models = joblib.load('saved.pkl')

In [10]:
from pydub.silence import split_on_silence

def new_chunks(filename, expected_chunks=10, max_iter=100):
    """Split a recording into ``expected_chunks`` pieces separated by silence.

    The minimum silence length given to ``split_on_silence`` starts at 100 and
    is raised when too many chunks are found and lowered when too few are
    found. The adjustment step starts at 20 and is halved after every attempt
    until it reaches 1.

    Parameters
    ----------
    filename : str
        Path to the wav file to split.
    expected_chunks : int
        Number of chunks to aim for (default 10).
    max_iter : int
        Upper bound on the number of adjustment attempts. Without it the
        search never ends for a file that cannot be split into the requested
        number of chunks.

    Returns
    -------
    list
        The chunks from the last split. This is a best effort: the list may
        hold more or fewer than ``expected_chunks`` items when the search gives
        up.
    """
    sound_file = AudioSegment.from_wav(filename)
    step = 20
    silence_len = 100
    thresh = -40
    audio_chunks = split_on_silence(sound_file, min_silence_len=silence_len, silence_thresh=thresh)
    num_signals = len(audio_chunks)
    prev_count = num_signals   # chunk count of the previous attempt
    prev_prev_count = 0        # chunk count of the attempt before that
    for _ in range(max_iter):
        if num_signals == expected_chunks:
            break
        if num_signals > expected_chunks:
            silence_len += step
        else:
            # Never let the minimum silence length drop to zero or below.
            silence_len = max(1, silence_len - step)
        if step > 1:
            step //= 2
        audio_chunks = split_on_silence(sound_file, min_silence_len=silence_len, silence_thresh=thresh)
        num_signals = len(audio_chunks)
        # Avoid looping forever when neighbouring silence lengths flip between
        # one chunk too few and one chunk too many: settle for one too many.
        if num_signals == prev_prev_count and prev_prev_count == expected_chunks + 1:
            break
        prev_prev_count = prev_count
        prev_count = num_signals
    return audio_chunks

In [11]:
def score_one_word(trained_model, test_file_path):
    """Return the log likelihood of one wav file under one trained model.

    trained_model: indexable whose item 0 is an object with a
        ``compute_score`` method and whose item 2 is the number of MFCC
        coefficients to extract
    test_file_path: path to wav file
    """
    rate, samples = wavfile.read(test_file_path)
    n_cepstra = trained_model[2]

    # Feature extraction uses the mfcc defaults:
    # winlen=0.025, winstep=0.01, nfilt=26, nfft=512,
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        features = mfcc(samples, rate, numcep=n_cepstra)
        # Score while warnings are still suppressed.
        return trained_model[0].compute_score(features)

In [12]:
from scipy.io import wavfile
def teln(audio_chunks, models=None, max_digits=10):
    """Recognise a number from per-digit audio chunks.

    Each chunk is exported to ``tmp.wav`` and scored against every model; the
    last character of the best-scoring label is appended to the result.

    Parameters
    ----------
    audio_chunks : iterable
        Chunks with an ``export(path, format=...)`` method, one per digit.
    models : iterable of (model, label), optional
        Models to score against. Defaults to the module-level
        ``speech_models``.
    max_digits : int
        Chunks beyond this count are ignored (the splitter may hand over one
        chunk too many when it could not find exactly ten).

    Returns
    -------
    str
        One character per processed chunk.

    Raises
    ------
    ValueError
        If there is a chunk to recognise but no model to score it with.
    """
    if models is None:
        models = speech_models

    digits = []
    for index, chunk in enumerate(audio_chunks):
        if index >= max_digits:
            break
        chunk.export("tmp.wav", format="wav")

        # Reset the winner for every chunk so a chunk can never inherit the
        # label chosen for the previous one.
        best_label = None
        best_score = -float('inf')
        # Run the current chunk through all the HMM models and pick the one
        # with the highest score.
        for model, label in models:
            score = score_one_word(model, "tmp.wav")
            if score != score:
                # NaN compares false against everything; rank it lowest.
                score = -float('inf')
            if best_label is None or score > best_score:
                best_score = score
                best_label = label

        if best_label is None:
            raise ValueError("teln: no speech models available to score the audio")
        digits.append(best_label[-1])
    return "".join(digits)

In [13]:
input_folder = "./test/6/"
test_files = [x for x in os.listdir(input_folder) if x.endswith('.wav')]
tels = dict()
for filename in test_files:
    # Split the recording into per-digit chunks (path built from input_folder
    # so the folder is named in one place only):
    a_ch = new_chunks(os.path.join(input_folder, filename))
    # Recognise the digits:
    num = teln(a_ch)
    # The file number is everything before the first '.', e.g. "17.wav" -> 17.
    # Parsing it in one go handles numbers of any length.
    n = int(filename.split('.', 1)[0])
    # Store the recognised number under the file number:
    tels[n] = num

In [14]:
# Write one "<file number>,<recognised number>" line per processed file, in
# ascending file-number order. Iterating over the keys actually present (rather
# than a fixed range) avoids a KeyError when a file is missing, and building all
# lines before opening the file means a bad entry cannot leave it half-written.
answer_lines = ["{},{}\n".format(i, tels[i]) for i in sorted(tels)]
with open("answ.txt", "w") as answers_file:
    answers_file.writelines(answer_lines)