In [6]:
# pip install git+https://github.com/onnx/onnx-tensorflow.git

In [2]:
import io
import numpy as np
import onnx
from onnx_tf.backend import prepare
from torch import nn
import tensorflow as tf
from tensorflow.keras import models
import torch.utils.model_zoo as model_zoo
import torch.onnx
import torchaudio
import librosa

In [8]:
import numpy as np
import math

class SincConv(torch.nn.Module):
    """1-D convolution whose kernels are learnable band-pass (sinc) filters.

    Every output channel is described by two learnable values: a lower
    cut-off (``low_hz_``) and a bandwidth (``band_hz_``).  They are initialised
    so that the filters are equally spaced on the mel scale, and the actual
    convolution kernels are rebuilt from them on every forward pass.

    Args:
        out_channels: number of band-pass filters.
        kernel_size: filter length in samples; an even value is increased by
            one so that the filter is symmetric around a centre tap.
        sample_rate: sampling rate of the input waveform in Hz.
        in_channels: must be 1.
        stride: stride forwarded to ``conv1d``.
        padding: padding forwarded to ``conv1d``.
        min_low_hz: constant added to ``|low_hz_|`` to obtain the lower cut-off.
        min_band_hz: constant added to ``|band_hz_|`` to obtain the bandwidth.

    Raises:
        ValueError: if ``in_channels`` is not 1.
    """

    @staticmethod
    def to_mel(hz):
        """Map a frequency in Hz onto the mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Map a mel-scale value back to Hz (inverse of ``to_mel``)."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1, stride=1, padding=0, min_low_hz=50, min_band_hz=50):
        super().__init__()

        if in_channels != 1:
            raise ValueError(
                "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            )

        self.out_channels = out_channels
        # Force an odd filter length so the kernel is perfectly symmetric.
        self.kernel_size = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size

        # Plain hyper-parameters.
        self.stride = stride
        self.padding = padding
        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # Band edges equally spaced on the mel scale, from 30 Hz up to just
        # below the Nyquist frequency.
        low_hz = 30
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)
        mel_edges = np.linspace(self.to_mel(low_hz), self.to_mel(high_hz), self.out_channels + 1)
        hz_edges = self.to_hz(mel_edges)

        # Learnable lower cut-off of every filter, shape (out_channels, 1).
        self.low_hz_ = torch.nn.Parameter(torch.Tensor(hz_edges[:-1]).view(-1, 1))
        # Learnable bandwidth of every filter, shape (out_channels, 1).
        self.band_hz_ = torch.nn.Parameter(torch.Tensor(np.diff(hz_edges)).view(-1, 1))

        # Left half of a Hamming window; the right half follows by symmetry.
        half_length = self.kernel_size / 2
        n_lin = torch.linspace(0, half_length - 1, steps=int(half_length))
        self.window_ = 0.54 - 0.46 * torch.cos(2 * math.pi * n_lin / self.kernel_size)

        # Left half of the time axis scaled by 2*pi/sample_rate, shape (1, kernel_size // 2).
        n = (self.kernel_size - 1) / 2.0
        self.n_ = 2 * math.pi * torch.arange(-n, 0).view(1, -1) / self.sample_rate

    def forward(self, waveforms):
        """Build the band-pass kernels and convolve ``waveforms`` with them.

        The kernels have a single input channel, so ``waveforms`` must be a
        one-channel input accepted by ``conv1d``.  The kernels used for the
        call are kept on ``self.filters``.
        """
        # The window and time axis are plain tensors, so move them explicitly.
        device = waveforms.device
        self.n_ = self.n_.to(device)
        self.window_ = self.window_.to(device)

        lower = self.min_low_hz + torch.abs(self.low_hz_)
        upper = torch.clamp(
            lower + self.min_band_hz + torch.abs(self.band_hz_),
            self.min_low_hz,
            self.sample_rate / 2,
        )
        bandwidth = (upper - lower)[:, 0]

        phase_low = torch.matmul(lower, self.n_)
        phase_high = torch.matmul(upper, self.n_)

        # Expanded and simplified form of Eq. 4 of the SincNet paper
        # ("Speaker Recognition from Raw Waveform with SincNet"): only the left
        # half is computed, the right half is its mirror image.
        left_half = ((torch.sin(phase_high) - torch.sin(phase_low)) / (self.n_ / 2)) * self.window_
        centre_tap = 2 * bandwidth.view(-1, 1)
        right_half = torch.flip(left_half, dims=[1])

        kernels = torch.cat([left_half, centre_tap, right_half], dim=1)
        # Normalise so that the centre tap of every filter equals one.
        kernels = kernels / (2 * bandwidth[:, None])

        self.filters = kernels.view(self.out_channels, 1, self.kernel_size)
        return torch.nn.functional.conv1d(
            waveforms,
            self.filters,
            stride=self.stride,
            padding=self.padding,
            dilation=1,
            bias=None,
            groups=1,
        )

In [9]:
class LogAbs(torch.nn.Module):
    """Parameter-free layer applying the element-wise compression ``log(|x| + 1)``."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``log(|x| + 1)`` computed element-wise on ``x``."""
        magnitude = x.abs()
        return (magnitude + 1).log()

In [10]:
def SincConvBlock(c, k, s, sample_rate=16000):
    """Build the front-end block: SincConv -> LogAbs -> BatchNorm1d -> AvgPool1d.

    Args:
        c: number of sinc filters (output channels).
        k: kernel size handed to ``SincConv``; the padding is ``k // 2``.
        s: stride of the sinc convolution.
        sample_rate: sampling rate in Hz handed to ``SincConv``.

    Returns:
        A ``torch.nn.Sequential`` holding the four layers in that order.
    """
    layers = [
        SincConv(out_channels=c, kernel_size=k, sample_rate=sample_rate, in_channels=1, stride=s, padding=k // 2),
        LogAbs(),
        torch.nn.BatchNorm1d(c),
        # Halve the time resolution.
        torch.nn.AvgPool1d(kernel_size=2, stride=2),
    ]
    return torch.nn.Sequential(*layers)

def DSConvBlock(c_in, c, k, s):
    """Build a depthwise-separable convolution block.

    Layer order: depthwise Conv1d -> pointwise Conv1d -> ReLU -> BatchNorm1d
    -> AvgPool1d(2) -> Dropout(0.1).

    Args:
        c_in: number of input channels.
        c: number of output channels.
        k: kernel size of the depthwise convolution; the padding is ``k // 2``.
        s: stride of the depthwise convolution.

    Returns:
        A ``torch.nn.Sequential`` holding the six layers in that order.
    """
    stages = []
    # Depthwise stage: groups == c_in gives every input channel its own k-tap filter.
    stages.append(torch.nn.Conv1d(c_in, c_in, kernel_size=k, stride=s, padding=k // 2, groups=c_in))
    # Pointwise stage: kernel size 1, mixes the channels from c_in to c.
    stages.append(torch.nn.Conv1d(c_in, c, kernel_size=1, stride=1))
    stages.extend([
        torch.nn.ReLU(),
        torch.nn.BatchNorm1d(c),
        torch.nn.AvgPool1d(2),
        torch.nn.Dropout(0.1),
    ])
    return torch.nn.Sequential(*stages)

In [11]:
# Full classifier: sinc front end, five depthwise-separable blocks, then
# pooling over time and a linear layer producing 35 softmax outputs.
sinc_conv_model = torch.nn.Sequential(
    SincConvBlock(c=40, k=101, s=8),
    DSConvBlock(c_in=40, c=160, k=25, s=2),
    # Four identical 160 -> 160 blocks, built in order.
    *(DSConvBlock(c_in=160, c=160, k=9, s=1) for _ in range(4)),
    torch.nn.AvgPool1d(kernel_size=15),
    torch.nn.Flatten(),
    torch.nn.Linear(in_features=160, out_features=35),
    torch.nn.Softmax(dim=1),
)

In [33]:
# Alias the SincConv architecture and point at the checkpoint file that is
# loaded into it in the next cell.
torch_model =sinc_conv_model
model_path = 'Trained_Models/SincConv_model.pth'

In [34]:
# Restore the trained weights, mapping every tensor onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that come from a trusted source.
torch_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
# set the model to inference mode
torch_model.eval()

Sequential(
  (0): Sequential(
    (0): SincConv()
    (1): LogAbs()
    (2): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (1): Sequential(
    (0): Conv1d(40, 40, kernel_size=(25,), stride=(2,), padding=(12,), groups=40)
    (1): Conv1d(40, 160, kernel_size=(1,), stride=(1,))
    (2): ReLU()
    (3): BatchNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
    (5): Dropout(p=0.1, inplace=False)
  )
  (2): Sequential(
    (0): Conv1d(160, 160, kernel_size=(9,), stride=(1,), padding=(4,), groups=160)
    (1): Conv1d(160, 160, kernel_size=(1,), stride=(1,))
    (2): ReLU()
    (3): BatchNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
    (5): Dropout(p=0.1, inplace=False)
  )
  (3): Sequential(
    (0): Conv

In [35]:
# Load the test clip; torchaudio returns the samples and their sample rate.
waveform, fs = torchaudio.load('test1.wav')
# The SincConv front end is built with sample_rate=16000, so reject audio
# recorded at any other rate instead of silently classifying it.
if fs != 16000:
    raise ValueError("test1.wav has a sample rate of %d Hz, but the model expects 16000 Hz" % fs)
# Add a leading batch dimension for the model.
inp=waveform.unsqueeze(0)

In [36]:
# Confirm the shape of the batched input before it is fed to the model.
print(inp.shape)

torch.Size([1, 1, 16000])


In [37]:
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    torch_out = torch_model(inp)

In [38]:
# Index of the highest-scoring output for each item in the batch.
print(torch_out.argmax(dim=-1))

tensor([2])


In [39]:
# Export the PyTorch model to "sinc_conv.onnx" (ONNX opset 10), using `inp` as the example input.
torch.onnx.export(torch_model, inp, "sinc_conv.onnx", opset_version=10)

In [40]:
# Reload the exported ONNX file and hand it to the onnx-tf backend's `prepare`.
model_onnx = onnx.load('sinc_conv.onnx')
tf_rep = prepare(model_onnx) 

In [41]:
# Write the TensorFlow representation to disk. Despite the ".pb" suffix, the
# log output shows this is a directory (it contains an "assets" folder).
tf_rep.export_graph("model_sinc.pb")



INFO:tensorflow:Assets written to: model_sinc.pb\assets


INFO:tensorflow:Assets written to: model_sinc.pb\assets


## Converting SincConv to TFLite

In [42]:

# Convert the model
# Read the SavedModel directory from the same relative location that
# tf_rep.export_graph wrote it to, so the notebook does not depend on one
# user's absolute Windows path.
converter = tf.lite.TFLiteConverter.from_saved_model('model_sinc.pb')  # path to the SavedModel directory
tflite_model = converter.convert()

# Save the model.
with open('model_sinc.tflite', 'wb') as f:
    f.write(tflite_model)

## Converting MFCC to TFLite

This is a much more straightforward transformation, as the MFCC model is natively built using TensorFlow.

In [4]:
keras_model_filename = '14_03_2021__21_28.h5'
tflite_filename = 'mfcc_model.tflite'

# Convert model to TF Lite model
model = models.load_model(keras_model_filename)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write through a context manager so the file handle is flushed and closed.
with open(tflite_filename, 'wb') as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: C:\Users\cferr\AppData\Local\Temp\tmp4a914hxl\assets


1122268