In [1]:
import numpy as np
import torch
import os
import sys

from torch import optim
import time
from pytorch_model_summary import summary

sys.path.insert(1, '../utils')
sys.path.insert(1, '../models')

from data import load_data

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(1,8,3)
        self.maxpool1 = nn.AdaptiveMaxPool2d(8)
        self.dropout = nn.Dropout(0.2)
        
        self.fc1 = nn.Linear(8*8*8, 256)
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = x.view(-1,8*8*8)
        x = F.relu(self.fc1(x))
        
        return x
    
class AudioLSTM(nn.Module):
    def __init__(self, batch_size, hidden_size, num_layers): #hid_size = fc1 out size
        super(AudioLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.encoder = Encoder()
        self.num_directions = 2
        self.lstm_layers = 2
        self.lstm = nn.LSTM(input_size=8*8*4, 
                            hidden_size = hidden_size,
                            num_layers=num_layers, 
                            batch_first=True,
                            dropout=0.5,
                           bidirectional=True)
        
        self.fc1 = nn.Linear(256*3, 128)
        self.fc2 = nn.Linear(128,1)
        self.dropout = nn.Dropout(0.5)
        

    
    def forward(self, prev, curr, nex):
#         h_0, c_0 = self.init_hidden(1)
#         print(curr.shape)
        (_, ht, wd) = curr.shape
        prev = self.encoder(prev.view(1,1,ht, wd)).view(-1)
        curr = self.encoder(curr.view(1,1,ht, wd)).view(-1)
        nex = self.encoder(nex.view(1,1,ht, wd)).view(-1)
        
        stacked = torch.stack([prev, curr, nex]).view(-1,3,256)
#         print(stacked.shape)
        x, (h_n, c_n)= self.lstm(stacked)
#     output dim: [3,]
#         print(x.shape)
        x = x.view(-1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
        

In [1]:
import librosa
from torch.utils.data import Dataset

class CustomStutterData(Dataset):
    def __init__(self, root_dir, annotations_path):
        self.root_dir = root_dir
                
        self.annotations = np.load(annotations_path)[:-1]
        
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        
        audio, sr = librosa.core.load(os.path.join(self.root_dir,
                                        self.annotations[idx][0].split('/')[-1]))
        if idx != 0:
            audio_prev, sr = librosa.core.load(os.path.join(self.root_dir,
                                        self.annotations[idx-1][0].split('/')[-1]))
            
        if idx != len(self.annotations)-1:
            audio_next, sr = librosa.core.load(os.path.join(self.root_dir,
                                        self.annotations[idx+1][0].split('/')[-1]))
        if idx == 0:
            mfcc_prev = torch.zeros(13,22)
        else:
            mfcc_prev = torch.from_numpy(librosa.feature.mfcc(audio_prev, sr, n_mfcc=13))
            
        mfcc = torch.from_numpy(librosa.feature.mfcc(audio, sr, n_mfcc = 13))
        
        if idx == len(self.annotations)-1:
            mfcc_next = torch.zeros(13,22)
        else:
            mfcc_next = torch.from_numpy(librosa.feature.mfcc(audio_next, sr, n_mfcc=13))
        
        label = float(self.annotations[idx][1])
        
        sample ={'mfcc': mfcc, 'label':label, 'prev_mfcc':mfcc_prev, 'next_mfcc':mfcc_next}
        
        return sample

In [2]:
class CustomDatasetInfer(Dataset):
    def __init__(self, filename):
        self.file_name = filename
        self.mfcc_all = []
        self.all_blocks=[]
        self.audio, self.sr = librosa.load(filename)

        buffer = int(0.5 * self.sr)

        samples_total = len(self.audio)
        samples_checked = 0
        counter = 1
        while samples_checked < samples_total:
                print(samples_checked, samples_total)
                #check if the buffer is not exceeding total samples 
                if buffer > (samples_total - samples_checked):
                    buffer = samples_total - samples_checked

                block = self.audio[samples_checked : (samples_checked + buffer)]
        #         print(type(block), .shape)
                #check if block is stuttered
                self.all_blocks.append(block)
                mfcc = librosa.feature.mfcc(block, self.sr, n_mfcc=13)
                self.mfcc_all.append(mfcc)
                counter += 1
                samples_checked += buffer
                
        # add conditional logic to only pop if insufficient length
        self.all_blocks.pop(-1)
        self.mfcc_all.pop(-1)
    def _len__(self):
        len(self.all_blocks)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        
        audio = torch.from_numpy(self.all_blocks[idx])
        
        if idx == 0:
            mfcc_prev = torch.zeros(13,22)
        else:
            mfcc_prev = torch.from_numpy(self.mfcc_all[idx-1])
            
        mfcc = torch.from_numpy(self.mfcc_all[idx])
        
        if idx == len(self.all_blocks)-1:
            mfcc_next = torch.zeros(13,22)
        else:
            mfcc_next = torch.from_numpy(self.mfcc_all[idx+1])
        
        sample ={'mfcc': mfcc, 'prev_mfcc':mfcc_prev, 'next_mfcc':mfcc_next, 'sr':self.sr}
        
        return sample

In [11]:
csd = CustomStutterData('./split/', annotations_path='./split/stutter1_annotations.npy')

In [12]:
inf = CustomDatasetInfer('record.wav')

0 161792
11025 161792
22050 161792
33075 161792
44100 161792
55125 161792
66150 161792
77175 161792
88200 161792
99225 161792
110250 161792
121275 161792
132300 161792
143325 161792
154350 161792


In [13]:
from torch.utils.data.sampler import SubsetRandomSampler, SequentialSampler

def load_data(dataset, batch_size, validation_split=0.2, shuffle_dataset=True, random_seed=42):

    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset :
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[:dataset_size-split], indices[dataset_size-split:]
    
    if shuffle_dataset:
        
        train_sampler = SubsetRandomSampler(train_indices)
        valid_sampler = SubsetRandomSampler(val_indices)
    else:
        print('seq')
        train_sampler = SequentialSampler(train_indices)
        valid_sampler = SequentialSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler)
    return train_loader, validation_loader

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")     #Check whether a GPU is present.

model = AudioLSTM(1, 8*8*2,2)

In [15]:
lr = 0.001
batch_size = 1
epochs = 25
validation_split=0.2
shuffle_dataset=True
random_seed=42

# optimizer = optim.SGD(model.parameters(), lr = 0.00001, momentum=0.9, weight_decay=5e-4)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150, 200], gamma=0.1)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = lr)

In [16]:
train_loader, validation_loader = load_data(csd, batch_size, validation_split=0.2, shuffle_dataset=False, random_seed=42)

seq


In [17]:
print(summary(AudioLSTM(1,8*8*2,2), csd[0]['prev_mfcc'].view(-1,13,22),csd[0]['prev_mfcc'].view(-1,13,22),csd[0]['prev_mfcc'].view(-1,13,22) , show_input=True))

-----------------------------------------------------------------------
      Layer (type)         Input Shape         Param #     Tr. Param #
         Encoder-1      [1, 1, 13, 22]         131,408         131,408
            LSTM-2         [1, 3, 256]         790,528         790,528
          Linear-3               [768]          98,432          98,432
          Linear-4               [128]             129             129
Total params: 1,020,497
Trainable params: 1,020,497
Non-trainable params: 0
-----------------------------------------------------------------------


In [71]:
model.to(device)

AudioLSTM(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
    (maxpool1): AdaptiveMaxPool2d(output_size=8)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc1): Linear(in_features=512, out_features=256, bias=True)
  )
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [18]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [19]:
model.to('cuda')

AudioLSTM(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
    (maxpool1): AdaptiveMaxPool2d(output_size=8)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc1): Linear(in_features=512, out_features=256, bias=True)
  )
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [20]:
next(model.lstm.parameters()).is_cuda

True

In [21]:
import shutil

In [76]:
model.train()

AudioLSTM(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
    (maxpool1): AdaptiveMaxPool2d(output_size=8)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc1): Linear(in_features=512, out_features=256, bias=True)
  )
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [78]:
valid_loss_min = float('inf') #init val_loss
checkpoint_path = '../models/audioCNNLSTM_ckpt.pth'
best_model_path = '../models/audioCNNLSTM_best_ckpt.pth'
for epoch in range(epochs):
    losses=[]
#     scheduler.step()
    
    start = time.time()
    
    for b_idx, x in enumerate(train_loader):
#         print(b_idx)
#         print(x['mfcc'].shape)
#         print(x['mfcc'])
        prev, curr, nex, targets = x['prev_mfcc'].to(device), x['mfcc'].to(device),x['next_mfcc'].to(device), x['label'].to(device)
        
        optimizer.zero_grad()
#         print(next(model.parameters()).is_cuda)
        op = model(prev, curr, nex).view(-1)
#         print(op[0], targets[0])
#         print(type(op.view(-1)[0]), type(targets[0]))
#         print(b_idx, op)
#         print(targets)
        loss = criterion(op, targets)
        loss.backward()
        
        optimizer.step()
        losses.append(loss.item())
        end = time.time()
        if b_idx % 100 == 0:
            print('Batch Index : %d Loss : %.10f Time : %.3f seconds ' % (b_idx, np.mean(losses), end - start))    
    model.eval()
    total = 0
    correct = 0
    acc = 0
    
    with torch.no_grad():
        for b_idx, x in enumerate(validation_loader):
            prev, curr, nex, targets =  x['prev_mfcc'].to(device), x['mfcc'].to(device), x['next_mfcc'].to(device), x['label'].to(device)

            outputs = torch.sigmoid(model(prev, curr, nex))
#             print(outputs, targets)
            
            
            predicted = torch.round(outputs.data)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()
#             print(targets, '\n targets', predicted)
            valid_loss = criterion(predicted.view(-1), targets.data)
            acc = 100.*correct/total
        print('Epoch : %d Val_Acc : %.3f Val_loss: %.3f' % (epoch, acc, valid_loss))
        print('--------------------------------------------------------------')
    checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'valid_acc': acc,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }

    save_ckp(checkpoint, False, checkpoint_path, best_model_path)
    
    if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            # save checkpoint as best model
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
    model.train()   

Batch Index : 0 Loss : 0.0762666687 Time : 0.032 seconds 
Batch Index : 100 Loss : 0.3615717761 Time : 2.265 seconds 
Batch Index : 200 Loss : 0.4114687503 Time : 4.519 seconds 
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
tensor([1.], device='cuda:0', dtype=torch.float64) 
 targets tensor([1.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([1.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([1.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
tensor([0.], device='cuda:0', dtype=torch.float64) 
 targets tensor([0.], device='cuda:0')
ten

In [22]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

In [23]:
checkpoint_fpath = '../models/audioCNNLSTM_best_ckpt.pth'
model,optimizer, chkpt, v_loass = load_ckp(checkpoint_fpath, model, optimizer)

In [24]:
model.to('cuda')

AudioLSTM(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
    (maxpool1): AdaptiveMaxPool2d(output_size=8)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc1): Linear(in_features=512, out_features=256, bias=True)
  )
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [29]:
def correct_audio_in_chunks(filename, seg_length=0.5, zero_fill = False):
    '''
    Correct stuttered speech in chunks of duration seg_length

    Correct each chunk of audio and merge the corrected chunks together. Effectiveness depends on the audio in question and the size of the chunks.

    Parameters:
    filename (string): The path of the stuttered audio file
    seg_length (number): Length of the chunks in seconds
    zero_fill (boolean): (Optional) Whether the stuttered bits are removed or replaced with zeroes

    Returns:
    y (numpy ndarray): The sampled amplitude of the corrected audio
    sr (number): The sampling rate of the corrected audio
    '''
    inf = CustomDatasetInfer(filename)
    
    buffer = int(seg_length * inf[0]['sr'])

    samples_total = len(inf.all_blocks)
    samples_checked = 0
    counter = 1
    
    corrected_audio = np.array([])
    indices = list(range(len(inf.all_blocks)))
    sampler = SequentialSampler(indices)
    
    loader = torch.utils.data.DataLoader(inf, batch_size=1, 
                                               sampler=sampler)
    for idx, x in enumerate(loader):
        prev, curr, nex = x['prev_mfcc'].to(device), x['mfcc'].to(device),x['next_mfcc'].to(device)
        op = model(prev, curr, nex).view(-1)
        outputs = torch.sigmoid(model(prev, curr, nex))
#             print(outputs, targets)
            
        predicted = torch.round(outputs.data)
        print(predicted)

        if predicted == 0.0:
            corrected_audio = np.concatenate((corrected_audio, np.array(inf.all_blocks[idx])))
        counter += 1
        samples_checked += buffer
    return corrected_audio, inf.sr

In [162]:
for x in inf:
    print(x['mfcc'].shape)

torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])
torch.Size([13, 22])


In [163]:
x,y =librosa.core.load('record.wav')

In [145]:
len(x)

161792

In [143]:
len(ca)

132300

In [30]:
ca, sr = correct_audio_in_chunks('stutter1.wav')

0 3585389
11025 3585389
22050 3585389
33075 3585389
44100 3585389
55125 3585389
66150 3585389
77175 3585389
88200 3585389
99225 3585389
110250 3585389
121275 3585389
132300 3585389
143325 3585389
154350 3585389
165375 3585389
176400 3585389
187425 3585389
198450 3585389
209475 3585389
220500 3585389
231525 3585389
242550 3585389
253575 3585389
264600 3585389
275625 3585389
286650 3585389
297675 3585389
308700 3585389
319725 3585389
330750 3585389
341775 3585389
352800 3585389
363825 3585389
374850 3585389
385875 3585389
396900 3585389
407925 3585389
418950 3585389
429975 3585389
441000 3585389
452025 3585389
463050 3585389
474075 3585389
485100 3585389
496125 3585389
507150 3585389
518175 3585389
529200 3585389
540225 3585389
551250 3585389
562275 3585389
573300 3585389
584325 3585389
595350 3585389
606375 3585389
617400 3585389
628425 3585389
639450 3585389
650475 3585389
661500 3585389
672525 3585389
683550 3585389
694575 3585389
705600 3585389
716625 3585389
727650 3585389
738675 35

In [31]:
import IPython
from IPython.display import display, Markdown, clear_output
# widget packages
import ipywidgets as widgets


IPython.display.Audio(data=ca, rate=sr)

In [32]:
import soundfile

In [33]:
soundfile.write('corrected.wav', ca, sr)

In [None]:
mfcc_all[0].shape

In [None]:
torch.ze

In [92]:
model(torch.zeros(13,22).view(-1,13,22), mfcc[0].view(-1,13,22), mfcc[1].view(-1,13,22))

TypeError: view() takes at most 2 arguments (3 given)

In [80]:
def correct_audio_segment(y, sr, zero_fill = False):
    '''
    Correct the audio specified audio chunk

    Corrects the audio clip by predicting the threshold corresponding to the maximum amplitude of the chunk, and removing the stuttered clips if zero_fill is set to False, or replacing said clips with zeroes if zero_fill is set to True.

    Parameters:
    y (list): The sampled amplitude of the soundwave
    sr (number): The sampling rate in hertz
    zero_fill (boolean): (Optional) Whether the stuttered bits are removed or replaced with zeroes
    
    Returns:
    y (list): The sampled amplitude of the corrected audio segment
    sr (number): The sampling rate of the corrected audio segment
    '''
    if len(y) == 0:
      print('here')
      return y, sr
    
    maxv = max(y)
    mfcc = librosa.feature.mfcc(y,sr)
#     print(torch.from_numpy(mfcc))
    st = torch.round(torch.sigmoid(model.forward(torch.from_numpy(mfcc).view((-1,mfcc.shape[0],mfcc.shape[1])))))
#     print(st)
#     if st < 1:
#         print(y.shape)
    return y, sr
#     else:
#         return np.array([]), sr
#     pred_thresh = model.predict()
#     frame_duration = 0.3
#     frame_len = int(frame_duration * sr)
#     n = len(y)
#     num_frames = int(n // frame_len)

#     corrected_audio_signal = []

#     count = -1

#     for i in range(num_frames):
#       frame = y[(i) * frame_len : frame_len * (i+1)]
#       frame_max = max(frame)
#       if (frame_max > pred_thresh):
#         count += 1
#         corrected_audio_signal[(count)*frame_len:frame_len*(count+1)] = frame
#       else:
#         if (zero_fill):
#           count += 1
#           # Zero fill
#           corrected_audio_signal[(count)*frame_len:frame_len*(count+1)] = np.repeat([0], frame_len)
#         else:
#           print('skipped frame with max {}'.format(frame_max))
#     corrected_audio_signal = np.array(corrected_audio_signal)
    return corrected_audio_signal, sr

In [81]:
model.to('cpu')

AudioLSTM(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
    (maxpool1): AdaptiveMaxPool2d(output_size=8)
    (dropout): Dropout(p=0.2, inplace=False)
    (fc1): Linear(in_features=512, out_features=256, bias=True)
  )
  (lstm): LSTM(256, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
file_name = 'record.wav'

x = 