In [21]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

from torch import nn
import torchaudio

import numpy as np
import os
from scipy.io import wavfile

tensor([1.], device='mps:0')


In [22]:
# EDIT THIS SECTION FOR USER INPUTS
#
name = 'test'
in_file = 'data/ts9_test1_in_FP32.wav'
out_file = 'data/ts9_test1_out_FP32.wav'
epochs = 1

train_mode = 0     # 0 = speed training, 
                   # 1 = accuracy training 
                   # 2 = extended training

input_size = 150 
batch_size = 4096 
test_size = 0.2

if not os.path.exists('models/'+name):
    os.makedirs('models/'+name)
else:
    print("A model with the same name already exists. Please choose a new name.")
    exit

A model with the same name already exists. Please choose a new name.


In [23]:
def save_wav(name, data):
    wavfile.write(name, 44100, data.flatten().astype(np.float32))

def normalize(data):
    data_max = max(data)
    data_min = min(data)
    data_norm = max(data_max,abs(data_min))
    return data / data_norm

In [24]:
# Load and Preprocess Data ###########################################
in_rate, in_data = wavfile.read(in_file)
out_rate, out_data = wavfile.read(out_file)

X_all = in_data.astype(np.float32).flatten()  
y_all = out_data.astype(np.float32).flatten() 

# Get the last 20% of the wav data for testing and thee rest for training
y_training, y_testing = np.split(y_all, [int(len(y_all)*.8)])
X_training, X_testing = np.split(X_all, [int(len(X_all)*.8)])
print(f"y_training shape: {y_training.shape}")
print(f"X_training shape: {X_training.shape}")

# The input size defines the number of samples used for each prediction
# Therefore the first output value that we get is at index input_size-1
y_ordered_training = y_training[input_size-1:]
y_ordered_training = torch.from_numpy(y_ordered_training)
print(f"y_ordered_training shape: {y_ordered_training.shape}")

indices = np.arange(input_size) + np.arange(len(X_training)-input_size+1)[:,np.newaxis]
indices = torch.from_numpy(indices)
X_training = torch.from_numpy(X_training)
X_ordered_training = torch.zeros_like(indices, dtype=torch.float32)
for i, j in enumerate(indices):
    X_ordered_training[i] = torch.gather(X_training, 0, indices[i])
    
print(f"X_ordered_training shape: {X_ordered_training.shape}")

y_training shape: (6587907,)
X_training shape: (6587907,)
y_ordered_training shape: torch.Size([6587758])
X_ordered_training shape: torch.Size([6587758, 150])


In [25]:
training_dataset = torch.utils.data.TensorDataset(X_ordered_training, y_ordered_training)
training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True)

for batch, (X, y) in enumerate(training_dataloader):
    print(f"Batch: {batch}")
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    print(f"This is the first audio chunk in the batch: {X[0]}")
    print(f"This is the first target value in the batch: {y[0]}")
    break

# The input size defines the number of samples used for each prediction
# Therefore the first output value that we get is at index input_size-1
y_ordered_testing = y_testing[input_size-1:]
y_ordered_testing = torch.from_numpy(y_ordered_testing)
print(f"y_ordered_testing shape: {y_ordered_testing.shape}")

indices_testing = np.arange(input_size) + np.arange(len(X_testing)-input_size+1)[:,np.newaxis]
indices_testing = torch.from_numpy(indices_testing)
X_testing = torch.from_numpy(X_testing)
X_ordered_testing = torch.zeros_like(indices_testing, dtype=torch.float32)
for i, j in enumerate(indices_testing):
    X_ordered_testing[i] = torch.gather(X_testing, 0, indices[i])

testing_dataset = torch.utils.data.TensorDataset(X_ordered_testing, y_ordered_testing)
testing_dataloader = torch.utils.data.DataLoader(testing_dataset, batch_size=batch_size)

for batch, (X, y) in enumerate(testing_dataloader):
    print(f"Batch: {batch}")
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    print(f"This is the first audio chunk in the batch: {X[0]}")
    print(f"This is the first target value in the batch: {y[0]}")
    break



Batch: 0
Shape of X [N, C, H, W]: torch.Size([4096, 150])
Shape of y: torch.Size([4096]) torch.float32
This is the first audio chunk in the batch: tensor([ 0.0328,  0.0392,  0.0453,  0.0513,  0.0569,  0.0623,  0.0674,  0.0722,
         0.0766,  0.0807,  0.0844,  0.0878,  0.0908,  0.0934,  0.0956,  0.0975,
         0.0989,  0.1000,  0.1007,  0.1010,  0.1011,  0.1008,  0.1002,  0.0995,
         0.0985,  0.0974,  0.0959,  0.0944,  0.0926,  0.0907,  0.0887,  0.0865,
         0.0844,  0.0822,  0.0798,  0.0775,  0.0750,  0.0725,  0.0700,  0.0675,
         0.0650,  0.0625,  0.0601,  0.0577,  0.0554,  0.0531,  0.0509,  0.0487,
         0.0465,  0.0444,  0.0424,  0.0404,  0.0385,  0.0366,  0.0349,  0.0332,
         0.0316,  0.0302,  0.0287,  0.0274,  0.0261,  0.0249,  0.0236,  0.0224,
         0.0212,  0.0200,  0.0188,  0.0177,  0.0165,  0.0155,  0.0145,  0.0135,
         0.0126,  0.0117,  0.0110,  0.0103,  0.0096,  0.0088,  0.0081,  0.0074,
         0.0067,  0.0060,  0.0053,  0.0046,  0.0039, 

In [26]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

if train_mode == 0:         # Speed Training
    learning_rate = 0.01 
    conv1d_strides = 12    
    conv1d_filters = 16
    hidden_units = 36
elif train_mode == 1:       # Accuracy Training (~10x longer than Speed Training)
    learning_rate = 0.01 
    conv1d_strides = 4
    conv1d_filters = 36
    hidden_units= 64
else:                       # Extended Training (~60x longer than Accuracy Training)
    learning_rate = 0.0005 
    conv1d_strides = 3
    conv1d_filters = 36
    hidden_units= 96

# # Create Sequential Model ###########################################
# clear_session()
# model = Sequential()
# model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same',input_shape=(input_size,1)))
# model.add(Conv1D(conv1d_filters, 12,strides=conv1d_strides, activation=None, padding='same'))
# model.add(LSTM(hidden_units))
# model.add(Dense(1, activation=None))
# model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=[error_to_signal])
# model.summary()

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.pad = nn.ConstantPad1d(padding=12, value=0)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=conv1d_filters, kernel_size=12, stride=conv1d_strides) # Padding needed
        self.conv2 = nn.Conv1d(in_channels=conv1d_filters, out_channels=conv1d_filters, kernel_size=12, stride=conv1d_strides) # Padding needed
        self.lstm = nn.LSTM(input_size=3, hidden_size=hidden_units, batch_first = True)
        self.linear = nn.Linear(in_features=hidden_units, out_features=1)

    def forward(self, x):
        x = self.pad(x)
        x = self.conv1(x)
        x = self.pad(x)
        x = self.conv2(x)
        output, (hidden, cell) = self.lstm(x)
        x = self.linear(cell)
        return x

model = NeuralNetwork().to(device)
print(model)

Using mps device
NeuralNetwork(
  (pad): ConstantPad1d(padding=(12, 12), value=0)
  (conv1): Conv1d(1, 16, kernel_size=(12,), stride=(12,))
  (conv2): Conv1d(16, 16, kernel_size=(12,), stride=(12,))
  (lstm): LSTM(3, 36, batch_first=True)
  (linear): Linear(in_features=36, out_features=1, bias=True)
)


In [27]:
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [28]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        X = X.unsqueeze(1)
        y = y.unsqueeze(0)
        y = y.unsqueeze(2)
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [29]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            X = X.unsqueeze(1)
            y = y.unsqueeze(0)
            y = y.unsqueeze(2)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [30]:
epochs = 1 

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(training_dataloader, model, loss_fn, optimizer)
    test(testing_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.055807  [ 4096/6587758]
loss: 0.001651  [413696/6587758]
loss: 0.000495  [823296/6587758]
loss: 0.000321  [1232896/6587758]
loss: 0.000239  [1642496/6587758]
loss: 0.000186  [2052096/6587758]
loss: 0.000181  [2461696/6587758]
loss: 0.000162  [2871296/6587758]
loss: 0.000153  [3280896/6587758]
loss: 0.000141  [3690496/6587758]
loss: 0.000129  [4100096/6587758]
loss: 0.000118  [4509696/6587758]
loss: 0.000114  [4919296/6587758]
loss: 0.000119  [5328896/6587758]
loss: 0.000107  [5738496/6587758]
loss: 0.000104  [6148096/6587758]
loss: 0.000113  [6557696/6587758]
Test Error: 
 Accuracy: 2.5%, Avg loss: 0.000096 

Done!


In [31]:
torch.save(model.state_dict(), "models/"+name+"/model.pth")
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth


In [32]:
model = NeuralNetwork().to(device)
model.load_state_dict(torch.load("models/"+name+"/model.pth"))

<All keys matched successfully>

In [33]:
result = torch.zeros(0).to(device)
model.eval()
with torch.no_grad():
    for X, _ in testing_dataloader:
        X = X.to(device)
        X = X.unsqueeze(1)
        pred = model(X)
        result = torch.cat((result, pred.flatten()), 0)
save_wav('models/'+name+'/result.wav', result.cpu().numpy())
save_wav('models/'+name+'/input.wav', X_testing.cpu().numpy())