In [1]:
import pandas as pd
import tensorflow as tf
import librosa
from sklearn.model_selection import train_test_split
import numpy as np



In [2]:
dataset_df = pd.read_csv('dataset.csv').to_numpy()
sounds_dataset = dataset_df[:, 0]
sr = 0
for i in range(len(sounds_dataset)):
    sounds_dataset[i], sr = librosa.load(f'Noisy-sounds/{sounds_dataset[i]}.wav')

label_dataset = dataset_df[:, 1]
for i in range(len(label_dataset)):
    label_dataset[i], sr = librosa.load(f'Clean-sounds/{label_dataset[i]}.wav')

X_train, X_test, y_train, y_test = train_test_split(sounds_dataset, label_dataset, test_size=0.1, random_state=42)



In [3]:
max_len = 0
for x in X_train:
    if len(x) > max_len:
        max_len = len(x)

for x in X_test:
    if len(x) > max_len:
        max_len = len(x)

for x in y_train:
    if len(x) > max_len:
        max_len = len(x)

for x in y_test:
    if len(x) > max_len:
        max_len = len(x)


def pad_vector(vec):
    for i in range(len(vec)):
        vec[i] = np.pad(vec[i], (0, max_len - len(vec[i])))
        # vec[i] = np.asarray(vec[i]).astype(np.float32)
    return vec


X_train = pad_vector(X_train)
X_test = pad_vector(X_test)
y_train = pad_vector(y_train)
y_test = pad_vector(y_test)

X_train_good = np.ones((len(X_train), max_len))
X_test_good = np.ones((len(X_test), max_len))
y_train_good = np.ones((len(y_train), max_len))
y_test_good = np.ones((len(y_test), max_len))

for i in range(len(X_train_good)):
    X_train_good[i] = X_train[i]
for i in range(len(X_test_good)):
    X_test_good[i] = X_test[i]
for i in range(len(y_train_good)):
    y_train_good[i] = y_train[i]
for i in range(len(y_test_good)):
    y_test_good[i] = y_test[i]



In [56]:
print(X_train)

[array([3.6621094e-04, 4.5776367e-04, 9.1552734e-05, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00], dtype=float32)
 array([0.00134277, 0.00146484, 0.00216675, ..., 0.        , 0.        ,
        0.        ], dtype=float32)
 array([0.00268555, 0.00326538, 0.003479  , ..., 0.        , 0.        ,
        0.        ], dtype=float32)
 ... array([-0.00457764, -0.00479126, -0.00375366, ...,  0.        ,
             0.        ,  0.        ], dtype=float32)
 array([-0.01177979, -0.0133667 , -0.01184082, ...,  0.        ,
         0.        ,  0.        ], dtype=float32)
 array([0.00762939, 0.00839233, 0.00579834, ..., 0.        , 0.        ,
        0.        ], dtype=float32)                                     ]


In [40]:
import torch
import torch.nn as nn

input_dim = max_len  # Dimensionality of input data (e.g., length of audio signal)
latent_dim = 128  # Dimensionality of the latent space

class DenoisingAutoencoder(nn.Module):
    def __init__(self, latent_dim):
        super(DenoisingAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize the model
model = DenoisingAutoencoder(latent_dim)

# Convert to CUDA if available
if torch.cuda.is_available():
    model.cuda()

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [5]:
# Define the hyperparameters
epochs = 5
batch_size = 32

# Define the model
autoencoder = DenoisingAutoencoder(latent_dim)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mse')  # Using Mean Squared Error loss for audio reconstruction

# Train the model
autoencoder.fit(X_train_good, y_train_good, epochs=epochs, batch_size=batch_size, validation_data=(X_test_good, y_test_good))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [51]:
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import sklearn.metrics as metrics

class DatasetLoader(Dataset):
    def __init__(self) -> None:
        super().__init__()

    def __len__(self):
        return len(X_train_good)

    def __getitem__(self, index):
        # index 0 ar fi header ul
        # signal, sr = sf.read(self.df['ActorID'].to_numpy()[index])  ### Load the waveform using librosa and sample rate = 16000
        # label = (self.df['Gender'].to_numpy()[index], self.df['Age'].to_numpy()[index]) ### Load the label from DataFrame

        # stft = librosa.stft(signal)
        # stft = np.abs(stft) ** 2
        # stft = stft / np.max((np.abs(stft)))
        # stft = 20 * np.log10(stft)
        # stft = np.clip(stft, -80, 0)

        # stft = np.expand_dims(stft, axis=0).astype(np.float32)
        ### STFT for the signal
        ### normlize, clip (-80, 0).
        ### STFT shape !
        return X_train_good[index].astype(np.float32), y_train_good[index].astype(np.float32)


def train():
    BATCH_SIZE = 16
    EPOCHS = 15
    LR = 1e-4
    BAR_FORMAT = '{l_bar}{bar:10}{r_bar}{bar:-10b}'
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"model running on device: {DEVICE}\n")
    train = DatasetLoader()

    train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)

    model = DenoisingAutoencoder(latent_dim)
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)  ### Optimizer

    error = 0
    for i in range(EPOCHS):
        print(f"+++++++++++++RUNNING EPOCH: {i} ++++++++++++++++")

        ct = 0
        print("=============TRAINING===============")

        model.train().to(device=DEVICE)
        for x, y in tqdm(train_loader, bar_format=BAR_FORMAT):
            pred_sound = model(x)
            loss = loss_fn(pred_sound, y.float())
            print(loss, pred_sound, y.float(), torch.max(y.float()), torch.max(pred_sound))
            break
            error += loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ct += 1

        error /= ct
        print(f"Error: {error}")
        print(" ")

        # print("=============TESTING===============")

        # preds = np.array([])
        # labels = np.array([])
        # mae = 0
        # model.eval().to(device=DEVICE)
        # for x, y in tqdm(test_loader, bar_format=BAR_FORMAT):
        #     with torch.no_grad():
        #         pred_gender, pred_age = model(x)
        #         pred_age = torch.squeeze(pred_age)

        #         predicted = torch.argmax(pred_gender, -1)
        #         predicted = predicted.detach().cpu()
        #         y[0] = y[0].detach().cpu().numpy()
        #         mae = (pred_age - y[1].float()).abs().mean()

        #         preds = np.concatenate((preds, predicted), axis=0)
        #         labels = np.concatenate((labels, y[0]), axis=0)

        # acc = (preds == labels).sum() / len(labels)
        # f1 = metrics.f1_score(preds, labels)
        # print(f"mae: {mae:.3f}")
        # print(f"accuracy test: {acc:.3f}")
        # print(f"f1 score test: {f1}")
        # print(" ")

        # if acc > optim:
        #     optim = acc
        #     saved_epoch = i
        #     print(f"model with {optim} accuracy saved at epoch {i}")
        #     torch.save(model.state_dict(), "saved_model.pt")

    #print(f"model with {optim} accuracy saved at epoch {saved_epoch}")

In [52]:
train()

model running on device: cpu

+++++++++++++RUNNING EPOCH: 0 ++++++++++++++++


  0%|          | 0/355 [00:00<?, ?it/s]

tensor(0.0028, grad_fn=<MseLossBackward0>) tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0156, 0.0408, 0.0182],
        [0.0000, 0.0000, 0.0000,  ..., 0.0192, 0.0426, 0.0178],
        [0.0000, 0.0000, 0.0000,  ..., 0.0183, 0.0401, 0.0179],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0161, 0.0393, 0.0202],
        [0.0000, 0.0000, 0.0000,  ..., 0.0187, 0.0425, 0.0184],
        [0.0000, 0.0000, 0.0000,  ..., 0.0172, 0.0401, 0.0190]],
       grad_fn=<ReluBackward0>) tensor([[-0.0079, -0.0088, -0.0077,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0030,  0.0031,  0.0017,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0021, -0.0028, -0.0027,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0055,  0.0063,  0.0055,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0077,  0.0088,  0.0073,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0021, -0.0011, -0.0005,  ...,  0.0000,  0.0000,  0.0000]]) tensor(1.0351) tensor(0.1583, grad_fn=<MaxBackward1>)





ZeroDivisionError: division by zero

In [16]:
import soundfile as sf
test = np.expand_dims(test, axis=0)
print(test.shape)
prediciton = autoencoder.predict(test)

(1, 110361)


LibsndfileError: Error opening 'test_denoised.wav': Format not recognised.

In [19]:
print(prediciton.shape)
sf.write('test_denoised.wav', prediciton[0], sr, subtype='PCM_24')

(1, 110361)
