In [5]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl

In [171]:
class LitAutoEncoder(pl.LightningModule):
    """Small convolutional classifier over single-channel 36x5 feature maps.

    Despite its name the module has no decoder: ``self.encoder`` maps an input
    of shape ``(batch, 1, 36, 5)`` to 10 softmax probabilities, and training
    minimises the negative log-likelihood of the target class.

    NOTE(review): the loss assumes each label is an integer class index in
    ``[0, 10)`` (possibly stored as a float) -- confirm against the label CSV.
    """

    def __init__(self):
        super().__init__()
        # Shape walk-through for a (batch, 1, 36, 5) input:
        #   Conv2d(1, 64, 2, 1)   -> (batch, 64, 35, 4)
        #   Conv2d(64, 128, 2, 1) -> (batch, 128, 34, 3)
        #   Flatten               -> (batch, 128 * 34 * 3) == (batch, 13056)
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 64, 2, 1),
            nn.Conv2d(64, 128, 2, 1),
            nn.Dropout(0.2),
            nn.Flatten(),
            nn.ReLU(),
            nn.Linear(13056, 10),
            nn.Softmax(1),
        )

    def forward(self, x):
        """Return the highest class probability for every sample in ``x``.

        Args:
            x: tensor of shape ``(batch, 1, 36, 5)``.

        Returns:
            Tensor of shape ``(batch,)`` holding the maximum softmax value.
        """
        probs = self.encoder(x)
        return torch.max(probs, dim=1)[0]

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    @staticmethod
    def _unpack_batch(batch):
        """Split a batch into ``(features, labels)``.

        Accepts either a dict with ``'features'`` and ``'labels'`` keys or an
        ``(x, y)`` pair.
        """
        if isinstance(batch, dict):
            return batch['features'], batch['labels']
        x, y = batch
        return x, y

    def _classification_loss(self, batch):
        """Compute the negative log-likelihood of the labels for one batch."""
        x, y = self._unpack_batch(batch)
        if x.dim() == 3:
            # (batch, H, W) -> (batch, 1, H, W): Conv2d needs a channel axis.
            x = x.unsqueeze(1)
        probs = self.encoder(x)
        # The encoder already ends in Softmax, so take the log here and use
        # nll_loss; the clamp keeps log() finite when a probability hits 0.
        log_probs = torch.log(probs.clamp_min(1e-8))
        return F.nll_loss(log_probs, y.long().reshape(-1))

    def training_step(self, train_batch, batch_idx):
        """Log and return the classification loss for one training batch."""
        loss = self._classification_loss(train_batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log and return the classification loss for one validation batch."""
        loss = self._classification_loss(val_batch)
        self.log('val_loss', loss)
        return loss

In [172]:
# Build a new LitAutoEncoder instance.
model = LitAutoEncoder()

In [173]:
# Display the module repr to inspect the layer stack.
model

LitAutoEncoder(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(2, 2), stride=(1, 1))
    (1): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1))
    (2): Dropout(p=0.2, inplace=False)
    (3): Flatten(start_dim=1, end_dim=-1)
    (4): ReLU()
    (5): Linear(in_features=13056, out_features=10, bias=True)
    (6): Softmax(dim=1)
  )
)

In [184]:
# Switch to evaluation mode (turns Dropout off) and run a single sample through
# the model; the features are reshaped to (1, 1, 36, 5) for the Conv2d stack.
# NOTE(review): `data` is only assigned in later cells of this notebook, so this
# cell depends on out-of-order execution -- confirm which dataset it should index.
model.eval()
model(data[0]["features"].reshape(1,1,36,5))

tensor([0.4037], grad_fn=<MaxBackward0>)

In [None]:
# Flattened feature count expected by nn.Linear(13056, 10) in LitAutoEncoder:
# 128 output channels x 34 x 3 positions remain after two 2x2 stride-1
# convolutions on a 36x5 input.
# NOTE(review): the original cell held the incomplete expression `13056*`
# (a SyntaxError); confirm this derivation is the calculation that was intended.
128 * 34 * 3

In [107]:
# Root directory of the dataset.
# NOTE(review): machine-specific absolute path; consider making it configurable.
BASE_PATH = "/media/aneesh/USB1000/Zurich_Urban_Sounds"
# Sub-directory of BASE_PATH used when locating the unlabeled segment list and features.
RECORDER = "TASCAM_RECORDER"
# NOTE(review): not referenced in the visible part of this notebook.
SEGMENT_DIR = "audio_segments"
import os
import numpy as np
import pandas as pd

In [29]:
class unlabeled_audio_segments(Dataset):
    """Dataset of pre-computed ``.npy`` feature arrays listed in a CSV file.

    The CSV must have a ``Non_silent_segments`` column of file names; every
    ``.wav`` in a name is replaced by ``.npy`` to obtain the feature file that
    is loaded from ``feature_dir``.
    """

    @staticmethod
    def _npy_name(segment_name):
        """Translate a segment file name into its feature file name."""
        return segment_name.replace(".wav", ".npy")

    def __init__(self, file_list_path, feature_dir):
        """Read the segment listing and derive the feature file names.

        Args:
            file_list_path: path of the CSV listing the segments.
            feature_dir: directory containing the ``.npy`` feature files.
        """
        self.file_list_path = file_list_path
        self.feature_dir = feature_dir

        listing = pd.read_csv(file_list_path)
        listing["features"] = listing["Non_silent_segments"].apply(self._npy_name)
        self.filenames = listing

    def __len__(self):
        """Number of rows in the segment listing."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load one feature array and return it as ``{'features': tensor}``.

        Size-1 axes of the stored array are squeezed away before conversion.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx
        feature_file = self.filenames["features"].iloc[position]
        array = np.load(os.path.join(self.feature_dir, feature_file))
        return {'features': torch.from_numpy(array.squeeze())}

In [102]:
class labeled_audio_segments(Dataset):
    """Dataset pairing per-row CSV feature vectors with per-row CSV labels.

    Row ``i`` of the features CSV is reshaped to ``feature_shape`` and paired
    with the first column of row ``i`` of the labels CSV.

    Args:
        features_path: CSV with one flattened feature vector per row.
        labels_path: CSV whose first column holds the label of each row.
        feature_shape: shape each feature row is reshaped to; the default
            ``(36, 5)`` matches the previously hard-coded layout.
    """

    def __init__(self, features_path, labels_path, feature_shape=(36, 5)):
        self.labels = pd.read_csv(labels_path)
        self.features = pd.read_csv(features_path)
        self.feature_shape = tuple(feature_shape)

    def __len__(self):
        """Number of labeled rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``{'features': float32 tensor, 'labels': float32 scalar tensor}``.

        Raises:
            ValueError: if the feature row cannot be reshaped to ``feature_shape``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # astype() always copies, so the tensor below owns writable memory.
        row = self.features.iloc[idx].values.astype(np.float32)
        label = self.labels.iloc[idx].values.astype(np.float32)[0]

        return {
            'features': torch.from_numpy(row.reshape(self.feature_shape)),
            'labels': torch.tensor(label),
        }
    

In [103]:
# Labeled training set: features and labels are CSV files stored directly
# under BASE_PATH (built from the shared constant instead of repeating the
# absolute path literal).
data = labeled_audio_segments(os.path.join(BASE_PATH, 'train_data.csv'),
                              os.path.join(BASE_PATH, 'train_labels.csv'))

In [104]:
# Inspect the first sample of `data`.
data[0]

{'features': tensor([[-4.1729e+02,  1.0015e+02, -4.3869e+01,  5.1953e+01,  9.0444e+00],
         [ 8.7802e+00,  1.0376e+01,  2.7314e+00,  6.1753e+00,  5.0651e+00],
         [-4.3707e+00,  1.4676e+01, -3.5300e+00,  8.0437e+00,  4.8215e+00],
         [ 2.6133e+00,  1.0506e+00,  7.7588e+00, -3.1388e+00,  4.1032e+00],
         [-5.6389e-01,  6.8306e+00, -8.0799e-01,  7.2286e+00, -7.9898e-01],
         [ 7.5396e-01, -4.1683e+00,  1.2598e+00, -6.3962e+00,  1.3227e+00],
         [-1.6741e+00,  2.7055e+00, -1.9224e+00,  1.1593e+00, -1.2142e+00],
         [ 1.2042e+00,  3.4982e-02,  8.0266e-03,  2.7767e-03,  3.1436e-03],
         [ 2.7485e-03,  1.3418e-03,  1.6905e-03,  1.4311e-03,  6.4932e-04],
         [ 1.0004e-03,  1.3613e-03,  6.8252e-04,  1.0383e-03,  1.9018e-03],
         [ 1.6610e-03,  9.6201e-04,  1.2086e-03,  3.0248e-03,  4.7647e-03],
         [ 2.7041e-03,  2.7041e-03,  2.0084e-03,  2.9568e-03,  4.8983e-03],
         [ 2.7465e-03,  6.9050e-04,  5.8380e-04,  7.7438e-04,  7.5575e-04],


In [63]:
# Wrap `data` in a DataLoader that yields batches of two samples.
temp = DataLoader(data, batch_size=2)

In [36]:
# Unlabeled segments for RECORDER: the CSV listing and the directory of .npy
# feature files both live under BASE_PATH/RECORDER.
# NOTE(review): this rebinds `data`, which other cells also use for the labeled
# dataset -- results depend on which cell ran last.
data = unlabeled_audio_segments(os.path.join(BASE_PATH, RECORDER, "non_silent_segment.csv"),
                        os.path.join(BASE_PATH, RECORDER, "features"))

Unnamed: 0.1,Unnamed: 0,Non_silent_segments,features
0,0,2_000000051.wav,2_000000051.npy
1,1,2_000000052.wav,2_000000052.npy
2,2,2_000000053.wav,2_000000053.npy
3,3,2_000000086.wav,2_000000086.npy
4,4,2_000000112.wav,2_000000112.npy
...,...,...,...
4607,4607,12_000003802.wav,12_000003802.npy
4608,4608,12_000003971.wav,12_000003971.npy
4609,4609,12_000003972.wav,12_000003972.npy
4610,4610,12_000003992.wav,12_000003992.npy
