In [1]:
import sys

%matplotlib inline
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from glob import glob
import numpy as np

from plot_mel import * 

from tqdm import tqdm

from timeit import default_timer as timer

In [2]:
device = "cpu"
DEBUGGING = True #If True it limits the number of data you import in the load_dataset function

Data Loading
-------------------


In [None]:
# This function will take care of the dataloading
def load_dataset(folder,normalize,total_number_to_import=1965):
    dataset = []
    scaler = MinMaxScaler()
    
    count = 0
    for mel_spec in folder:
        data = np.load(mel_spec)
        if normalize is True:
            data = scaler.fit_transform(data)
        dataset.append(torch.tensor(data))
        count += 1
        
        if DEBUGGING:
            if count == total_number_to_import: 
                print(f"NE STAI USANDO SOLO {count}")
                break
            
    return dataset



### Use this to import MELS and convert them to tensors

Otherwise just load the tensors


In [None]:
# Enter the path where you keep your spectrograms
folder = glob("/Users/diego/Documents/Università/LCPb/Suweis_project/PheezeekzSuperteam/Dataset_Diego/spa_filt_mel_spectrograms/*")

data = load_dataset(folder,normalize=True)

# Concatenates a sequence of tensors along a new dimension.
# All tensors need to be of the same size.
filtered_data = torch.stack(data,dim=0)

Just load the tensors if you already have them.

In [6]:
spa = torch.load("../Dataset/Unfiltered tensors/spa_unfiltered.pt")
jap = torch.load("../Dataset/Unfiltered tensors/jap_unfiltered.pt")
ita = torch.load("../Dataset/Unfiltered tensors/ita_unfiltered.pt")


Autoencoder Definition
-----------------------
We use a **Convolutional Encoder - Decoder**, which generally gives better performance than fully connected versions that have the same number of parameters.

In convolution layers, we increase the channels as we approach the bottleneck, but note that the total number of features still decreases, since the channels increase by a factor of 2 in each convolution, but the spatial size decreases by a factor of 4.

Kernel size 4 is used to avoid biasing problems described here: https://distill.pub/2016/deconv-checkerboard/

In [7]:
class Autoencoder(nn.Module):
    def __init__(self,encoded_space_dim):
        super(Autoencoder, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding='same'),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            
            nn.MaxPool2d(kernel_size=(4, 4),stride=2, padding=1),
            
            nn.Conv2d(in_channels=32,out_channels= 64, kernel_size=3, stride=1, padding='same'),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            
            nn.MaxPool2d(kernel_size=(4, 4),stride=2, padding=1),
            
            nn.Conv2d(in_channels=64,out_channels= 4, kernel_size=3, stride=1, padding='same'),
            nn.BatchNorm2d(4),
            nn.ReLU(),
          
        )


         ### Flatten layer
        self.flatten = nn.Flatten(start_dim=1) 

        ### Linear section
        ## This is the actual ENCODED / LATENT SPACE that will be extracted and studied
        self.encoder_lin = nn.Sequential(
            nn.Linear(4*32*54, encoded_space_dim), #input dimension is Number of out_channels from last CNN layer, rowXcolumn.
                                                    #if you change some CNN parameter rowXcolumn wil change too
            nn.ReLU(True),
            
        )
        #Decoding part
        #This is the reverse step-by-step of what comes before
        self.decoder_lin = nn.Sequential(
            nn.Linear(encoded_space_dim, 4*32*54), #out put dim is the same of input dim of encoder_lin. If smth changes, you should change this and the following too
            nn.ReLU(True),
            
        )

        self.unflatten = nn.Unflatten(dim=1,
        unflattened_size=(4, 32, 54)) #depends on output of preceeding linear layer







        # Decoder
        ## If some of the preceeding parameres are changed, you need to tune scale_factor accordingly
        #BatchNorm2d takes out_channel as input
        self.decoder = nn.Sequential(
           
            nn.ConvTranspose2d(in_channels=4, out_channels=64, kernel_size=3, stride=1, padding=1, output_padding=0),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        
            nn.Upsample(scale_factor=2, mode='bilinear'),
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, output_padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
           
            nn.Upsample(scale_factor=2, mode='bilinear'),
            nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=3, stride=1,padding=1, output_padding=0),
            nn.BatchNorm2d(1),
            nn.ReLU())
      
            
            
            
    def forward(self, x):
        #dataset requires to convert to double before doing anything. This line can be removed
        x = x.double()
        
        x = self.encoder(x)

        x = self.flatten(x)

        latent_space = self.encoder_lin(x)
        
        x = self.decoder_lin(latent_space)
    
        x = self.unflatten(x)
       
        x = self.decoder(x)

        return x 

# Create an instance of the autoencoder
# Here the encoded space dimension can be tuned 
autoencoder = Autoencoder(encoded_space_dim=3000).double()

# Print the architecture
print(autoencoder)


Autoencoder(
  (encoder): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(4, 4), stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(4, 4), stride=2, padding=1, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 4, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (9): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (encoder_lin): Sequential(
    (0): Linear(in_features=6912, out_features=3000, bias=True)
    (1): ReLU(inplace=True)
  )
  (decoder_lin): Sequential(
    (0): Linear(in_features=3000, out_features=6

### Save / Load pre-trained model

In [8]:
#torch.save(autoencoder.state_dict(),"../saved_models/spa_pretrain_linear.pt")
autoencoder.load_state_dict(torch.load("../Final models state dictionaries/Autoencoder model states/spa_pretrain.pt"))

<All keys matched successfully>

### Gotta encode 'em all!

Labels:

* 1 : spanish
* 0 : non spanish

In [9]:
n_encoded_spanish = 1000
n_encoded_foreign = 2000

In [10]:
#Tune the numbers accordin
labels = torch.tensor([1 for _ in range(n_encoded_spanish)] + [0 for _ in  range(n_encoded_foreign)] )

#### Let's unify the dataset

In [12]:
full_dataset = torch.cat([spa[:n_encoded_spanish],jap[:int(n_encoded_foreign/2)],ita[:int(n_encoded_foreign/2)]])
full_dataset.shape

torch.Size([3000, 128, 216])

In [13]:
def encode_me(x):
    
        x = autoencoder.encoder(x)
        x = autoencoder.flatten(x)
        x = autoencoder.encoder_lin(x)
        
        return x

In [14]:
encoded_space = torch.tensor([])

#it's necessary to add the channel information AND the batch size for the encoder
#that's why there are two unsqueeze()
encoded_data = map(encode_me,full_dataset.unsqueeze(1).unsqueeze(1).double())

#encode it!
for spectrum in tqdm(encoded_data):
    tensor = torch.tensor(spectrum)
    encoded_space = torch.cat([encoded_space,tensor])

  tensor = torch.tensor(spectrum)
71it [00:03, 19.09it/s]


KeyboardInterrupt: 

In [15]:
encoded_space.shape

torch.Size([71, 3000])

In [None]:
torch.save(encoded_space,r"C:\Users\bonat\Documents\UNI\Physics of data\LCP-b project\Autoencoder flex\encoded_space.pt")

In [None]:
encoded_space = torch.load(r"C:\Users\bonat\Documents\UNI\Physics of data\LCP-b project\Autoencoder flex\encoded_space.pt")

In [None]:
encoded_space.shape

Now we have a nice dataset of foreign + native language, all of which have been encoded by the CNN. 