# Contrastive Pre-Training
We use contrative pre-training as the basis of later supervised training. The idea is to project the basic features
first to a common space where they are invariant to some sensor specifics, and then use supervised training on top of it.

---
some imports

In [1]:
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import numpy as np
import torch_utils as tu
import utils as ut
import models
import simclr
import pickle

---
## Prepare the dataset

We simulate four sensors with different noise and sampling rate

In [2]:
# set the important parameters for dataset generation
batch_size = 512
timesteps_per_example_in_100Hz = 200
num_examples = batch_size * 20
jiggle_offsets = 20

sensor1 = ut.Sensor(40, 0.0, 0.2)
sensor2 = ut.Sensor(80, 0.1, 0.1)
sensor3 = ut.Sensor(80, 0.0, 0.3)
sensor4 = ut.Sensor(20, 0.0, 0.1)
sensors = [sensor1, sensor2, sensor3, sensor4]

---
They key to contrastive learning is to use augmentations for the data, we add three options + one we have directly
in the dataset

In [3]:
transform_options = [
    transforms.Compose([tu.AblateBlock(5,30), tu.ToTensor()]),
    transforms.Compose([tu.AddNoise((-0.1, 0.1), (0.0, 0.3)), tu.ToTensor()]),
    transforms.Compose([tu.RandomDownsample(), tu.ToTensor()])
]
trsfm = transforms.RandomChoice(transform_options)

---
## Generate the Dataset

In [25]:
dataset = tu.BadSensorsDataset(sensors,timesteps_per_example_in_100Hz, num_examples, jiggle_offsets=jiggle_offsets, transform=trsfm, return_two_transforms=True)

assert len(dataset) == num_examples * len(sensors)
assert len(dataset[0][0]) == timesteps_per_example_in_100Hz

data with only two columns is assumed to have no missing value field, just return the whole index
data with only two columns is assumed to have no missing value field, just return the whole index
data with only two columns is assumed to have no missing value field, just return the whole index
data with only two columns is assumed to have no missing value field, just return the whole index


This generation of the Dataset might take a bit of time, so if generated once just load it from a pickle

In [26]:
with open('data/dataset.pkl', 'wb') as file:
    pickle.dump(dataset, file)

In [4]:
with open('data/dataset.pkl', 'rb') as file:
    dataset = pickle.load(file)

assert len(dataset) == num_examples * len(sensors)
assert len(dataset[0]) == 2
assert len(dataset[0][0]) == timesteps_per_example_in_100Hz

---
Train, validation and test split

In [5]:
valid_size = 0.2
test_size = 0.1
num_workers = 2

num_train = len(dataset)
indices = list(range(num_train))
train_idx, valid_idx, test_idx = ut.random_splits(indices, test_size, valid_size)

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
test_sampeler = SubsetRandomSampler(test_idx)

# prepare data loaders (combine dataset and sampler)
train_loader = DataLoader(dataset, batch_size=batch_size,
    sampler=train_sampler, num_workers=num_workers)
valid_loader = DataLoader(dataset, batch_size=batch_size,
    sampler=valid_sampler, num_workers=num_workers)
test_loader = DataLoader(dataset, batch_size=batch_size,
    sampler=test_sampeler, num_workers=num_workers)


---
Check if a GPU is available

In [6]:
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is not available.  Training on CPU ...


---
## Train Loop

Define training loop

In [19]:
def train(n_epochs, model, projection, optimizer, criterion, train_loader, valid_loader):
    def cd(x): #cd = correct dimension
        x = x.float()
        x = x[:,:,1:].contiguous()
        #print(x.shape)
        #x = x.unsqueeze(dim=2)
        batch_size, sequence_length, input_dim = x.shape
        x = x.view(batch_size, input_dim, sequence_length)
        return x
    valid_loss_min = np.Inf # track change in validation loss

    for epoch in range(1, n_epochs+1):
        # keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for i_batch, (sk1, sk2) in enumerate(train_loader):
            if sk1.shape[0] != 512:
                continue # if not full batch, just continue
            sk1, sk2 = cd(sk1), cd(sk2)
            #print('data', sk1.shape, sk2.shape)
            #print('val example', sk1[5,0,:10])
            # move tensors to GPU if CUDA is available
            if train_on_gpu:
                sk1, sk2 = sk1.cuda(), sk2.cuda()
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute feature embeddings from model,
            # and add non-linear projection for loss
            h1, h2 = model(sk1), model(sk2)
            #print('model', h1.shape, h2.shape)
            #print('mod-val example', h1[5,:])
            z1, z2 = projection(h1), projection(h2)
            #print('projection', z1.shape, z2.shape)
            #print('pr-val example', z1[5,:])
            # calculate the batch loss
            loss = criterion(z1, z2)
            #print('loss: ', loss)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item()*sk1.size(0)

        ######################
        # validate the model #
        ######################
        model.eval()
        for i_batch, (sk1, sk2) in enumerate(valid_loader):
            if sk1.shape[0] != 512:
                continue # if not full batch, just continue
            sk1, sk2 = cd(sk1), cd(sk2)
            # move tensors to GPU if CUDA is available
            if train_on_gpu:
                sk1, sk2 = sk1.cuda(), sk2.cuda()
            # forward pass: compute feature embeddings from model,
            # and add non-linear projection for loss
            h1, h2 = model(sk1), model(sk2)
            z1, z2 = projection(h1), projection(h2)
            # calculate the batch loss
            loss = criterion(z1, z2)
            # update average validation loss
            valid_loss += loss.item()*sk1.size(0)

        # calculate average losses
        train_loss = train_loss/len(train_loader.sampler)
        valid_loss = valid_loss/len(valid_loader.sampler)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), 'data/model_embedding.pt')
            torch.save(projection.state_dict(), 'data/model_projection.pt')
            valid_loss_min = valid_loss

In [8]:
model = models.InceptionModel(num_blocks=3, in_channels=2, out_channels=16,
                           bottleneck_channels=2, kernel_sizes=41, use_residuals=True,
                           num_pred_classes=128).float()

projection = models.Projection([128,56,32,2]).float()

In [20]:
model = models.SimpleEncoder()

projection = models.Projection([44,22,10,2]).float()

In [21]:
model

SimpleEncoder(
  (encoding): Conv1d(2, 2, kernel_size=(10,), stride=(2,))
  (layers): Sequential(
    (0): Conv1d(2, 2, kernel_size=(10,), stride=(2,))
    (1): ReLU()
    (2): Conv1d(2, 1, kernel_size=(10,), stride=(2,))
    (3): ReLU()
  )
)

In [22]:
projection

Projection(
  (layers): Sequential(
    (0): Linear(in_features=44, out_features=22, bias=False)
    (1): ReLU()
    (2): Linear(in_features=22, out_features=10, bias=False)
    (3): ReLU()
    (4): Linear(in_features=10, out_features=2, bias=False)
  )
)

In [24]:
trails = 10

for t in range(trails):
    temperature = np.random.uniform(0.01, 1.0)#0.1
    lr = np.random.uniform(0.00001, 0.0001)
    #lr=0.00001
    model = models.SimpleEncoder()
    projection = models.Projection([44,22,10,2]).float()
    print(f"training with temp: {temperature} and lr: {lr}")
    criterion = simclr.NT_Xent(batch_size, temperature, device)
    optimizer = torch.optim.Adam(list(model.parameters()) + list(projection.parameters()), lr=lr)

    n_epochs = 3

    train(n_epochs, model, projection, optimizer, criterion, train_loader, valid_loader)

training with temp: 0.43905823466525024 and lr: 9.523780331283934e-05
Epoch: 1 	Training Loss: 6.519601 	Validation Loss: 6.285295
Validation loss decreased (inf --> 6.285295).  Saving model ...
Epoch: 2 	Training Loss: 6.305074 	Validation Loss: 6.102453
Validation loss decreased (6.285295 --> 6.102453).  Saving model ...
Epoch: 3 	Training Loss: 6.137534 	Validation Loss: 5.965303
Validation loss decreased (6.102453 --> 5.965303).  Saving model ...
training with temp: 0.8093627637294365 and lr: 1.0948920358169622e-05
Epoch: 1 	Training Loss: 6.577522 	Validation Loss: 6.391799
Validation loss decreased (inf --> 6.391799).  Saving model ...
Epoch: 2 	Training Loss: 6.502877 	Validation Loss: 6.378729
Validation loss decreased (6.391799 --> 6.378729).  Saving model ...
Epoch: 3 	Training Loss: 6.484091 	Validation Loss: 6.371346
Validation loss decreased (6.378729 --> 6.371346).  Saving model ...
training with temp: 0.667841233652229 and lr: 1.4890628566334744e-05
Epoch: 1 	Training Lo

In [None]:
temperature = 0.1
lr = 0.001

criterion = simclr.NT_Xent(batch_size, temperature, device)
optimizer = torch.optim.Adam(list(model.parameters()) + list(projection.parameters()), lr=lr)

In [None]:
n_epochs = 3

train(n_epochs, model, projection, optimizer, criterion, train_loader, valid_loader)


