# Queue

In [1]:
import os
import sys

sys.path.append("../")

In [2]:
import numpy as np
from sklearn.datasets import make_blobs, make_moons, make_circles
import seaborn as sns
from matplotlib import pyplot as plt
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Function
from torchviz import make_dot

from src.ml.sinkhorn import pot_sinkhorn, SinkhornValue

In [3]:
sns.set(style="whitegrid")

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset generation

In [5]:
in_features = 512
out_features = 256
n_clusters = 10
batch_size = 128
n_samples = 2000

In [6]:
class ToyDataset(Dataset):
    def __init__(self, n_features, n_clusters, n_samples):
        self.n_features = n_features
        self.n_clusters = n_clusters
        self.n_samples = n_samples
    
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y_true[idx]
    
    def plot(self):
        fig, ax = plt.subplots(
            nrows=1,
            ncols=1,
            figsize=(5, 5)
        )

        sns.scatterplot(  # plot first 2 components
            x=self.X[:, 0],
            y=self.X[:, 1],
            hue=map(str, self.y_true),
            ax=ax,
            legend=False
        )

        ax.set_xlabel("Component 1")
        ax.set_ylabel("Component 2")
        ax.set_title("Clusters visualization")

        return fig


class BlobsDataset(ToyDataset):
    """
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html
    """
    def __init__(self, n_features, n_clusters, n_samples):
        super().__init__(n_features, n_clusters, n_samples)
        
        X, y_true = make_blobs(
            n_samples=n_samples,
            n_features=n_features,
            centers=n_clusters,
            cluster_std=.8,
            random_state=0
        )
        
        self.X = torch.FloatTensor(X)
        self.y_true = torch.LongTensor(y_true)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
dataset = BlobsDataset(in_features, n_clusters, n_samples)

train_set, val_set = torch.utils.data.random_split(dataset, [1500, 500])

In [26]:
dataloader = DataLoader(train_set, batch_size=batch_size, drop_last=True)
dataloader_val = DataLoader(val_set, batch_size=1000, drop_last=False)

## Model definition

In [27]:
class Model(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
        )
    
    def forward(self, inputs, return_features=False):
        mlp_out = self.mlp(inputs)
        softmax_out = nn.LogSoftmax(dim=1)(mlp_out)
        
        if return_features:
            return mlp_out
        else:
            return softmax_out

## Queue integration

In [34]:
# Define model
model = Model(
    input_dim=in_features,
    output_dim=out_features
)
assert model.mlp[0].in_features == in_features
assert model.mlp[-1].out_features == out_features


# Define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)


# Define loss
SV = SinkhornValue(
    epsilon=.2,
    solver=pot_sinkhorn,
    max_n_batches_in_queue=2,
    method="sinkhorn_log",
    numItermax=100,
    warn=True
)
assert SV.solver == pot_sinkhorn
assert SV.solver_options.get("method") == "sinkhorn_log"
assert SV.solver_options.get("numItermax") == 100
assert SV.solver_options.get("warn") == True
assert isinstance(SV.stored_M, torch.Tensor)
assert SV.stored_M.shape[0] == 0


# Input data
X, y = next(iter(dataloader))
X_norm = nn.functional.normalize(X)
assert X_norm.shape == (batch_size, in_features)
assert round(torch.norm(X_norm[1]).item(), ndigits=4) == 1.
assert y.shape[0] == batch_size


# Forward pass
optimizer.zero_grad()
M = model(X_norm)
assert M.shape == (batch_size, out_features)
assert all(torch.round(torch.exp(M).sum(axis=1)) == 1.)
assert torch.exp(M).min() >= 0
assert torch.exp(M).max() <= 1

In [35]:
SV = SinkhornValue(
    epsilon=0.1,
    solver=pot_sinkhorn,
    max_n_batches_in_queue=0,
    stopThr=1e-02,
    method="sinkhorn_log",
    # numItermax=400
)

for epoch in range(10):
    epoch_loss = 0
    optimizer.zero_grad()
    
    for batch_ix, (inputs, labels) in enumerate(dataloader):
        if inputs.shape[0] != batch_size:
            continue
            
        outputs = model(inputs)
        loss = SV(-outputs)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    print("Epoch {0}: {1}".format(epoch+1, epoch_loss / (batch_ix+1)))

Epoch 1: 4.420969356190074
Epoch 2: 3.980233582583341
Epoch 3: 3.721419854597612
Epoch 4: 3.563913280313665
Epoch 5: 3.4498187628659336
Epoch 6: 3.3855877139351587
Epoch 7: 3.3369964469562876
Epoch 8: 3.303353959863836
Epoch 9: 3.2776683243838223
Epoch 10: 3.2549408132379707


## Classifier

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [37]:
for batch_ix, (inputs, labels) in enumerate(dataloader_val):
    break

In [38]:
X = model(inputs, True).data.numpy()
y = labels.data.numpy()

clf = LogisticRegression()
clf.fit(X, y)

y_pred = clf.predict(X)

In [39]:
accuracy_score(y, y_pred)

1.0