# Unifying distillation and privileged information - Synthetic experiments 1 and 3
This notebook reproduces synthetic experiments 1 and 3 from [Unifying Distillation and Privileged Information by Lopez-Paz et al.](https://arxiv.org/abs/1511.03643).

This notebook demonstrates: 1) that the teacher can be replaced with a Cumulative Distribution Function (CDF) transform of the privileged information $z$, and 2) that the benefit of Generalised Distillation is limited to small training sample sizes.

In [1]:
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from scipy.stats import norm

## Synthetic environments

**Experiment 1: Clean labels as privileged information.** We sample triplets $(x_i, x^\star_i, y_i)$ from:

\begin{align*}
  x_i       &\sim \mathcal{N}(0,I_d)\\
  x^\star_i &\leftarrow \langle \alpha, x_i \rangle\\
  \varepsilon_i &\sim \mathcal{N}(0,1)\\
  y_i       &\leftarrow \mathbb{I}((x^\star_i + \varepsilon_i) > 0).
\end{align*}

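**Experiment 3: Relevant features as privileged information.** We sample triplets $(x_i, x^\star_i, y_i)$ from:
\begin{align*}
  x_i       &\sim \mathcal{N}(0, I_d)\\
  x^\star_i &\leftarrow x_{i, J}\\
  y_i       &\leftarrow \mathbb{I}(\langle \alpha, x^\star_i \rangle > 0),
\end{align*}
where $J$ indexes the relevant features (in this notebook, the first three coordinates of $x_i$, with $\alpha$ truncated to the same coordinates).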

For a more detailed explanation of the synthetic environments, we refer to the original paper.

In [2]:
# experiment 1: noiseless labels as privileged info
def synthetic_01(a,n):
    """Draw ``n`` samples for experiment 1 (clean signal as privileged info).

    Parameters
    ----------
    a : numpy.ndarray
        1-D weight vector; its size sets the number of regular features.
    n : int
        Number of samples to draw.

    Returns
    -------
    tuple
        ``(xs, x, y)`` where ``x`` is the ``(n, a.size)`` matrix of standard
        normal features, ``xs`` is the ``(n, 1)`` noiseless projection of
        ``x`` onto ``a`` and ``y`` is the ``(n,)`` boolean label
        ``(xs + noise) > 0``.
    """
    features = np.random.randn(n, a.size)
    # The noise is drawn right after the features so the global NumPy RNG
    # stream is consumed in a fixed order: features first, noise second.
    noise = np.expand_dims(np.random.randn(n), 1)
    clean_signal = np.expand_dims(features.dot(a), 1)
    labels = (clean_signal + noise > 0).ravel()
    return (clean_signal, features, labels)

# experiment 3: relevant inputs as privileged info
def synthetic_03(a,n,n_priv=3):
    """Draw ``n`` samples for experiment 3 (relevant features as privileged info).

    The label depends only on the first ``n_priv`` coordinates of ``x``; those
    coordinates are handed out as the privileged representation.

    Parameters
    ----------
    a : numpy.ndarray
        1-D weight vector; its size sets the number of regular features.
        Only the first ``n_priv`` entries are used to build the label.
    n : int
        Number of samples to draw.
    n_priv : int, optional
        Number of leading features that are relevant/privileged. Defaults to
        3, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(xs, x, y)`` where ``x`` is the ``(n, a.size)`` matrix of standard
        normal features, ``xs`` is an independent copy of its first
        ``n_priv`` columns and ``y`` is the ``(n,)`` boolean label
        ``<a[:n_priv], xs> > 0``.
    """
    x  = np.random.randn(n,a.size)
    # Copy only the relevant columns so that callers mutating ``xs`` can never
    # touch ``x`` (the previous code copied the whole matrix first).
    xs = x[:, :n_priv].copy()
    y  = (np.dot(xs, a[:n_priv]) > 0).ravel()
    return (xs,x,y)

## Auxiliary functions

In [3]:
def fitModel(model,optimizer,criterion,epochs,x,target, linear=False):
    """Train ``model`` with ``epochs`` full-batch optimisation steps.

    Parameters
    ----------
    model : callable
        Called as ``model(x)``. Unless ``linear`` is true it must return a
        pair whose first element is the prediction fed to ``criterion``.
    optimizer : object
        Optimiser exposing ``zero_grad()`` and ``step()`` for the model's
        parameters.
    criterion : callable
        Loss called as ``criterion(prediction, target)``.
    epochs : int
        Number of gradient steps; every step uses the whole of ``x``.
    x, target :
        Inputs and regression/classification targets passed to the model and
        the loss. They are expected not to carry an autograd graph of their
        own, because each step's graph is released after ``backward()``.
    linear : bool, optional
        Set to True when ``model(x)`` returns the prediction directly instead
        of a pair.

    Returns
    -------
    The same ``model`` object, updated in place.
    """
    for _ in range(epochs):
        # Forward pass: compute the prediction for the full batch.
        if linear:
            y_pred = model(x)
        else:
            y_pred, _ = model(x)
        loss = criterion(y_pred, target)
        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        # A fresh graph is built on every iteration, so it does not need to
        # be retained after the backward pass.
        loss.backward()
        optimizer.step()
    return model

In [4]:
def softmax(w, t = 1.0):
    """Row-wise softmax of ``w`` at temperature ``t``.

    Parameters
    ----------
    w : array-like, shape (n_rows, n_classes)
        Scores (logits); each row is normalised independently.
    t : float, optional
        Temperature the scores are divided by before exponentiation.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``w`` whose rows sum to one.
    """
    scaled = np.asarray(w) / t
    # Subtracting the row maximum leaves the result unchanged mathematically
    # but keeps exp() from overflowing to inf (and the ratio from becoming
    # NaN) for large scores or small temperatures.
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    e = np.exp(scaled)
    return e / np.sum(e, axis=1, keepdims=True)

In [5]:
class Net(nn.Module):
    """Single affine layer mapping ``d`` inputs to ``q`` class scores.

    ``forward`` returns both the softmax probabilities and the raw scores.
    """

    def __init__(self,d,q):
        super().__init__()
        # The layer stays wrapped in nn.Sequential so its parameters keep the
        # names "fc.0.weight" / "fc.0.bias".
        self.fc = nn.Sequential(nn.Linear(d, q))

    def forward(self,x):
        """Return ``(probabilities, logits)`` for a 2-D batch ``x``."""
        logits = self.fc(x)
        probabilities = F.softmax(logits, dim=1)
        return probabilities, logits

In [6]:
def do_exp(x_tr,xs_tr,y_tr,x_te,xs_te,y_te, t=1, l=1, l_r=0.001, epochs=1000):
    """Run one repetition: privileged teacher, plain student, distilled student.

    Three ``Net`` models with two outputs are trained full-batch with RMSprop
    and an MSE loss on their first (softmax) output:

    1. a teacher on the privileged features ``xs_tr``;
    2. a regular student on ``x_tr`` against the hard labels;
    3. a distilled student on ``x_tr`` against
       ``(1 - l) * MSE(hard labels) + t**2 * l * MSE(teacher soft labels)``.

    Parameters
    ----------
    x_tr, x_te : 2-D array
        Regular features for training and testing.
    xs_tr, xs_te : 2-D array
        Privileged features for training and testing.
    y_tr, y_te : 1-D array
        Binary labels (boolean or 0/1).
    t : float, optional
        Temperature applied to the teacher scores before the softmax.
    l : float, optional
        Imitation weight; with the default of 1 the distilled student only
        imitates the teacher and the hard-label term has weight zero.
    l_r : float, optional
        RMSprop learning rate shared by all three models.
    epochs : int, optional
        Number of full-batch steps used for each model.

    Returns
    -------
    numpy.ndarray
        Test accuracies ``[privileged, regular, distilled]``.
    """
    criterion = torch.nn.MSELoss()

    def as_float_tensor(array):
        # Cast to float32 to match the dtype of the models' parameters.
        return torch.from_numpy(array).type(torch.FloatTensor)

    def one_hot(labels):
        # Column 0 flags class 1 and column 1 flags class 0.
        labels = np.asarray(labels) * 1.0
        return np.vstack((labels == 1, labels == 0)).T

    # Standardise both feature sets with statistics of the training split only.
    s_x = StandardScaler().fit(x_tr)
    s_s = StandardScaler().fit(xs_tr)
    x_tr = as_float_tensor(s_x.transform(x_tr))
    x_te = as_float_tensor(s_x.transform(x_te))
    xs_tr = as_float_tensor(s_s.transform(xs_tr))
    xs_te = as_float_tensor(s_s.transform(xs_te))
    y_tr = as_float_tensor(one_hot(y_tr) * 1.0)
    labels_te = np.argmax(one_hot(y_te), 1)

    def accuracy(model, inputs):
        # Evaluation needs no gradients.
        with torch.no_grad():
            probabilities, _ = model(inputs)
        predicted = torch.argmax(probabilities, dim=1).numpy()
        return np.mean(predicted == labels_te)

    # --- Privileged teacher ------------------------------------------------
    mlp_priv = Net(xs_tr.shape[1],2)
    optimizer = optim.RMSprop(mlp_priv.parameters(),lr=l_r)
    mlp_priv = fitModel(mlp_priv,optimizer,criterion,epochs,xs_tr,y_tr)
    with torch.no_grad():
        # Raw teacher scores on the training set; softened further below.
        _, teacher_scores = mlp_priv(xs_tr)
    res_priv = accuracy(mlp_priv, xs_te)

    # --- Regular student (no privileged information) -----------------------
    mlp_reg = Net(x_tr.shape[1],2)
    optimizer = optim.RMSprop(mlp_reg.parameters(),lr=l_r)
    mlp_reg = fitModel(mlp_reg,optimizer,criterion,epochs,x_tr,y_tr)
    res_reg = accuracy(mlp_reg, x_te)

    # --- Distilled student -------------------------------------------------
    # Soft labels: teacher scores passed through a softmax at temperature t.
    p_tr = as_float_tensor(softmax(teacher_scores.numpy(), t))
    mlp_dist = Net(x_tr.shape[1],2)
    optimizer = optim.RMSprop(mlp_dist.parameters(),lr=l_r)
    for _ in range(epochs):
        y_pred, _ = mlp_dist(x_tr)
        hard_loss = (1-l)*criterion(y_pred, y_tr)
        soft_loss = t*t*l*criterion(y_pred, p_tr)
        loss = hard_loss + soft_loss
        optimizer.zero_grad()
        # Targets carry no graph and a new graph is built every step, so
        # nothing has to be retained here.
        loss.backward()
        optimizer.step()
    res_dis = accuracy(mlp_dist, x_te)

    # --- CDF student (disabled) ---------------------------------------------
    # Student trained directly on a CDF transform of the privileged features.
    # NOTE: the target below has 2 * xs_tr.shape[1] columns, so it only matches
    # the 2-output student when the privileged representation has one column.
    # z_tr = norm.cdf(xs_tr.numpy())
    # z_tr = as_float_tensor(np.vstack((z_tr.T, 1 - z_tr.T)).T)
    # mlp_z = Net(x_tr.shape[1],2)
    # optimizer = optim.RMSprop(mlp_z.parameters(),lr=l_r)
    # mlp_z = fitModel(mlp_z,optimizer,criterion,epochs,x_tr,z_tr)
    # res_cdf = accuracy(mlp_z, x_te)
    # return np.array([res_priv, res_reg, res_dis, res_cdf])

    return np.array([res_priv, res_reg, res_dis])

## Results

In [7]:
# experiment hyper-parameters
d           = 50                       # number of regular features
n_te        = 1000                     # test-set size
n_reps      = 100                      # repetitions per (experiment, training size)
train_sizes = [200, 500, 1000, 2000]   # training-set sizes to sweep

# Raw per-repetition accuracies. ``results`` is keyed by experiment name only,
# so it ends up holding the run for the last training size; ``results_by_size``
# keeps every (experiment name, training size) combination.
results = {}
results_by_size = {}

## Scaling the sample size
#print("Training_size\tPrivileged\tNo PI\t\tGeneralised Distillation\tCDF Student")
print("Training_size\tPrivileged\tNo PI\t\tGeneralised Distillation")

# NOTE: "CDF student" stays empty while the CDF student is disabled, so that
# column is shorter than the others.
result_columns = [
    "Experiment name",
    "Training size",
    "Privileged",
    "Generalized Distillation",
    "No PI",
    "CDF student",
]
mean_results = {column: [] for column in result_columns}
std_results = {column: [] for column in result_columns}

for n_tr in train_sizes:
    # Seed both NumPy (data generation) and torch (weight initialisation) so
    # that every training size is reproducible from a fresh kernel.
    np.random.seed(0)
    torch.manual_seed(0)
    for experiment in [synthetic_01, synthetic_03]:
        R = np.zeros((n_reps,3))
        for rep in tqdm(range(n_reps)):
            a   = np.random.randn(d)
            (xs_tr,x_tr,y_tr) = experiment(a,n=n_tr)
            (xs_te,x_te,y_te) = experiment(a,n=n_te)
            R[rep,:] = do_exp(x_tr,xs_tr,y_tr,x_te,xs_te,y_te)
        means = R.mean(axis=0).round(2)
        stds  = R.std(axis=0).round(2)
        #print(f"{n_tr}\t\t{means[0]}(+/-{stds[0]})\t{means[1]}(+/-{stds[1]})\t{means[2]}(+/-{stds[2]})\t\t\t{means[3]}(+/-{stds[3]})\t")
        print(f"{n_tr}\t\t{means[0]}(+/-{stds[0]})\t{means[1]}(+/-{stds[1]})\t{means[2]}(+/-{stds[2]})")

        # Columns of R are ordered [privileged, no PI, generalised distillation].
        for table, stats in ((mean_results, means), (std_results, stds)):
            table["Experiment name"].append(experiment.__name__)
            table["Training size"].append(n_tr)
            table["Privileged"].append(stats[0])
            table["No PI"].append(stats[1])
            table["Generalized Distillation"].append(stats[2])
            #table["CDF student"].append(stats[3])

        results[experiment.__name__] = R
        results_by_size[(experiment.__name__, n_tr)] = R

Training_size	Privileged	No PI		Generalised Distillation


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [03:14<00:00,  1.94s/it]


200		0.95(+/-0.01)	0.87(+/-0.02)	0.95(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [05:02<00:00,  3.02s/it]


200		0.97(+/-0.02)	0.85(+/-0.03)	0.96(+/-0.02)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [05:56<00:00,  3.57s/it]


500		0.95(+/-0.01)	0.92(+/-0.01)	0.95(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [05:28<00:00,  3.28s/it]


500		0.97(+/-0.02)	0.93(+/-0.01)	0.97(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [07:57<00:00,  4.78s/it]


1000		0.95(+/-0.01)	0.94(+/-0.01)	0.95(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [07:26<00:00,  4.47s/it]


1000		0.97(+/-0.02)	0.95(+/-0.01)	0.97(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [04:05<00:00,  2.46s/it]


2000		0.95(+/-0.01)	0.95(+/-0.01)	0.95(+/-0.01)


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [03:54<00:00,  2.35s/it]

2000		0.98(+/-0.01)	0.96(+/-0.01)	0.97(+/-0.01)



