In [0]:
import math
import time
import pylab
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import random

# Compute device shared by every tensor and module in the notebook:
# the first CUDA GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")



def gen_data(N, num_tasks, sigma):
    """Sample one noisy 1-D linear regression problem per task on shared inputs.

    A single input column is drawn from a standard normal (NumPy RNG).  Each
    task then receives an integer slope picked from -5..5 excluding 0 and an
    integer intercept from -5..5 (both via the ``random`` module), and its
    targets are ``slope * X + intercept + sigma[task] * noise`` where the
    noise is a fresh standard-normal draw.

    Args:
        N: number of samples.
        num_tasks: number of regression tasks to generate.
        sigma: per-task noise scale, indexed by task.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(N, 1)`` and ``Y`` is a list of
        ``num_tasks`` arrays of shape ``(N, 1)``.
    """
    X = np.random.randn(N, 1)

    # Zero is excluded so that every task actually depends on X.
    nonzero_slopes = [*range(-5, 0), *range(1, 6)]
    slopes = [random.choice(nonzero_slopes) for _ in range(num_tasks)]
    intercepts = [random.randint(-5, 5) for _ in range(num_tasks)]

    Y = []
    for task in range(num_tasks):
        clean = X.dot(slopes[task]) + intercepts[task]
        Y.append(clean + sigma[task] * np.random.randn(N, 1))

    return X, Y

def gen_val_data(N, num_tasks, w=None, b=None, sigma=None):
    """Sample validation data from given per-task linear models.

    The original body read ``w``, ``b`` and ``sigma`` without defining them,
    so it only worked when notebook-level variables of those names happened
    to exist.  They are now optional keyword arguments; when one is omitted
    the notebook-level variable of the same name is still used if present,
    so existing two-argument calls behave as before.

    Args:
        N: number of samples.
        num_tasks: number of tasks to generate targets for.
        w: per-task slopes, indexed by task (optional).
        b: per-task intercepts, indexed by task (optional).
        sigma: per-task noise scales, indexed by task (optional).

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(N, 1)`` (standard normal) and
        ``Y`` is a list of ``num_tasks`` arrays
        ``X * w[i] + b[i] + sigma[i] * noise``.

    Raises:
        NameError: if a quantity is needed but was neither passed nor defined
            at notebook level.
    """
    def _resolve(name, value):
        # An explicit argument wins; otherwise fall back to the global the
        # original code implicitly depended on.
        if value is not None:
            return value
        if name in globals():
            return globals()[name]
        raise NameError(
            f"gen_val_data needs '{name}': pass it as an argument or define it at notebook level"
        )

    X = np.random.randn(N, 1)
    Y = []
    if num_tasks > 0:
        # Only looked up when actually used, as in the original.
        w, b, sigma = _resolve("w", w), _resolve("b", b), _resolve("sigma", sigma)
    for i in range(num_tasks):
        Y.append(X.dot(w[i]) + b[i] + sigma[i] * np.random.randn(N, 1))

    return X, Y
  
def gen_polynomial_data(N, num_tasks, sigma, degree, regression=True):
    """Sample one noisy 1-D polynomial regression problem per task on shared inputs.

    Inputs are a single standard-normal column (NumPy RNG).  For every task
    and every power 1..``degree`` an integer coefficient is picked from
    -5..5 excluding 0, and an integer intercept from -5..5 (``random``
    module).  Targets are
    ``intercept + sum_j coeff_j * X**j + sigma[task] * noise``.

    Args:
        N: number of samples.
        num_tasks: number of regression tasks to generate.
        sigma: per-task noise scale, indexed by task.
        degree: highest power of ``X`` used; 0 gives intercept plus noise only.
        regression: accepted for interface compatibility; not used.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(N, 1)`` and ``Y`` is a list of
        ``num_tasks`` arrays of shape ``(N, 1)``.
    """
    X = np.random.randn(N, 1)

    nonzero_coeffs = list(range(-5, 0)) + list(range(1, 6))
    # w[j - 1][i] is the coefficient of X**j for task i.
    w = [[random.choice(nonzero_coeffs) for _ in range(num_tasks)] for _ in range(degree)]
    # Fixed: this was randint(-5, -5), which always returns -5 and gave every
    # task the same intercept; the range now matches gen_data.
    b = [random.randint(-5, 5) for _ in range(num_tasks)]

    Y = []
    for i in range(num_tasks):
        y = b[i] + sigma[i] * np.random.randn(N, 1)
        for j in range(1, degree + 1):
            y += (X ** j).dot(w[j - 1][i])
        Y.append(y)

    return X, Y



class Data(Dataset):
    """Multi-task dataset: one shared input tensor plus one target tensor per task.

    Inputs and targets are converted to float32 tensors on the notebook-level
    ``device`` at construction time.
    """

    def __init__(self, feature_num, X, Y):
        """Store the data as tensors.

        Args:
            feature_num: value reported by ``len()``; callers pass the number
                of samples.
            X: array-like of inputs, indexed as ``X[idx, :]``.
            Y: sequence with one array-like of targets per task.
        """
        self.num_tasks = len(Y)
        self.feature_num = feature_num

        as_tensor = dict(dtype=torch.float32, device=device)
        self.X = torch.tensor(X, **as_tensor)
        self.Y = [torch.tensor(Y[task], **as_tensor) for task in range(self.num_tasks)]

    def __len__(self):
        """Return the sample count supplied at construction."""
        return self.feature_num

    def __getitem__(self, idx):
        """Return ``(x_row, [y_row for each task])`` for sample ``idx``."""
        sample_x = self.X[idx, :]
        sample_ys = [self.Y[task][idx, :] for task in range(self.num_tasks)]
        return sample_x, sample_ys

class MultiTaskLossWrapper(nn.Module):
    """Combine per-task squared errors using one learnable log-variance per task.

    For task ``i`` with log-variance ``s_i`` the per-sample loss is
    ``sum(exp(-s_i) * (target_i - output_i) ** 2 + s_i)`` over the last axis;
    the per-sample losses of all tasks are added and then averaged.
    """

    def __init__(self, num_tasks, model):
        """
        Args:
            num_tasks: number of tasks (and of log-variance parameters).
            model: module whose forward returns one output per task.
        """
        super().__init__()
        self.model = model
        self.num_tasks = num_tasks
        # All log-variances start at 0, i.e. every task has weight exp(0) = 1.
        self.log_vars = nn.Parameter(torch.zeros(num_tasks, device=device))

    def forward(self, input, targets):
        """Run the wrapped model and compute the weighted multi-task loss.

        Args:
            input: batch passed straight to the wrapped model.
            targets: per-task targets, indexed by task.

        Returns:
            A tuple ``(loss, task_losses, log_vars)``: the scalar loss tensor,
            a list with each task's mean weighted loss as a Python float, and
            the current log-variances as a list of floats.
        """
        predictions = self.model(input)

        per_sample = []
        task_means = []
        for k in range(self.num_tasks):
            log_var = self.log_vars[k]
            inv_var = torch.exp(-log_var)
            squared_error = (targets[k] - predictions[k]) ** 2.
            weighted = torch.sum(inv_var * squared_error + log_var, -1)
            per_sample.append(weighted)
            task_means.append(torch.mean(weighted).item())

        combined = sum(per_sample)
        return torch.mean(combined), task_means, self.log_vars.data.tolist()


class MTLModel(torch.nn.Module):
    """Hard-parameter-sharing network: one shared layer feeding one head per task.

    The input is a single scalar feature.  ``shared_fc`` maps it to
    ``n_hidden`` units; every task head is a two-layer MLP on top of that
    shared representation.
    """

    def __init__(self, n_hidden, n_output, num_tasks):
        """
        Args:
            n_hidden: width of the shared layer and of each head's hidden layer.
            n_output: output size of each head.
            num_tasks: number of task heads to create.
        """
        super(MTLModel, self).__init__()
        self.num_tasks = num_tasks
        self.shared_fc = nn.Sequential(nn.Linear(1, n_hidden), nn.ReLU())
        # The heads must live in an nn.ModuleList.  In a plain Python list
        # they are not registered as submodules, so parameters() (and hence
        # the optimizer), state_dict() and .to() silently skip them and the
        # heads are never trained.  Registration also makes the per-head
        # .to(device) unnecessary: moving the model moves the heads with it.
        self.nets = nn.ModuleList([
            nn.Sequential(
                nn.Linear(n_hidden, n_hidden),
                nn.ReLU(),
                nn.Linear(n_hidden, n_output),
            )
            for _ in range(num_tasks)
        ])

    def forward(self, x):
        """Return a list with one output tensor per task for the batch ``x``."""
        shared_out = self.shared_fc(x)
        return [head(shared_out) for head in self.nets]

In [0]:
seed = 5
# Seed every RNG the notebook draws from.  Only NumPy used to be seeded, but
# the data generators pick coefficients with the `random` module and the
# model initialisation / DataLoader shuffling use torch, so runs were not
# reproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

feature_num = 1500       # number of training samples per run
val_feature_num = 200    # number of validation samples per run
nb_epoch = 2000          # maximum epochs per run (early stopping may end sooner)
batch_size = 75
hidden_dim = 512         # hidden width passed to MTLModel
lr = 0.1                 # Adam learning rate
fixed = False            # True: every task uses `fixed_sigma` as its noise scale
fixed_sigma = 1



In [0]:
# Experiment: best validation loss as a function of the number of tasks.
# For every task count in 2..max_num_tasks, `num_iters` freshly initialised
# models are trained on freshly generated degree-8 polynomial data; the lowest
# per-task validation loss of each run is averaged into `avg_min_val_loss`.
from datetime import datetime
import math

# NOTE(review): anomaly detection slows every backward pass; kept because the
# original enabled it, but consider turning it off for long sweeps.
torch.autograd.set_detect_anomaly(True)
patience = 50    # epochs without improvement tolerated before stopping
delta = 1e-4     # minimum decrease that counts as an improvement
num_iters = 5
max_num_tasks = 5
avg_min_val_loss = [0] * (max_num_tasks - 1)

for num_tasks in range(2, max_num_tasks + 1):
    for i in range(num_iters):
        print(i)
        if fixed:
            sigma = [fixed_sigma for _ in range(num_tasks)]
        else:
            # Task j (1-based) gets noise scale j.
            sigma = list(range(1, num_tasks + 1))

        # Generate train and validation samples in one call so that both
        # splits share the same per-task coefficients.
        X, Y = gen_polynomial_data(feature_num + val_feature_num, num_tasks, sigma, 8)
        X_val, Y_val = X[feature_num:], [y[feature_num:] for y in Y]
        X, Y = X[:feature_num], [y[:feature_num] for y in Y]

        lowest_val_loss = None
        counter = 0
        early_stop = False

        train_data = Data(feature_num, X, Y)
        val_data = Data(val_feature_num, X_val, Y_val)
        train_data_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # The whole validation set is averaged every epoch, so its order is
        # irrelevant and shuffling it would only consume random numbers.
        val_data_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

        model = MTLModel(hidden_dim, 1, num_tasks)
        mtl = MultiTaskLossWrapper(num_tasks, model)

        model.to(device)
        mtl.to(device)

        # eps follows keras.backend.epsilon(), see
        # https://github.com/keras-team/keras/blob/master/keras/optimizers.py
        optimizer = torch.optim.Adam(mtl.parameters(), lr=lr, eps=1e-07)

        loss_list = []       # mean training loss per epoch
        val_loss_list = []   # mean validation loss per epoch
        plot_val_loss = []   # per-task validation loss, sampled every 25 epochs
        times = []           # wall-clock seconds per epoch
        for t in range(nb_epoch):
            start = time.time()
            cumulative_loss = 0.0
            cumulative_val_loss = 0.0

            for X_batch, Y_batch in train_data_loader:
                X_batch, Y_batch = X_batch.to(device), [y.to(device) for y in Y_batch]

                loss, task_losses, log_vars = mtl(X_batch, Y_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Weight each batch mean by its size: the last batch may be
                # smaller than batch_size.
                cumulative_loss += loss.item() * X_batch.shape[0]

            with torch.no_grad():
                for X_val_batch, Y_val_batch in val_data_loader:
                    X_val_batch, Y_val_batch = X_val_batch.to(device), [y.to(device) for y in Y_val_batch]
                    val_loss, task_losses_val, _ = mtl(X_val_batch, Y_val_batch)
                    cumulative_val_loss += val_loss.item() * X_val_batch.shape[0]

            # Mean loss over the whole validation set.  The previous code
            # divided by val_feature_num / batch_size (not the batch count)
            # and then ignored the result, tracking only the last batch.
            val_loss_batch = cumulative_val_loss / len(val_data)
            loss_list.append(cumulative_loss / len(train_data))
            val_loss_list.append(val_loss_batch)
            times.append(time.time() - start)

            # Early stopping on the epoch mean.  A non-finite loss fails the
            # comparison and therefore counts as "no improvement".
            if lowest_val_loss is None or val_loss_batch <= lowest_val_loss - delta:
                lowest_val_loss = val_loss_batch
                counter = 0
            else:
                counter += 1
                if counter >= patience:
                    early_stop = True

            if t % 25 == 0:
                plot_val_loss.append(val_loss_batch / num_tasks)

            if early_stop:
                break

        # lowest_val_loss is a plain float, so the accumulator stays a list of
        # floats instead of device tensors.
        avg_min_val_loss[num_tasks - 2] += lowest_val_loss / num_tasks
    avg_min_val_loss[num_tasks - 2] /= num_iters

    print('Finished Task', num_tasks)

### Increase noise on task 3

In [0]:
# Experiment: best validation loss as the noise on task 3 grows.
# Three tasks with noise scales [1, 2, k] for k in 1..max_num_sigma; for each
# k, `num_iters` freshly initialised models are trained on freshly generated
# degree-8 polynomial data and the lowest per-task validation loss of each run
# is averaged into `avg_min_val_loss`.
from datetime import datetime
import math

# NOTE(review): anomaly detection slows every backward pass; kept because the
# original enabled it, but consider turning it off for long sweeps.
torch.autograd.set_detect_anomaly(True)
patience = 50    # epochs without improvement tolerated before stopping
delta = 1e-4     # minimum decrease that counts as an improvement
num_iters = 3
max_num_tasks = 5
max_num_sigma = 5
avg_min_val_loss = [0] * (max_num_sigma)

for k in range(1, max_num_sigma + 1):
    num_tasks = 3
    for i in range(num_iters):
        print(i)
        if fixed:
            sigma = [fixed_sigma for _ in range(num_tasks)]
        else:
            # Only the third task's noise scale is swept.
            sigma = [1, 2, k]

        # Generate train and validation samples in one call so that both
        # splits share the same per-task coefficients.
        X, Y = gen_polynomial_data(feature_num + val_feature_num, num_tasks, sigma, 8)
        X_val, Y_val = X[feature_num:], [y[feature_num:] for y in Y]
        X, Y = X[:feature_num], [y[:feature_num] for y in Y]

        lowest_val_loss = None
        counter = 0
        early_stop = False

        train_data = Data(feature_num, X, Y)
        val_data = Data(val_feature_num, X_val, Y_val)
        train_data_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # The whole validation set is averaged every epoch, so its order is
        # irrelevant and shuffling it would only consume random numbers.
        val_data_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

        model = MTLModel(hidden_dim, 1, num_tasks)
        mtl = MultiTaskLossWrapper(num_tasks, model)

        model.to(device)
        mtl.to(device)

        # eps follows keras.backend.epsilon(), see
        # https://github.com/keras-team/keras/blob/master/keras/optimizers.py
        optimizer = torch.optim.Adam(mtl.parameters(), lr=lr, eps=1e-07)

        loss_list = []       # mean training loss per epoch
        val_loss_list = []   # mean validation loss per epoch
        plot_val_loss = []   # per-task validation loss, sampled every 25 epochs
        times = []           # wall-clock seconds per epoch
        for t in range(nb_epoch):
            start = time.time()
            cumulative_loss = 0.0
            cumulative_val_loss = 0.0

            for X_batch, Y_batch in train_data_loader:
                X_batch, Y_batch = X_batch.to(device), [y.to(device) for y in Y_batch]

                loss, task_losses, log_vars = mtl(X_batch, Y_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Weight each batch mean by its size: the last batch may be
                # smaller than batch_size.
                cumulative_loss += loss.item() * X_batch.shape[0]

            with torch.no_grad():
                for X_val_batch, Y_val_batch in val_data_loader:
                    X_val_batch, Y_val_batch = X_val_batch.to(device), [y.to(device) for y in Y_val_batch]
                    val_loss, task_losses_val, _ = mtl(X_val_batch, Y_val_batch)
                    cumulative_val_loss += val_loss.item() * X_val_batch.shape[0]

            # Mean loss over the whole validation set.  The previous code
            # divided by val_feature_num / batch_size (not the batch count)
            # and then ignored the result, tracking only the last batch.
            val_loss_batch = cumulative_val_loss / len(val_data)
            loss_list.append(cumulative_loss / len(train_data))
            val_loss_list.append(val_loss_batch)
            times.append(time.time() - start)

            # Early stopping on the epoch mean.  A non-finite loss fails the
            # comparison and therefore counts as "no improvement".
            if lowest_val_loss is None or val_loss_batch <= lowest_val_loss - delta:
                lowest_val_loss = val_loss_batch
                counter = 0
            else:
                counter += 1
                if counter >= patience:
                    early_stop = True

            if t % 25 == 0:
                plot_val_loss.append(val_loss_batch / num_tasks)

            if early_stop:
                break

        # lowest_val_loss is a plain float, so the accumulator stays a list of
        # floats instead of device tensors.
        avg_min_val_loss[k - 1] += lowest_val_loss / num_tasks
    avg_min_val_loss[k - 1] /= num_iters

    # Report the swept value; num_tasks is constant (3) in this cell, so the
    # old message printed the same line on every pass.
    print('Finished sigma', k)