# Test: Deep, non-linear, two-factor model
- Date: Dec 2, 2020

## TODO:
- [ ] Test ShallowSymBilinear (W tensor, A,B + non-linear (sigmoid))
    - [ ] Run experiments and compare performances 

- [ ] Implement Residual learning framework for bilinear models
- [ ] Test ResidualBilinear 
    - [ ] Single W layer + non-linear
    - [ ] Two step residual learning: similar to ResNet
    



A standard symmetric bilinear model in Tenenbaum & Freeman (2000) can be described as:
$$y^{sc}_k = \sum_{j} \sum_{i} w_{ijk}a^s_{i}b^c_{j}$$, which has an equivalent vector form:

$$\mathbf{y}^{sc} = \sum_{j} \sum_{i} \mathbf{w}_{ij}a^s_{i}b^c_{j}$$ where $\mathbf{w}_{ij}$ is a vector of length $K$ whose $k$-th entry is $w_{ijk}$.

This symmetric model has 2 types of model parameters:
- content variable $b$ of length $J$
- $K$ matrices of size $(I,J)$ (one slice of $W$ per output dimension $k$): the total number of parameters of this 3-dimensional tensor $W$ is $I \times J \times K$.
    - Basis vector interpretation (see Eqn. 2.3): an alternative way to view the interaction weight parameter $W$ is as $I \times J$ vectors $w_{ij}$, each of length $K$.
      The vector $w_{ij}$ specifies how the $i$-th component of a style vector $a^s$ and the $j$-th component of a content vector $b^c$ interact over the entire image/data point.

## Load libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
# %reset out

In [None]:
import os,sys
import re
import math
from datetime import datetime
import time
sys.dont_write_bytecode = True

In [None]:
import pandas as pd
import joblib

import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from skimage.color import rgb2gray
from skimage.transform import resize

from pprint import pprint
from pathlib import Path
from typing import List, Set, Dict, Tuple, Optional, Iterable, Mapping, Union, Callable

from ipdb import set_trace

In [None]:
# import holoviews as hv
# from holoviews import opts
# hv.extension('bokeh')

In [None]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from  torch.linalg import norm as tnorm

from torchvision import datasets, transforms
from torch.autograd import Variable

## Set Path


In [None]:
# Resolve the project layout relative to the directory the notebook runs from:
# the notebook folder itself, its parent (project root) and the root's `src`.
this_nb_path = Path(os.getcwd())
ROOT = this_nb_path.parent
SRC = ROOT / 'src'
paths2add = [this_nb_path, ROOT, SRC]

for label, location in (
    ("Project root: ", ROOT),
    ('Src folder: ', SRC),
    ("This nb path: ", this_nb_path),
):
    print(label, str(location))

# Prepend every directory that is not yet importable. Because each entry is
# inserted at index 0, later entries end up ahead of earlier ones.
for p in paths2add:
    entry = str(p)
    if entry in sys.path:
        continue
    sys.path.insert(0, entry)
    print(entry, "added to the path\n")

# print(sys.path)

In [None]:
# Import libraries from the source
from src.models.SymBilinear import ShallowSymBilinear
from src.utils.misc import info
from src.data.transforms.functional import to_3dim

## Helpers

In [None]:
def now2str():
    """Return the current time from ``datetime.now()`` as ``MM_DD_HH:MM:SS``."""
    return datetime.now().strftime("%m_%d_%H:%M:%S")

def info(arr, header=None):
    """Print a short summary (shape, dtype, value range) of an array-like.

    NOTE: this definition rebinds the name ``info`` that is also imported
    from ``src.utils.misc`` earlier in this file.

    Args:
        arr: object exposing ``.shape`` and ``.dtype`` that ``np.ravel`` can
            flatten (e.g. a NumPy array).
        header: line printed before the summary; defaults to 30 ``=`` signs.

    Raises:
        ValueError: if ``arr`` contains no elements (no min/max exists).
    """
    if header is None:
        header = "=" * 30
    print(header)
    print("shape: ", arr.shape)
    print("dtype: ", arr.dtype)
    # Reduce inside NumPy rather than with the builtin min()/max(): the
    # builtins walk the flattened array one Python object at a time and give
    # position-dependent answers when a NaN is present, whereas the NumPy
    # reductions are vectorized and always propagate NaN.
    flat = np.ravel(arr)
    print("min, max: ", flat.min(), flat.max())

In [None]:
def to_3dim(X: torch.Tensor, target_size: Tuple[int,int,int], dtype=torch.float32)->torch.Tensor:
    """
    Rearrange a data matrix X of size (n_styles*dim_x, n_contents)
    into a tensor of size (n_styles, n_contents, dim_x).

    Rows ``s*dim_x`` to ``(s+1)*dim_x`` of column ``c`` in X form one
    flattened data point; it becomes ``output[s, c]``.

    Args:
    - X: torch.Tensor of 2dim data matrix
    - target_size: tuple of n_style, n_contents, dim_x
    - dtype: dtype of the returned tensor

    Returns:
    - A new contiguous tensor of shape ``target_size`` and the requested
      dtype. It never shares storage with X and stays on X's device.

    Raises:
    - AssertionError if X is not 2-dimensional or its shape does not match
      ``target_size``.
    """
    assert X.ndim == 2
    # Coerce to plain ints so integer-like sizes (e.g. the result of np.prod)
    # can be passed straight to reshape.
    n_styles, n_contents, dim_x = (int(d) for d in target_size)
    assert X.shape[0] == n_styles * dim_x
    assert X.shape[1] == n_contents

    # Split the row axis into (style, pixel) and move the pixel axis last.
    # This is one vectorized operation instead of copying
    # n_styles * n_contents slices in a Python double loop.
    per_style = X.reshape(n_styles, dim_x, n_contents)
    rearranged = per_style.permute(0, 2, 1)
    # clone() guarantees a fresh, contiguous result even when the permute is a
    # no-op on the memory layout (e.g. dim_x == 1).
    return rearranged.clone(memory_format=torch.contiguous_format).to(dtype)
    
        
# def mse(out, target):
#     """
#     Return a 
#     out: a minibatch of reconstructed images: (S,C,K)
#     target: a minibatch of ground-truth images: (S,C,K)
#     """
#     assert out.shape == target.shape
#     n_styles, n_contents, dim_x = out.shape
#     n_samples = n_stlyes * n_contents
#     return nn.MSELoss()

# Module-level MSE criterion built with nn.MSELoss's default arguments.
# NOTE(review): the training cell further down constructs its own
# nn.MSELoss(reduction=reduction) and does not reference `loss_fn`.
loss_fn = nn.MSELoss()   
    

In [None]:
# Display the shapes of the style and content factor tensors.
# NOTE(review): `styles` and `contents` are only assigned in the training cell
# further down, so this cell needs that cell to have been run first.
styles.shape,contents.shape


In [None]:
# Sanity check on the trailing two dimensions of each factor tensor:
# contents should end in (dim_content, 1) and styles in (1, dim_style).
# The cell displays the two comparison results as a tuple.
contents.shape[-2:] == (dim_content,1), styles.shape[-2:] == (1,dim_style)

In [None]:
# Build the model from the style factors, content factors and tensor W.
# NOTE(review): `TFModel` is neither defined nor imported in the visible part
# of this file (only ShallowSymBilinear is imported from
# src.models.SymBilinear) — confirm where it comes from.
model = TFModel(styles, contents, W)

In [None]:
# List every entry yielded by model.named_parameters() with its shape.
for name, p in model.named_parameters():
    print(f"{name}: {p.shape}")

In [None]:
# Call the model with two positional arguments and show the output shape.
# Presumably these are a style index and a content index — TODO confirm
# against TFModel's forward signature.
model(0,0).shape

In [None]:
# Call the model with no arguments and show the output shape. The training
# cell compares the result of this same call against a target of shape
# (n_styles, n_contents, dim_x).
out = model()
out.shape

## Restore data matrix variable X as saved from the notebook "02"


In [None]:
%store -r X
%store -r TARGET_SIZE

In [None]:
# Test create_target
def test_create_target():
    # TODO: placeholder — no test logic has been written yet.
    pass

# Expected layout: 3 styles, 9 contents, x_dim = np.prod(TARGET_SIZE), TARGET_SIZE = (64,64,3)
img_size = TARGET_SIZE
dim_x = np.prod(img_size)
sx, n_contents = X.shape
# X is laid out as (n_styles*dim_x, n_contents) (see to_3dim), so the row
# count divided by dim_x is the number of styles.
n_styles = int(sx/dim_x)
print(X.shape)
print("n_styles, n_contents, dim_x: ", n_styles, n_contents, dim_x)

In [None]:
# Rearrange the 2-dim data matrix into (n_styles, n_contents, dim_x) and
# display the resulting shape.
X_3d = to_3dim(X, (n_styles, n_contents, dim_x) )
X_3d.shape

In [None]:
# Show the current model output `out` (detached from the autograd graph).
# NOTE(review): `visualize` is not defined or imported in the visible part of
# this file — confirm where it comes from.
# visualize(X, n_styles, n_contents, img_size);
# visualize(X_3d, n_styles, n_contents, img_size);
visualize(out.detach(), n_styles, n_contents, img_size, 
          normalize=True);

## Compiled training specs

In [None]:
def mkdir(p: Path, parents=True):
    """Create directory ``p`` unless something already exists at that path.

    A ``Created: ...`` line is printed only when the directory was made.

    Args:
        p: directory path to create.
        parents: forwarded to ``Path.mkdir`` (create missing ancestors).
    """
    if p.exists():
        return
    p.mkdir(parents=parents)
    print("Created: ", p)


In [None]:
def create_exp_name(hyperparams):
    # TODO: placeholder — not implemented; currently returns None.
    pass

# Hyperparameters
# Number of styles/contents and the length of each style/content factor vector.
n_styles, dim_style = 3, 3
n_contents, dim_content = 9, 4
img_size = (64,64,3)
dim_x = np.prod(img_size)  # length of one flattened data point

# Define model
# All three factors start from standard-normal samples.
styles = torch.randn((n_styles, 1,1, dim_style)) # A: each row is a style vector
W = torch.randn((dim_x, dim_style, dim_content))
contents = torch.randn((n_contents, 1,1, dim_content,1)) # B: each column is a content vector

# NOTE(review): `TFModel` is neither defined nor imported in the visible part
# of this file — confirm where it comes from.
model = TFModel(styles, contents, W)
# model.show_params()

# Gradient computation
## learn_rate depending on the type of reduction on computing the MSELoss
lrs = {'mean': 1e-2,
      'sum': 1e-6}


# Specify loss function and learning rate
reduction = 'mean'
lr = lrs[reduction]
lr_W = lr*30  # W is trained with a 30x larger learning rate than the factors
# Optimizer
# Two parameter groups: styles/contents use the default `lr` given to Adam,
# while W overrides it with `lr_W`.
optim_params = [
    {'params': [model.styles, model.contents]},
    {'params': [model.W], 'lr': lr_W}
]
optimizer = optim.Adam(optim_params, lr=lr)


# Training configs
max_epoches = 100
print_every = 10  # log the loss and parameter/gradient norms every N epochs
show_every = 30   # plot parameters and reconstructions every N epochs

# data
# Target reconstructions, laid out as (n_styles, n_contents, dim_x).
target = to_3dim(X, (n_styles, n_contents, dim_x))

# Start training
start = time.time()
losses = []
for ep in range(max_epoches):
    # Compute loss, and compute partial derivatives wrt each parameters, which will be stored 
    # in each parameter (tensor)'s `.grad` property
    out = model()
    # NOTE(review): a new MSELoss module is constructed on every iteration.
    loss = nn.MSELoss(reduction=reduction)(out, target) #per-dim of x (pixel)
    
    # Make sure all the `.grad`s of the model parameters are zero 
    optimizer.zero_grad()
    loss.backward()
    losses.append(loss.item())

    
    # Check if the parameters are changing before/after the gradient step
    # Presumably cache_params() snapshots the current values and
    # all_params_changed() compares against that snapshot — confirm in TFModel.
    model.cache_params()
    # Update the parameter values using the current partial derivatives based on the current loss
    optimizer.step()
    model.all_params_changed()
   
#     set_trace()
    
    # Log
    with torch.no_grad():
        if (ep+1)%print_every == 0:
            # NOTE(review): the condition tests `ep+1` but the message prints `ep`.
            print(f"Ep {ep}: {loss.item()}")
            for n,p in model.named_parameters():
                print(n)
                print('\t', tnorm(p), tnorm(p.grad))
        if (ep+1)%show_every == 0:
            model.show_params()
            # `out` was computed before optimizer.step(), so this figure shows
            # the reconstruction from the pre-update parameters.
            visualize(out, n_styles, n_contents, img_size, normalize=True);
print(f"Took {time.time() - start} sec. Loss: {losses[-1]}")

In [None]:
# Experiment name
# The f-prefix is required here: without it the results land in a directory
# literally named "{model.descr()}" instead of one named after the model.
result_dir = Path(f"../results/batch_bilinear/{model.descr()}")
mkdir(result_dir)
# `ep` is the last index of the training loop (max_epoches - 1).
exp_descr = f"reduction:{reduction}_lr:{lr}_lrW:{lr_W}_ep:{ep}"

# save model parameters
# save last reconstructions
# model.show_params() and visualize() are expected to return objects exposing
# `.savefig` (presumably matplotlib figures — confirm in their definitions).
f_params = model.show_params()
f_params.savefig(result_dir/f"params_{exp_descr}")
# No gradients are needed to produce the final reconstruction figure.
with torch.no_grad():
    out = model()
    f_out = visualize(out, n_styles, n_contents, img_size, normalize=True);
    f_out.savefig(result_dir/f"xhat_{exp_descr}")

In [None]:
# Plot the loss value recorded at each training epoch.
plt.plot(losses)
