In [1]:
from datasets import get_cmu_mosi_dataset
import torch.nn as nn
import torch
import tltorch
from tltorch.factorized_layers.factorized_linear import FactorizedLinear
from tltorch.factorized_tensors import TensorizedTensor
from layers import FactorizedLinearCP



In [4]:
train_set, valid_set, test_set = get_cmu_mosi_dataset(binary=True)

In [5]:
batch_size = 4
batch = train_set[:batch_size][0]

In [2]:
rnn = nn.LSTM(input_size=300, hidden_size=128, num_layers=1, bidirectional=False, batch_first=True)

In [6]:
output, (h_n, c_n) = rnn(batch)

In [7]:
param_list = list(rnn.parameters())

In [8]:
W_ii = param_list[0][:128]
W_if = param_list[0][128:256]
W_ig = param_list[0][256:384]
W_io = param_list[0][384:]

In [9]:
W_hi = param_list[1][:128]
W_hf = param_list[1][128:256]
W_hg = param_list[1][256:384]
W_ho = param_list[1][384:]

In [10]:
b_ii = param_list[2][:128]
b_if = param_list[2][128:256]
b_ig = param_list[2][256:384]
b_io = param_list[2][384:]

In [11]:
b_hi = param_list[3][:128]
b_hf = param_list[3][128:256]
b_hg = param_list[3][256:384]
b_ho = param_list[3][384:]

In [12]:
tensorized_shape = tltorch.utils.get_tensorized_shape(128, 300)

Tensorizing (in, out)=((128, 300)) -> (((4, 4, 8), (4, 5, 15)))


In [14]:
layer_ii = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_ii.from_matrix(W_ii.data, b_ii.data)
layer_if = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_if.from_matrix(W_if.data, b_if.data)
layer_ig = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_ig.from_matrix(W_ig.data, b_ig.data)
layer_io = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_io.from_matrix(W_io.data, b_io.data)



In [15]:
tensorized_shape = tltorch.utils.get_tensorized_shape(128, 128)

Tensorizing (in, out)=((128, 128)) -> (((4, 4, 8), (4, 4, 8)))


In [16]:
layer_hi = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_hi.from_matrix(W_hi.data, b_hi.data)
layer_hf = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_hf.from_matrix(W_hf.data, b_hf.data)
layer_hg = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_hg.from_matrix(W_hg.data, b_hg.data)
layer_ho = FactorizedLinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)
layer_ho.from_matrix(W_ho.data, b_ho.data)

In [None]:
c = torch.zeros((batch_size, 128))
h = torch.zeros((batch_size, 128))
for seq in range(20):
    i = torch.sigmoid(layer_ii(batch[:,seq,:]))
    f = torch.sigmoid(layer_if(batch[:,seq,:]))
    g = torch.tanh(layer_ig(batch[:,seq,:]))
    o = torch.sigmoid(layer_io(batch[:,seq,:]))
    c = f * c + i * g
    h = o * torch.tanh(c)   

In [11]:
c = torch.zeros((batch_size, 128))
h = torch.zeros((batch_size, 128))
for seq in range(20):
    i = torch.sigmoid(batch[:,seq,:] @ W_ii.T + b_ii + h @ W_hi.T + b_hi)
    f = torch.sigmoid(batch[:,seq,:] @ W_if.T + b_if + h @ W_hf.T + b_hf)
    g = torch.tanh(batch[:,seq,:] @ W_ig.T + b_ig + h @ W_hg.T + b_hg)
    o = torch.sigmoid(batch[:,seq,:] @ W_io.T + b_io + h @ W_ho.T + b_ho)
    c = f * c + i * g
    h = o * torch.tanh(c)

In [12]:
class LinearCP(FactorizedLinear):
    
    def __init__(self, in_tensorized_features, out_tensorized_features, bias=False,
                 max_rank=10, device=None, dtype=None):
        '''
        args:
            in_tensorized_features: a tuple of ints, (in_size_1, in_size_2, ..., in_size_n) 
            out_tensorized_features: a tuple of ints, (out_size_1, out_size_2, ..., out_size_m)
            bias: a boolean, True for bias False for no bias
            max_rank: maximum rank for CP decomposition of weight
        '''
        
        super(LinearCP, self).__init__(in_tensorized_features, out_tensorized_features, bias,
                                               factorization='cp', rank=max_rank, n_layers=1, 
                                               device=device, dtype=dtype)
        self.max_rank = max_rank
        self.n_input_factors = len(in_tensorized_features)
        self.n_output_factors = len(out_tensorized_features)
    
    def from_matrix(self, matrix, bias=None):
        
        self.weight = TensorizedTensor.from_matrix(matrix, 
                                                   self.out_tensorized_features, 
                                                   self.in_tensorized_features, 
                                                   self.max_rank, 
                                                   factorization='CP')
        if bias is None:
            self.bias = bias
        else:
            self.bias = nn.Parameter(bias)       
        
    def forward(self, x):
        '''
        X @ W.T + b
        
        factors are in the order of [out_factors, in_factors]
        '''
        
        # tensorize input
        output = x.reshape((x.shape[0],) + self.in_tensorized_features)
        print(output.shape)
        
        # forward propagate with input factors
        output = torch.einsum('na...,ar->n...r', output, self.weight.factors[self.n_output_factors])
        for factor in self.weight.factors[self.n_output_factors+1:]:
            output = torch.einsum('na...r,ar->n...r', output, factor)
            
        # forward propagate with output factors
        for factor in self.weight.factors[:self.n_output_factors-1]:
            output = torch.einsum('n...r,ar->n...ar', output, factor)
        output = torch.einsum('n...r,ar->n...a', output, self.weight.factors[self.n_output_factors-1])
        
        # vectorize output
        output = output.reshape((x.shape[0], self.out_features))
        
        # add bias
        if self.bias is not None:
            output = output + self.bias
        
        return output

In [13]:
layer = LinearCP(tensorized_shape[1], tensorized_shape[0], bias=True)

In [14]:
layer.from_matrix(W_ii.data, b_ii.data)



In [15]:
out = layer(batch[:,0,:])

torch.Size([4, 4, 5, 15])


In [16]:
out_ = batch[:,0,:] @ W_ii.T + b_ii

In [17]:
torch.isclose(out, out_)

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, 

In [30]:
class LinearCP_from_matrix(FactorizedLinear):
    
    def __init__(self, in_tensorized_features, out_tensorized_features, matrix, bias=None, has_bias=False,
                 max_rank=10, device=None, dtype=None):
        
        super(LinearCP_from_matrix, self).__init__(in_tensorized_features, out_tensorized_features, has_bias,
                                               factorization='cp', rank=max_rank, n_layers=1, 
                                               device=device, dtype=dtype)
        
        # replace weight and bias 
        self.weight = TensorizedTensor.from_matrix(matrix, out_tensorized_features, in_tensorized_features, max_rank, factorization='CP')
        self.bias = nn.Parameter(bias)
        self.n_input_factors = len(in_tensorized_features)
        self.n_output_factors = len(out_tensorized_features)
        
    def forward(self, x):
        '''
        X @ W.T
        '''
        
        # tensorize input
        output = x.reshape((x.shape[0],) + self.in_tensorized_features)
        
        # forward propagate with input factors
        output = torch.einsum('na...,ar->n...r', output, self.weight.factors[self.n_output_factors])
        for factor in self.weight.factors[self.n_output_factors+1:]:
            output = torch.einsum('na...r,ar->n...r', output, factor)
            
        # forward propagate with output factors
        for factor in self.weight.factors[:self.n_output_factors-1]:
            output = torch.einsum('n...r,ar->n...ar', output, factor)
        output = torch.einsum('n...r,ar->n...a', output, self.weight.factors[self.n_output_factors-1])
        
        # vectorize output
        output = output.reshape((x.shape[0], self.out_features))
        
        # add bias
        if self.bias is not None:
            output = output + self.bias
        
        return output

In [1]:
class LinearCP(FactorizedLinear):
    
    def __init__(self, in_tensorized_features, out_tensorized_features, bias=False,
                 max_rank=10, device=None, dtype=None):
        '''
        args:
            in_tensorized_features: a tuple of ints, (in_size_1, in_size_2, ..., in_size_n) 
            out_tensorized_features: a tuple of ints, (out_size_1, out_size_2, ..., out_size_m)
            bias: a boolean, True for bias False for no bias
            max_rank: maximum rank for CP decomposition of weight
        '''
        
        super(LinearCP, self).__init__(in_tensorized_features, out_tensorized_features, bias,
                                               factorization='cp', rank=max_rank, n_layers=1, 
                                               device=device, dtype=dtype)
        self.max_rank = max_rank
        self.n_input_factors = len(in_tensorized_features)
        self.n_output_factors = len(out_tensorized_features)
    
    def from_matrix(self, matrix, bias=None):
        
        self.weight = TensorizedTensor.from_matrix(matrix, 
                                                   self.out_tensorized_features, 
                                                   self.in_tensorized_features, 
                                                   self.max_rank, 
                                                   factorization='CP')
        if bias is None:
            self.bias = bias
        else:
            self.bias = nn.Parameter(bias)       
        
    def forward(self, x):
        '''
        X @ W.T + b
        
        factors are in the order of [out_factors, in_factors]
        '''
        
        # tensorize input
        output = x.reshape((x.shape[0],) + self.in_tensorized_features)
        print(output.shape)
        
        # forward propagate with input factors
        output = torch.einsum('na...,ar->n...r', output, self.weight.factors[self.n_output_factors])
        for factor in self.weight.factors[self.n_output_factors+1:]:
            output = torch.einsum('na...r,ar->n...r', output, factor)
            
        # forward propagate with output factors
        for factor in self.weight.factors[:self.n_output_factors-1]:
            output = torch.einsum('n...r,ar->n...ar', output, factor)
        output = torch.einsum('n...r,ar->n...a', output, self.weight.factors[self.n_output_factors-1])
        
        # vectorize output
        output = output.reshape((x.shape[0], self.out_features))
        
        # add bias
        if self.bias is not None:
            output = output + self.bias
        
        return output

NameError: name 'FactorizedLinear' is not defined

In [124]:
layer = LinearCP((2,3),(4, 5))

In [125]:
x = torch.randn((5, 6))
output = layer(x)

torch.Size([5, 2, 3])


In [126]:
output_ = x @ layer.weight.to_matrix().T

In [128]:
torch.isclose(output, output_)

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True, False,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])

In [53]:
layer.weight.factors[0].shape

torch.Size([4, 10])

In [54]:
layer.weight.order

4

In [None]:
def cp_times_matrix_fwd(tensor, matrix):
    """
    Multiplies a tensorly CP tensorized matrix and an input matrix
    
    X @ W
    """
    
    order = len(tensor.tensorized_shape[0])
    saved_tensors = []

    # tensorize the input
    output = matrix.reshape((matrix.shape[0],) + tensor.tensorized_shape[0])
    saved_tensors.append(output)

    # forward propagate with input factors
    output = torch.einsum('na...,ar->n...r', output, tensor.factors[0])
    saved_tensors.append(output)
    for factor in tensor.factors[1:order]:
        output = torch.einsum('na...r,ar->n...r', output, factor)
        saved_tensors.append(output)

    # forward propagate with output factors
    for factor in tensor.factors[order:tensor.order-1]:
        output = torch.einsum('n...r,ar->n...ar', output, factor)
        saved_tensors.append(output)
    output = torch.einsum('n...r,ar->n...a', output, tensor.factors[-1])
    
    # vectorize the output
    output = output.reshape((matrix.shape[0], tensor.shape[1]))
    
    return output, saved_tensors

#### LSTM

For each element in the input sequence, each layer computes the following function:

$i_t = \sigma(W_{ii}x_t + b_{ii} + W_{hi}h_{t-1} +b_{hi})$  
$f_t = \sigma(W_{if}x_t + b_{if} + W_{hf}h_{t-1} +b_{hf})$  
$g_t = \tanh(W_{ig}x_t + b_{ig} + W_{hg}h_{t-1} +b_{hg})$  
$o_t = \sigma(W_{io}x_t + b_{io} + W_{ho}h_{t-1} + b_{ho})$  
$c_t = f_t \odot c_{t-1} + i_t \odot g_t$  
$h_t = o_t \odot \tanh(c_t)$  

where $h_t$ is the hidden state at time $t$, $c_t$ is the cell state at time $t$, $x_t$ is the input at time $t$, $h_{t-1}$ is the hidden state of the layer at time $t-1$ or the initial hidden state at time $o$, and $i_t, f_t, o_t$ are the input, forget,cell and output gates, respectively. $\sigma$ is the sigmoid function, and $\odot$ is the Hadamard product.

In a multilayer LSTM, the input $x_t^{(l)}$ of the $l$-th layer $(l \geq 2)$ is the hidden state $h_t^{(l-1)}$ of the previous layer multiplied by dropout $\delta_t^{(l-1)}$ where each $\delta_t^{(l-1)}$ is a Bernoulli random variable which is 0 with probability dropout.

If proj_size > 0 is specified, LSTM with projections will be used. This changes the LSTM cell in the following way. First, the diemnsion of $h_t$ will be changed from hidden_size to proj_size (dimensions of $W_{hi}$ will change accordingly). Second, the output hidden state of each layer will be multiplied by a learnable projection matrix: $h_t = W_{hr}h_t$. Note that as a consequence of this, the output of LSTM network will be of different shape as well.

##### Parameters

* **input_size** - the number of expected features in the input $x$
* **hidden_size** - the number of features in the hidden state $h$
* **num_layers** - the number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1
* **bias** - if False, then the layer does not use bias weights $b_ih$ and $b_hh$. Default: True
* **batch_first** - if True, then the input ad output tensors are provided as (batch,seq,feature) instead of (seq,batch,feature). Note that this does not apply to hidden or cell states. Default: False
* **dropout** - if non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout pro ability equal to drpout. Default: 0
* **bidirectional** - if True, becomes a bidirectional LSTM. Default: False
* **proj_size** - if > 0, will use LSTM with projections of corresponding size. Default: 0


##### Inputs: input, (h_0,c_0)

* **input**: tensor of shape $(L,N,H_{in})$ when batch_first=False or $(N,L,H_{in})$ when batch_first=True containing the features of the input sequence.  
* **h_0**: tensor of shape $(D*$num_layers$,N,H_{out})$ containing the initial hidden state for each element in the batch. Defaults to zeros if (h_0,c_0) is not provided.
* **c_0**: tensor of shape $(D*$num_layers$,N,H_{cell})$ containing the initial cell state for each element in the batch. Defaults to zeros if (h_0,c_0) is not provided.

where:


$N$ = batch size  
$L$ = sequence length  
$D$ = 2 if bidirectional=True otherwise 1  
$H_{in}$ = input_size  
$H_{cell}$ = hidden_size  
$H_{out}$ = proj_size if proj_size > 0 otherwise hidden_size


##### Outputs: output, (h_n,c_n)

* **output**: tensor of shape $(L,N,D*H_{out})$ when batch_first=False or $(N,L,D*H_{out})$ when batch_first=True containing the output features $(h_t)$ from the last layer of the LSTMk for each $t$.
* **h_n**: tensor of shape $(D*$num_layers$,N,H_{out})$ containing the final hidden state for eah element in the batch.
* **c_n**: tensor of shape $(D*$num_layers$,N,H_{cell})$ containing the final cell state for each element in the batch.