In [1]:
# realizing importing torch takes much longer time than expected, thus should always run this line before moving down. 
import torch
import torchvision
import torchtext
import torch.nn as nn # -- package
import torch.nn.functional as F # -- package

This is just a head start on PyTorch functionality, useful for building a basic understanding. For more advanced usage, it is always recommended to visit the official PyTorch API guide: <br>
https://pytorch.org/docs/stable/index.html

# Tensor related: 

## Tensor initialization

torch.zeros((dimension), dtype=(default)torch.float32)

<br>

torch.ones((dimension), dtype=(default)torch.float32)

<br>

torch.rand((dimension)) -> each element is drawn uniformly from the interval [0, 1)

<br>

torch.randn((dimension)) -> each element is drawn independently from the standard normal distribution (mean 0, variance 1)
> Useful for applying reparameterization trick for weights of Bayesian Neural Network

<br>

torch.tensor(input, dtype=None)
- "input": must have a consistent (non-ragged) shape and hold only numeric or boolean values. Can be a NumPy array or any other Python data structure satisfying the condition. 
- "dtype": inferred from "input" when left as None. Refer to this list for a complete set of datatypes: https://pytorch.org/docs/stable/tensors.html

<br>

torch.range(start=0, end, step=1) 
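> Similar to numpy.arange(), except that BOTH "start" and "end" are INCLUSIVE!!! Note that torch.range is deprecated; prefer torch.arange(start, end, step), which EXCLUDES "end". 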

<br>

torch.linspace(start, end, steps)
> Create a 1D tensor with each element being linearly spaced. 

<br>

torch.logspace(start, end, steps, base=10.0)
> Create a 1D tensor with each element being log-spaced, i.e. "base" raised to linearly spaced exponents from "start" to "end". <br>
e.g: torch.logspace(1, 3, steps=3) -> 10, 100, 1000


In [3]:
# Three log-spaced points running from 10**1 to 10**3; left as a bare
# expression so the notebook renders the resulting tensor.
torch.logspace(start=1, end=3, steps=3)

tensor([  10.,  100., 1000.])

## Tensor Operations

torch.flatten(input, start_dim=0, end_dim=-1) -> Tensor
> flatten the dimensions of "input" from "start_dim" through "end_dim" into a single dimension; with the default values the result is a 1D tensor. 

<br>

torch.reshape(input, shape_tuple) / Tensor.reshape(shape_tuple) -> Tensor
> the new shape must be able to contain all elements from "input". 

<br>

torch.cat(tensor_tuple, dim=0) -> Tensor
> Concatenates the given tuple of tensors along "dim". <br> All tensors must have the same shape except in the concatenated dimension. 

<br>

torch.permute(input, new_dim)/Tensor.permute(new_dim) -> Tensor
> permute the dimension according to "new_dim" tuple.
- new_dim: all integers in "new_dim" must be between 0 and 'input.dim() - 1', must be unique, and together must cover every dimension of "input". 

<br>

torch.swapaxes/transpose(input, axis0, axis1) -> Tensor
> swap two axes specified by "axis0" and "axis1" in "input".  

<br>

torch.gather(input, dim, index) -> Tensor
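> Extract elements from "input" according to "index" along "dim". 
- dim: must be a valid dimension of "input", i.e. smaller than "input.dim()"
- index: an integer (int64) tensor with the same number of dimensions as "input"; every value must be smaller than "input.size(dim)". Plays a role similar to the indices in "numpy.take_along_axis()"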
> See below code demo for how this method is used. 

In [None]:
# torch.gather demo on a 2x2 tensor: the same index tensor selects different
# elements depending on the dimension it is applied along.
t = torch.tensor([[1, 100], [3, 400]])
gather_index = torch.tensor([[0, 0], [1, 0]])
for gather_dim in (1, 0):
    # dim=1: out[i][j] = t[i][index[i][j]]   |   dim=0: out[i][j] = t[index[i][j]][j]
    print(torch.gather(t, gather_dim, gather_index))

Tensor.clone() -> Tensor
> Returns a copy of Tensor

<br>

Tensor.masked_fill_(mask, value) -> Tensor
> Fill the given Tensor with "value" at "mask" locations
- “masked_fill_” (trailing underscore) works in-place and returns the same Tensor; <br> "masked_fill" (no underscore) leaves the original untouched and returns a new Tensor. 
- mask: BOOLEAN Tensor, must have a shape broadcastable with “Tensor”; 
- value: a scalar; it replaces every element whose mask entry is “True”

<br>

Tensor.size(dim=None) -> Union[tuple, int]
> Returns size of the Tensor; 
- dim: when “dim” is specified, a scalar will be returned (the size for the corresponding dimension);

<br>

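Tensor.split(split_size_or_sections, dim=0) -> Tuple[Tensor, ...]:
> split given "Tensor" into a set of smaller Tensors along "dim". 
- split_size_or_sections: int/list[int]. An int gives the size of every chunk (the last chunk may be smaller); a list gives the size of each chunk individually and must sum to "Tensor.size(dim)";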

<br>

Tensor.squeeze(dim=None) -> Tensor: 
> Removes all dimensions in Tensor with size “1”; -> “A x 1 x B” will be changed to “A x B”
- dim: when specified, if the size of “dim” is 1, will be removed; 
    > OTHERWISE size remain UNCHANGED

<br>

Tensor.unsqueeze(dim) -> Tensor:
> Inserts a new dimension with size “1” at given “dim”;

<br>

Tensor.view(dim1_size, dim2_size, ….) -> Tensor
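> equivalent to a reshaping operation, except that the result always shares memory with “Tensor” (so “Tensor” must have a compatible memory layout, e.g. be contiguous). <br> 
The product of all new dimensions’ sizes must match the number of elements in “Tensor”. 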

<br>

Tensor.to(args) -> Tensor:
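> Return a Tensor with new properties (dtype and/or device). Commonly used for moving a Tensor onto a device. NOT an in-place operation: the returned Tensor must be assigned, e.g. "t = t.to(device)". 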


In [2]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
new_tensor = torch.zeros(3)
# Tensor.to() is NOT in-place: it returns the moved tensor, so the result has
# to be assigned back, otherwise `new_tensor` silently stays where it was.
new_tensor = new_tensor.to(device)
print(new_tensor.device)

cpu


Tensor.repeat((repeat_count)) -> Tensor
> Repeat a given tensor according to "repeat_count". 
- repeat count: a tuple showing the counts of repetitions along given dimensions. 
> Intuition for understanding this method: <br>
&emsp;    Treat whole "Tensor" as a scalar, and repetition would be just repeating this scalar to a larger tensor of shape "repeat_count". <br>
&emsp;    Then replacing those scalars into "Tensor" would give resulting tensor. 

<br>

Tensor.scatter_(dim, index, src) -> Tensor
> Replaces elements of "Tensor" at the positions given by "index" along "dim" with elements from "src"; in-place operation that returns the modified "Tensor" itself. 
- "index": a tensor of integers (int64); each element is the position along "dim" that the corresponding "src" element is written to (see "np.put_along_axis" for a similar idea). 
- "src": must have the same number of dimensions as "index", and must be at least as large as "index" in every dimension. 
    > Every value in "index" must be smaller than "Tensor.size(dim)", otherwise an "index out of range" error is raised
> See below code demo for a better understanding. 
<center><img src="reference_img/scatter_ref.jpg" width=800></center>

In [None]:
# Demo of Tensor.scatter, the out-of-place counterpart of Tensor.scatter_.
test_tensor = torch.arange(40).reshape(2, 5, 4)          # target: 2 batches, 5 rows, 4 columns
source_tensor = torch.arange(100, 132).reshape(2, 4, 4)  # values to write: 2 batches, 4 rows, 4 columns
# Every index row is [0, 1, 2, 3]; tiling that single row yields the (2, 4, 4) index tensor.
ind = torch.arange(4).repeat(2, 4, 1)

# Scattering along dim=2 instead, i.e. test_tensor.scatter(2, ind, source_tensor),
# gives a complete replacement of the elements of the first 4 rows.
# Along dim=1 the full index is tried first, then two shortened versions of it.
for index_rows in (ind, ind[:, 3:], ind[:, 2:]):
    print(test_tensor.scatter(1, index_rows, source_tensor))
# The two shortened runs show that, once the index has fewer rows, which part of it
# was sliced also decides which elements end up being filled.
# Use these runs (and more custom code) to understand the filling scheme shown in
# the attached image of Tensor.scatter_.


Tensor.expand(new_size) -> Tensor
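> Expand dimensions with size 1 to a larger size, without copying any data. 
- new_size: a tuple representing the expanded tensor's size. Each entry needs to equal the matching entry of "Tensor.size()" (or be "-1" to indicate no change), except at dimensions with size 1; extra dimensions may be prepended at the front. 
> Note: the expanded tensor is a view in which several elements share one memory location, so it is suggested to "clone" it before writing to it in-place, or it can lead to problems. 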

In [None]:
# Demo of Tensor.expand(new_size): only size-1 dimensions can grow.

test_tensor = torch.arange(32).reshape(2, 4, 1, 4)

# Grow the singleton third dimension from 1 to 5.
res = test_tensor.clone().expand(2, 4, 5, 4)
print(res.shape)

# Growing a non-singleton dimension raises an error,
# e.g. test_tensor.expand(2, 4, 1, 5) (last dimension 4 -> 5).

# New dimensions may be prepended at the front while the existing singleton
# dimension is expanded at its own position.
res3 = test_tensor.clone().expand(1, 1, 2, 4, 2, 4)
print(res3.shape)

# Several singleton dimensions can be expanded in a single call.
res4 = res3.clone().expand(2, 2, 2, 4, 2, 4)
print(res4.shape)

## Linear Algebra Operations

Tensor.addr(vec1, vec2, beta=1, alpha=1) -> Tensor; 
> An addition with a vector-OUTER-product and input(vector product will be a matrix), following the below formula: 

<center><img src="reference_img/addr.png" width=400></center>

- "input" is Tensor itself. 

<br>

Tensor.bmm(second-matrix, *) -> Tensor
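> Perform a BATCH matrix-matrix product: both "Tensor" (b, n, m) and "second-matrix" (b, m, p) must be 3D tensors with the same batch size, and the result has shape (b, n, p).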
<center><img src="reference_img/tensor_bmm.jpg" width=800></center>

- "second-matrix" must have shapes following the criterion of matrix product. 

<br>

Tensor.add(other) -> Tensor: 
> Add "Tensor" and "other" following broadcasting rules. 
- "other" must have a shape broadcastable with Tensor.
    > Shape broadcasting: the two shapes are compared starting from the LAST dimension; each pair of sizes must be equal, or one of them must be 1 or missing

<br>

Tensor.cross(other, dim=None, ) -> Tensor
> Returns cross product of vectors from Tensor and other, applied on “dim”; 
<br> &emsp; To avoid problems, need to ensure the vectors size are consistent

<br>

Tensor.det() -> Tensor: 
> Returns determinant of "Tensor". 
- Tensor: must have last two dimensions being a square matrix. Operation only applies to last two dimensions. 

<br>

Tensor.diag(diagonal=0) -> Tensor:
> Extract diagonal elements of "Tensor". 
- "Tensor" can be either 1D or 2D; <br> &emsp;if 1D: returns a new square matrix tensor with “Tensor” elements on diagonal; <br> &emsp;if 2D tensor, will return diagonal elements on “Tensor”. 
- “diagonal”: controls shifting of diagonal line; 
<br> &emsp;positive: the diagonal line is drawn above main diagonal; 

<br>

Tensor.triu(diagonal=0) -> Tensor: 
> Returns upper triangle of 2D “Tensor”. 

<br>

Tensor.matmul(other) -> Tensor:
> Performs a matrix product of “Tensor” and “other”, requiring their sizes to be compatible for multiplication; <br> &emsp;
Order matters: returns "Tensor @ other"


> This function is versatile: it automatically determines which operation to perform (dot product, matrix-vector, matrix-matrix or batched product) based on the dimensionality of its inputs; for more details refer to the API; <br>
When either input has more than 2 dimensions, batched matrix products are conducted; 



torch.linalg.eig(A) -> tuple[Tensor, Tensor]
> Computes eigenvalue decomposition of “A”, a DIAGONALIZABLE square matrix; 
<center><img src="reference_img/diagonalizable_mat.png" width=200></center>

> Returns: tuple[eigenvalues, eigenvectors], both complex-valued even when “A” is real <br> &emsp;
Realize: COLUMNS (dim=1) are corresponding eigenvectors!!!(See below image, where columns, grouped by red circles, represents one eigenvector) <br> &emsp;
Each eigenvector (column) of the final eigenvector matrix is NORMALIZED to have norm 1!!! 
<center><img src="reference_img/eigenvector_sample.png" width=600></center>

- "A": has shape (*, N, N), and the method will be applied to only last two dimensions of A.  

<br>

torch.linalg.norm(A, ord=None, dim=None, ) -> Tensor:
> Computes vector norm or matrix norm; 
- "ord": type of matrix norm to choose, if A is a matrix; (refer to API for specific types; )
	> Common Ord: Frobenius norm
- dim: If “dim” is an int, vector norm will be calculated; if 2-tuple, will calculate matrix norm.

<br>

torch.linalg.inv(A) -> Tensor: 
> return inverse of matrix A; requires the last two dimensions of "A" to form a square, invertible matrix.

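torch.svd(A) -> tuple(Tensor, Tensor, Tensor)
> Computes the singular-value-decomposition of matrix A; deprecated in favor of torch.linalg.svd, which returns Vh (the conjugate transpose of V) instead of V. 


> Returns a tuple, (U, S, V) such that "A = U @ diag(S) @ V.T" for real A; “U” and “V” have orthonormal columns, and “S” is a 1D tensor holding the singular values (the diagonal of the diagonal matrix) in descending order; <br>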


>Singular value decomposition can be used to determine whether a filter for convolution is separable. 
<!-- (see csc420 lecture 2 notes) -->

## Mathematical Functions

> Note: each function listed below comes in two forms: with a trailing "_" it operates in-place on the tensor; without it, a new tensor is returned. 

Tensor.asin_()
> return the element-wise inverse of the sine function, in RADIANS; e.g, finding the angle given a sine value

> acos, atan

<br>

Tensor.sin()
> cos, tan, sinh, cosh, tanh, arcsinh, arccosh, arctanh

<br>

Tensor.ceil(), Tensor.floor()

<br>

Tensor.exp()
> Exponential with base "e", and power as "Tensor". Element-wise operation. 

<br>

Tensor.sum(dim=None) -> Tensor
- dim: if "None", will flatten and sum. 

<br>

Tensor.logcumsumexp(dim) -> Tensor
> Returns log-sum-exponential calculation of a tensor along "dim". 
<center><img src="reference_img/log_sum_exp.png" width=400></center>

> The summation result is cumulative (see "numpy.cumsum" for more reference on cumulation)

> Using this method for finding a stable denominator when calculating softmax or other non-linear activation functions'results. 

<br>

Tensor.logsumexp(dim) -> Tensor
> Equivalent to logcumsumexp except the result keeps only the sum over the entire dimension. (The "dim" dimension will be reduced)


## Statistical functions

Tensor.amax/amin(dim) -> Tensor
> Take maximum/minimum of Tensor along “dim”; 

> Returns VALUE

<br>

Tensor.argmax/argmin(dim) -> Tensor
> Take maximum/minimum of Tensor along “dim”; 

> Returns INDICES; 

<br>

Tensor.mean/median/mode/sum(dim=None) -> Tensor
- "dim" can also be a tuple of ints for "mean" and "sum", meaning the reduction is calculated over multiple dimensions at once ("median" and "mode" only accept a single int). See below demo

<br>

Tensor.cov() -> Tensor
> Estimates the COVARIANCE MATRIX of the variables stored in the given Tensor (a single 1D variable gives its scalar variance). 
- Tensor: 2D tensor, where each row (index along dim=0) is one variable, and each column (index along dim=1) is one observation of all the variables. 
<center><img src="reference_img/cov.png" width=400></center>

> Note: $\bar{x}$ and $\bar{y}$ in above images are means of each sample vector $X$, $Y$ respectively. 

> Refer to API for biased output information; (parameter: correction, fweights, aweights) 

In [5]:
# Demo of Tensor.mean reducing over one or several dimensions
test_tensor = torch.arange(48, dtype=torch.float32).reshape(3, 4, 4)
# Reducing dims 1 and 2 together leaves a single mean per index of the foremost dimension.
print(test_tensor.mean(dim=(1, 2)))
# Reducing only the last dim leaves a (3, 4) grid of row means; averaging each of its
# rows again reproduces the result above.
print(test_tensor.mean(dim=2))

tensor([ 7.5000, 23.5000, 39.5000])
tensor([[ 1.5000,  5.5000,  9.5000, 13.5000],
        [17.5000, 21.5000, 25.5000, 29.5000],
        [33.5000, 37.5000, 41.5000, 45.5000]])


# Model Related: 
This note only contains commonly used methods, and only listed common parameters adopted. For detailed usage which might be uncommon, a visit to API is still required. 

In [None]:
import torch.nn as nn # -- package
import torch.nn.functional as F # -- package

## Linear Layers

nn.Linear(in_d, out_d)(input) -> output
> The most fundamental fully-connected layers, represented by a linear matrix with shape $out\_ d \times in\_ d$
- input: has shape (..., in_d)
- output: has shape (..., out_d)

## Convolutional Layers: 
Fundamental vision tasks operations

nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, padding_mode=’zeros’, ...)(input) -> output
> Defines a 2D convolutional layer; <br>
<!-- Groups: groups input, each grouped input will have independent weights, having same output channel number, but input channel is equivalent to number of input channels in the group. Group must be divisible by input_channel to perform actions; https://iksinc.online/2020/05/10/groups-parameter-of-the-convolution-layer/  -->
"input": has shape $(N,C\_in, H, W)$ or $(C\_in, H, W)$; output has shape $(N, C\_out, H\_out, W\_out)$ or $(C\_out, H\_out, W\_out)$, where $H\_out, W\_out$ are defined as below: 


<center><img src="reference_img/convoutshape.png" width=500></center>

- in_channels: $C\_in$ of "input"
- out_channels: $C\_out$ of "output"
- stride: step size by which the kernel moves between two adjacent applications
- padding: extra space added to "input"'s boundary
- dilation: distance between each kernel's elements. 

> Refer to this visualization: https://ezyang.github.io/convolution-visualizer/

## Pooling Layers
Used in conjunction with convolutional layers to reduce (downsample) the spatial resolution for vision tasks. <br>
Interpolation (nn.functional.interpolate) is an alternative way of changing the resolution

nn.MaxPool2d(Kernel_size, stride=None,  padding=0, dilation=1, return_indices=False)<br> &emsp;(N, C, H_in, W_in)/(C, H_in, W_in) -> (N, C, H_out, W_out) / (C, H_out, W_out)
> Pool the resulting tensor with given configurations; <br>
&emsp;Kernels can be overlapping if stride is set less than kernel size
<br> &emsp;Return_indices=True: indices returned could be useful for MaxUnpool2d below

> Output configuration: 
<center><img src="reference_img/maxpoolout.jpg" width=500></center>

<br>

<!-- nn.MaxUnpool2d(kernel_size, stride, padding)(input_tensor, max_indices_for_input) (NCHW/CHW)
	a PARTIAL reverse of max pooling where non-max values are all set to zero. 
Max’s indices for each kernel: can be generated by MaxPool2d; If possible can perform a maxpool on masked array to acquire indices -->
<!-- 
<br> -->

nn.AvgPool2d(kernel, stride=None, padding=0, count_include_pad=True, divisor_override=None)<br> &emsp;(N, C, H_in, W_in)/(C, H_in, W_in) -> (N, C, H_out, W_out) / (C, H_out, W_out)
> Instead of extracting max, this pooling outputs average of values for each kernel. 
- count_include_pad: average calculation will also count zero paddings in this kernel, affects denominator. 
- divisor_override: will change divisor when calculating average (specify a number)


## Padding Layers: 

nn.ReflectionPad2d(padding: int/tuple_of_ints)(input) -> Tensor
> A demonstration of result is shown below. <br> Pay attention to the reflective axis!
<br> Original input is at the center of resulting padded tensor

- “padding”: must have size less than “min(H, W)” of input “tensor”!!!
- input: has shape: (NCHW)/(CHW)
<center><img src="reference_img/reflectpad.png" width=300></center>

<br>

nn.ReplicationPad2d(padding)(input) -> Tensor
> Replication repeats values CLOSEST to it. See demo below for a visualization
<center><img src="reference_img/replicatpad.png" width=300></center>

<br>

nn.ZeroPad2d(padding)(input) -> Tensor
> pads extra spaces using zero only

<br>

nn.ConstantPad2d(padding, value)(input) -> Tensor
> Similar to zero padding, but the value can also be other constants not zero. 


In [None]:
# demo of reflection padding and replication padding

# torch.arange already returns the float32 tensor we need, so the legacy
# torch.Tensor(...) constructor wrapper is unnecessary.
test_tensor = torch.arange(27, dtype=torch.float32).reshape(1, 3, 3, 3)  # NCHW
print(test_tensor[0][0])

# Apply each padding layer once and reuse the result for both the values and the shape.
# Reflection mirrors the input around its border rows/columns (the border itself is not repeated).
reflected = nn.ReflectionPad2d(2)(test_tensor)
print(reflected[0][0])
print(reflected.shape)

# Replication repeats the closest border value.
replicated = nn.ReplicationPad2d(2)(test_tensor)
print(replicated[0][0])
print(replicated.shape)

## Normalization Layers: 
Gradient exploded or unstable? Inconsistent units and scale? Ask Normalization for help!

<center><img src="reference_img/normdiag.png" width=800></center><center> <img src="reference_img/normform.png" width=300></center>

nn.BatchNorm2d(num_features, eps=1e-05, affine=True, track_running_stats=True)(input) -> output
> Normalize all inputs along 'batch dimension', applied 'channel-wise'. 
- num_features: “C” in above graph(don’t confuse with width/height of input data!!!)
- eps: epsilon when calculating normalization configurations (in denominator, to avoid denominator divide by zero error)
- affine: the parameter “gamma” and “beta” in normalization formula would be learnable
<!-- track_running_stats;  -->
- input: has shape [NCHW], 4D tensor
- output: has same shape as "input"

<br>

nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True)(input) -> output
> normalized_shape: a tuple of integers; length of tuple determines how many dimensions COUNTED FROM LAST would be normalized. <br>
&emsp; Mean and variance are computed jointly over those last dimensions, INDEPENDENTLY for every index of the remaining leading dimensions; i.e, "Tensor.mean((-2, -1))" for a 2-element normalized_shape. <br>
&emsp;&emsp; Note this normalization is not specific for 2D. 

> Example shown below shows how normalized_shape shall be given (usually just the shape of last few dimensions of input tensor) 
<center> <img src="reference_img/layernormshape.png" width=600></center>

<br>

nn.InstanceNorm2d(num_features, eps=1e-5, affine=False, <!-- track_running_stats=True -->...)(input) -> output
> input and output have the same shape. 

## Non-Linear Activations
Non-linearity makes neural network capable of approximating any functions, after combining with linear layers. 

> Below are all common elementwise activation functions, with output shape same as input. 
- nn.LeakyReLU()(input)
- nn.ReLU()(input)
- nn.Sigmoid()(input)<img style="float:right" src="reference_img/logsig.jpg" width=300>
- nn.LogSigmoid()(input)
- nn.Tanh()(input)


<br>

nn.Softmax(dim=None)(input) -> output
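>  Applies the softmax activation function along the assigned "dim" dimension (not in-place: a new tensor is returned). <br>
Note: the dimension to apply should always be given explicitly. 
- dim: the dimension along which the outputs sum to 1. If left as "None", PyTorch picks a dimension implicitly and emits a deprecation warning (it does NOT flatten the tensor). 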
- output: same shape as input. 
<center><img src="reference_img/softmax.jpg" width=200></center>

<br>

nn.Softmax2d()(NCHW) -> output
> Apply softmax on "C" dimension of 2D batched input. 

> Image below shows the effect: at each coordinate, softmax is taken across the values of all channels, so that channel_1 + channel_2 + channel_3 == 1, for each [x, y] coordinate in the image/2D tensor. 
<center><img src="reference_img/softmax2d.png" width=300></center>


## Recurrent Neural Network
The beginning of sequential learning

### Ordinary RNN: 


<img src="reference_img/rnn_hidden.png" width=300><br>
weights and biases are initialized along with creation of RNNCell. 

- “x”: input; 
- “h”: hidden state, i.e. the output of the previous time step; 

When number of layers is not large, using RNN provides computational efficiency, while the effect of gradient vanishing is not too significant. 


nn.RNNCell(input_size, hidden_size, bias=True, nonlinearity=’tanh’)(input_tensor, hidden_tensor) -> output
> Create an RNN cell based on equation described above. 
- input_size: the size of input_tensor; 
- hidden_size: the size of hidden_tensor as input; 
- bias: if false, no bias will be taken into account; 
- nonlinearity: if ‘relu’, will change activation function to relu instead of "tanh" as default activation function
>
- input_tensor: has shape (N, input_size)
- hidden_tensor: has shape(N, hidden_size)
>
- output: [h'] in formula, has shape (N, hidden_size)
An example code shown below gives idea of how previous output along with input shall be used. 

<br>

nn.RNN(input_size, hidden_size, num_layers, nonlinearity=’tanh’, bias=True, dropout=0, bidirectional=False)(input_tensor, previous_h_tensor) -> (Output_tensor, final_hidden_tensor)
> Constructs an RNN containing "num_layers" stacked recurrent layers (each behaving like an "RNNCell" unrolled over the sequence), and the connection method follows from description of RNN. 
- num_layers: how many recurrent layers will present in RNN
- dropout: a ratio between 0 and 1; will have a Dropout layer (see later descriptions) for each output with probability set as “dropout”
- bidirectional: allows sequence predictions to refer to future prediction results as well. 
>
- input_tensor: has shape (sequence_len, N, input_size); sequence_len is the sequence length, N is the batch size, input_size is # of features. 
- previous_h_tensor(OPTIONAL): in case a complex structure has multiple RNN each with different configuration, then previous RNN’s h_value could be fed as input here. 
	If not provided, will default as zero. 
	Has shape (D*num_layers, N, hidden-size); D is 2 if "bidirectional=True"
>
- Output: (Output_tensor, final_hidden_tensor)
	> Has shape [(sequence_len, N, hidden_size), (D*num_layers, N, hidden_size)]


In [6]:
# RNNCell demo: unroll a single cell over the sequence by hand
input_size, hidden_size = 13, 17
rnn = nn.RNNCell(input_size, hidden_size)
sequence_len = 5
N = 7
input = torch.randn(sequence_len, N, input_size)  # (sequence length, batch size, input feature size)
hidden = torch.randn(N, hidden_size)  # (batch size, hidden size)
output = []
for i in range(sequence_len):
    # the hidden state produced at one step is fed back in at the next step
    hidden = rnn(input[i], hidden)
    output.append(hidden)
print(output[-1].shape)  # (batch size, hidden size) of the final step


# nn.RNN demo: the module runs the whole sequence through all stacked layers in one call
num_layers = 11
rnn_s = nn.RNN(input_size, hidden_size, num_layers)
# initial hidden state: (D * num_layers, N, hidden_size), with D = 1 since the RNN is unidirectional
new_hidden = torch.randn(num_layers, N, hidden_size)
rnns_res = rnn_s(input, new_hidden)
sequence_output, final_hidden = rnns_res
print(sequence_output.shape, final_hidden.shape)

torch.Size([7, 17])
torch.Size([5, 7, 17]) torch.Size([11, 7, 17])


### LSTM

<center><img src="reference_img/lstmcircuit.png" width=600></center>

Formula notations: 
- h: hidden state; <img style="float:right" src="reference_img/lstmfor.png" width=300>
- c: cell state; 
- x: input; 
- ifgo: input/forget/cell/output gates; 
> circle with a dot is element-wise matrix product. 

Mechanism: when inputs and previous hidden states are given: 
1. first check forget gates (“f”) and see how much of the previous cell state shall be kept or ignored. 
2. Then “i” and “g” will be calculated, to respectively determine which values to update(by “i”), and how much (by “g”). 
3. Then “c” is updated based on forget gate and update gate’s contents, which gives current cell’s state. 
4. Finally “o” is computed, along with next hidden state. New hidden state will take current cell state “c” into account. 

In general, LSTM allows more manipulation on previous states, so that when analyzing long sequences, only important info would be kept; results in less noise towards understanding current state. Also this resolves “gradient vanishing problems”. 
However without attention-mechanism, when sequence is long enough LSTM could fail as well

To elaborate, the structure of each LSTM cells shown in diagram could be easily modified to have various extra functionings. 


nn.LSTMCell(input_size, hidden_size, bias=True)(input, (hidden, cell)) -> (late_hidden, new_cell)
- input: has size (N, input_size)
- hidden: has size (N, hidden_size)
- cell: has size (N, hidden_size): the initial cell’s state; 
    > If (hidden, cell) is not provided, will both default to zero tensor. 
>
- late_hidden: has size (N, hidden_size); tensor containing next hidden state
- new_cell: has size (N, hidden_size); tensor containing next cell state

<br>

nn.LSTM(input_size, hidden_size, num_layers, bias=True, dropout=0, bidirectional=False, <br> &emsp;&emsp;proj_size=0)(input, (h0, c0)) -> (output, (hn, cn))
- proj_size: if >0,  final output will be linearly projected to this dimension.
    > h0, output, hn will have "hidden_size" replaced as "proj_size". 
- input: has size (sequence_len, N, input_size)
- h0: has size (D*num_layers, N, hidden_size); D=2 if bidirectional
- c0: has size (D*num_layers, N, hidden_size)
>
- output: has size (sequence_len, N, D*hidden_size)
- hn: (D*num_layers, N, hidden_size)
- cn: has size (D*num_layers, N, hidden_size)


### GRU: 

<center><img src="reference_img/grudiag.png" width=400></center>

<img style="float:left" src="reference_img/gruform.png" width=300>

- r: reset gate
- z: update gate
- n: new gate

<br>

mechanism: 
1. first calculate update gate(z); this determines how much previous gate’s info shall be passed on; 
2. Then calculate reset gate(r); this determines how much info shall be forgotten; (in diagram reset gates and update gates are parallel)
3. Then new state is calculated;  it represents current cell’s status, containing current input, along with previous hidden states, with a reset ratio. 
4. Finally new hidden state is calculated based on previous hiddens and current cell status. 



nn.GRUCell(input_size, hidden_size, bias=True)(input, hidden) -> new_hidden
- input: has size (N, input_size)
- hidden: has size (N, hidden_size)
>
- new_hidden: has size (N, hidden_size); tensor containing next hidden state

<br>

nn.GRU(input_size, hidden_size, num_layers, bias=True, dropout=0, bidirectional=False)(input, <br>&emsp;&emsp;h_0) -> (output, h_n)
- input: (sequence_len, N, input_size); h_0: (D * num_layers, N, hidden_size); 
- output: (sequence_len, N, D * hidden_size); h_n: (D * num_layers, N, hidden_size)
    > "D=2" if "bidirectional=True" otherwise "D=1". 


### Comparison of GRU and LSTM

- Forgetting: 
    > GRU’s forget/reset gate only handles previous hidden states; 

    > But LSTM forgets both input and hidden state by a certain rate; 

- Updating: 
    > LSTM’s updating and forgetting is applied on COMBINED result of current input and previous hidden state; 
    
    > GRU deals with previous hidden state separately from current input. (see formula of how forget gate and reset gate are calculated for LSTM and GRU for better understanding)


## Attention Mechanism & Transformer
Advanced sequential learning

### Explanation of Attention Mechanism and "nn.MultiheadAttention"

Attention mechanism allows the model to "attend to/focus on" particular set of information in a long-sequence for making inferences or predictions, instead of allowing all known information to contribute to inference or prediction tasks, which can lead to inaccuracy or bias. 

nn.MultiheadAttention(embed_dim, num_heads, dropout=0.0, kdim=None, vdim=None)(query, key, value) -> (attn_output, attn_output_weights)
> Perform multi-head attention operation on given set of inputs. 
- embed_dim: query's feature dimension size; 
- kdim: key's feature dimension size; if "None", will be equal to "embed_dim". 
- vdim: value's feature dimension size; if "None", will be equal to "embed_dim". 
- num_heads: controls the number of heads for multi-head attention. 
>
- query: (output_sequence_len, N, embed_dim)
    > Represents the set of vectors containing acquired information as output. In a sequential setting, 'query' contains previously acquired information, and at each layer of sequential processing, query will be 'refined' with new information(represented by "key" and "value"). 
    >
    > e.g: in question-answering tasks(NLP), query can be the already existing answer(or Beginning_Of_Sequence, BOS signal), where "key" and "value" represents the question being asked. 
- key: (input_sequence_len, N, kdim)
    > Represents the new incoming message or already acquired message, used for making inferences on "query" acquired from previous sequential processing steps. 
    >
    > Keys will be processed differently than "value". For dot-product attention, "query" and "key" jointly determines the weight assigned to each "value" vector for making final weighted sum calculation. 
- value: (input_sequence_len, N, vdim)
    > Also represents the new incoming message or acquired message. 
    >
    > In dot-product attention, calculated weights are applied on "value" to acquire updated "query" results. 
>
- attn_output: (output_sequence_len, N, embed_dim)

In [4]:
# Code demo of attention mechanism, partially referred to personal private slot attention code. 
class dot_product_attention(nn.Module):
    """
    Scaled dot-product attention between freshly sampled queries and the given input.

    assumptions for using this module:
    the input is assumed to have shape [batch, num_input, D_i], where D_i ("input_shape") is the input feature dim;
    "num_querys" queries with D_s ("query_shape") features each are sampled from a Gaussian for every batch element.

    :param input_shape: feature dimension D_i of the input vectors (an integer)
    :param query_shape: feature dimension D_s of each sampled query (an integer)
    :param attention_dim: shared dimension that queries and keys are projected to before the dot product
    :param num_querys: number of queries sampled per batch element
    :param query_mu: Gaussian sampling mean of the queries
    :param query_sigma: Gaussian sampling sigma of the queries
    """
    def __init__(self, input_shape, query_shape, attention_dim,
                 num_querys, query_mu, query_sigma):

        super(dot_product_attention, self).__init__()
        self.input_shape = input_shape
        self.query_shape = query_shape
        self.attention_dim = attention_dim
        self.q = nn.Linear(query_shape, attention_dim) # attention_dim ensures matrix product shape consistency;
        self.k = nn.Linear(input_shape, attention_dim)
        self.v = nn.Linear(input_shape, input_shape) # usually the shape is (input_shape, attention)

        self.num_querys = num_querys
        self.query_mu = query_mu
        self.query_sigma = query_sigma


    def forward(self, input):
        """
        Sample a new set of queries and return, for each query, the attention-weighted sum of the values.

        :param input: size: [batch, num_input, D], where D is feature dimension
        :return: tensor of size [batch, num_querys, D]
        """
        # first generate querys; they are fed to self.q, so they must live on the same device and use
        # the same dtype as that layer's weights (otherwise the module breaks after .to("cuda") / .double())
        querys = self.create_querys(self.query_mu, self.query_sigma,
                                    self.num_querys, self.query_shape, input.shape[0],
                                    device=self.q.weight.device, dtype=self.q.weight.dtype)
        # perform attention
        query = self.q(querys) # shape [..., Q, att_dim]
        key = self.k(input) # shape [..., K, att_dim]
        value = self.v(input) # shape [..., K, input_dim]
        # key becomes [..., att_dim, K] for the matrix product; scaling by att_dim ** -0.5 is the "scaled" part
        # alternative implementation of the product: torch.einsum("bqa, bka -> bqk", query, key)
        dot_product = query.matmul(key.swapaxes(-1, -2)) * (self.attention_dim ** (-0.5))

        # "dot_product" has shape [..., Q, K]; dim=-1 normalizes over KEYS,
        # so for each query the weights of all keys sum to 1
        weight = torch.softmax(dot_product, dim=-1)
        query_prediction = weight.matmul(value) # [..., Q, input_dim]
        return query_prediction


    # create a demo query
    def create_querys(self, mu, sigma, k, d_querys, batch_size, device=None, dtype=None):
        """
        Sample querys from a Gaussian: mu + sigma * standard normal noise.

        :param mu: Gaussian sampling mean
        :param sigma: Gaussian sampling sigma
        :param k: number of querys
        :param d_querys: slot dimension
        :param batch_size: number of independent query sets to sample
        :param device: device of the returned tensor; None keeps torch's default device
        :param dtype: dtype of the returned tensor; None keeps torch's default dtype
        :return: tensor of size [batch_size, k, d_querys]
        """
        noise = torch.randn(batch_size, k, d_querys, device=device, dtype=dtype)
        return mu + sigma * noise # this will return sampled querys

# 17-dim inputs, 13-dim queries, an 11-dim attention space, and 5 queries sampled with mean 0 / sigma 1.
dot_prod_module = dot_product_attention(
    input_shape=17, query_shape=13, attention_dim=11,
    num_querys=5, query_mu=0, query_sigma=1.0,
)
# A batch of 3 examples, each holding 7 input vectors with 17 features.
input = torch.arange(3 * 7 * 17, dtype=torch.float32).reshape(3, 7, 17)
# batch of 3, predict 5 queries, each query vector has feature dimension 17:
# every query receives a weighted sum calculated over the 7 value vectors.
print(dot_prod_module(input).shape)

torch.Size([3, 5, 17])


### Transformer

https://towardsdatascience.com/illustrated-guide-to-transformers-step-by-step-explanation-f74876522bc0

<center><img src="reference_img/transformer.png" width=700></center>

<center><img src="reference_img/transformer_proced.png" width=400></center>
<center><img src="reference_img/transenc.png" width=400><img src="reference_img/transdec.png" width=300></center>

<center>Each encoder’s output will become part of each decoder’s input. </center>

Transformer decoder’s input: 
- Two parts: one is the whole encoder sequence’s output, the other is the previously generated output of whole transformer model (at beginning of translation, a “start” signal is given instead)
- transformer uses previously generated outputs and encoder outputs to predict what to generate next one by one, and the new word generated also become part of decoder’s next input. 
<br><br>
<img style="float:left" src="reference_img/transformer_mask.png" width=200>Functionality of masking in decoder: <br>
As shown in left image, when predicting next word, the attention score required should only rely on previously generated output’s score. Thus the score for word “fine” is masked out when word “am” is predicted. 


nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", layer_norm_eps=1e-05)
- d_model: expected number of input features for encoder and decoder (the embedding dimension). 
- nhead: number of heads in each multi-head attention layer; d_model must be divisible by nhead. 
- dim_feedforward: the dimension of feedforward neural network inside each encoder and decoder. 
>
><br>
&emsp;(src, tgt, *src_mask, *tgt_mask, *memory_mask) -> output
- src: the sequence of encoder input, has shape (encoder_sequence_length, N, embedding_dim)
- tgt: the sequence of decoder extra input, has shape (decoder_sequence_length, N, embedding_dim); 
	> For natural language processing tasks, the extra input usually starts with a "start" signal as the first element, and all remaining positions are zero vectors waiting to be filled in; 
- src_mask: has shape (encoder_sequence_length, encoder_sequence_length): for each source position (row), marks which other source positions (columns) must be ignored when attention is computed. 
- tgt_mask: has shape (decoder_sequence_length, decoder_sequence_length)
- memory_mask: has shape (DEcoder_sequence_length, ENcoder_sequence_length)
	> Determines, for each decoder position, which encoder outputs are masked out during decoding. <br><br>Dimension 0 indexes the decoder position being evaluated, and dimension 1 indexes the encoder positions to mask when evaluating that position. 

- output: has shape (decoder_sequence_length, N, embedding_dim) if “batch_first=False”

<br>

nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", layer_norm_eps=1e-5)
<br> &emsp;(src, src_mask=None)

> create an instance of transformer encoder layer as illustrated in figure above. 

<br>

nn.TransformerEncoder(encoder_layer, num_layers, norm=None)
<br> &emsp;(src, mask=None)

> Creates a stack of "num_layers" transformer encoder layers. <br>
		All stacked layers are copies of the single "encoder_layer" passed in: same architecture, but each copy has its own weights. 
- encoder_layer: an instance of "nn.TransformerEncoderLayer()". 

<br>

nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", layer_norm_eps=1e-5)
<br> &emsp;(tgt, memory, tgt_mask=None, memory_mask=None)

> Create a decoder layer for transformer as described in previous diagrams
- memory: the result from FINAL encoder layer ONLY; 
- tgt: realizing the decoder layer might not be the first layer; thus apart from beginning of “tgt sequence”, other elements in the “tgt sequence” might also be non-zero; 

<br>

nn.TransformerDecoder(decoder_layer, num_layers, norm=None)
<br> &emsp;(tgt, memory, tgt_mask=None, memory_mask=None)

> Creates a stack of "num_layers" transformer decoder layers. 
- decoder_layer: an instance of "nn.TransformerDecoderLayer()"


## Dropout & Embedding

### Dropout
Dropout layers randomly zero out activations during training. This discourages units from co-adapting (relying on each other), which helps prevent the model from overfitting. 

nn.Dropout2d(p=0.5, inplace=False)(input) -> output
> Masks entire channels of an "input" with shape (N, C, H, W) or (C, H, W). 
<br>&emsp;When channel c of sample n is chosen, input[n, c, ...] will be filled with 0. 

- p: the probability of masking the given channel, following a Bernoulli distribution
- input: has shape (NCHW)/(CHW) depending on input shape
- output: has same shape as input
> Image below shows how channel-wise dropout is conducted. 
<center><img src="reference_img/dropout2dcha.png" width=200></center>

<br>

nn.Dropout(p=0.5, inplace=False)(input) -> output
> mask the input entries randomly by a probability of “p”. 
<br>&emsp;compared with Dropout2d, the masking is irregular, and not restricted to channel

- input: a tensor with no restrictions on shape
- output: has same shape as input

<br>

nn.AlphaDropout(p=0.5, inplace=False)(input) -> output
> randomly mask a value with probability “p”, and MAINTAIN mean and std of data. (specific details require reading papers)
- "input" and "output" must have the same shape.


### Embedding: 

nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)(input_tensor) -> output
- num_embeddings: the number of elements to be indexed by embedding; 
- embedding_dim: the number of features of each embedded vector; 
- padding_idx: the index reserved for padding (used to pad short inputs so all inputs have a consistent length); the embedding vector at this index is not updated during training. 
- max_norm: a restriction on embedding vector’s norm, to ensure consistency of vector scale. 
- norm_type: the p-scale (specific restriction on the norm of embedding vector)
- scale_grad_by_freq: if True, will scale gradients (in backward pass) in INVERSE of word’s frequency
><br>
- input_tensor: can have any shape; 
- output: has shape (*input_tensor_shape, embedding_dim) -> one embedded vector for every element of the input tensor


Tips on using Embedding: 
Refer to the two code cells below; 
- realizing that embedding tries to embed EACH ELEMENT from the input tensor; thus, for example, taking in a number of words/strings, the first step is to iteratively assign each string a number, then input the numbers into embedding function to acquire a translated result for training models. 
- the number can be acquired using a dictionary, mapping each string to a number; the efficient way to create such dictionary is: <br>
&emsp;using “zip()” function in python; the second code cell below uses the function to combine two length-consistent lists into a pair-wise dictionary. <br>
&emsp;When “dict()”ing the zipped result, duplicated words are handled automatically: each word appears only once as a key, keeping the last number paired with it. 


In [None]:
# A lookup table with 10 rows (one per index), each row being a 4-dimensional vector.
embed = nn.Embedding(num_embeddings=10, embedding_dim=4)

# Duplicates handling: index 1 appears at positions 0 and 2, so the 1st and 3rd output rows are the same.
a = torch.tensor([1, 0] * 2)
print(embed(a))

# The embedding is fixed per nn.Embedding module, so the first 4 rows repeat the previous output.
a = torch.tensor([1, 0] * 2 + [2, 3, 4])
print(embed(a))

In [None]:
# Demo of zip() function, and potentially tokenizing an input sentence, useful for NLP tasks
test_string = "this is just a test sentence that makes totally no sense, yes it is"
all_words = test_string.split()
indexing = torch.arange(len(all_words))
# zip() pairs every word with its position; .tolist() turns the positions into plain Python ints,
# so the dictionary maps word -> int instead of word -> 0-dim tensor.
zip_res = dict(zip(all_words, indexing.tolist())) # keys of the dictionary are "all_words", values are "indexing"
# the repeated word "is" occurs only once in the resulting dict, showing duplicates handling:
# dict() keeps the LAST index paired with a duplicated key (13 here, not 1).
print(zip_res)
# convert the test string into tensor representation with a dictionary lookup per word.
res = [zip_res[word] for word in all_words]
print(torch.tensor(res)) # the resulting format can be applied with nn.Embedding, following previous chunk of code


## Loss function: 
Will only list commonly used ones, and more choices can be found in pytorch API. <br>
Custom loss functions can also be defined, so long as they take "input" and "target" with the same shape conventions as the built-in loss functions. 

nn.L1Loss(reduction='mean')(input, target) -> output
> Perform L1Loss on "input" and "target". 
<center><img src="reference_img/l1loss.png" width=300></center>

- reduction: if 'mean', will take the average of all losses; if 'sum', will sum-up all the acquired losses; if 'none' (the string, not Python's None), no further operations will be performed, and "output" has same shape as "input". 
><br>
- input, target: must have same shape
- output: a scalar or a tensor with same shape as "input". 
    > usually preferred to be a scalar for performing "output.backward()", weight update operation

<br>

nn.MSELoss(reduction='mean')(input, target) -> output
> Perform mean squared error loss calculation on "input" and "target".
<center><img src="reference_img/mse.png" width=400></center>

<br>

nn.CrossEntropyLoss(reduction='mean')(input, target) -> output
> Perform cross entropy loss on input and target
<center><img src="reference_img/crossentropy.png" width=500></center>

- input: has shape (N, C) or (N, C, d1, ..., dk); N: batch size; C: number of classes; d1, ... dk: dimension of inputs; 
- target: has shape (N) or (N, d1, ..., dk)
- output: either a scalar or: has shape (N) or (N, d1, ..., dk). 

# Operation Related: 

## Datasets & Dataloaders

"import torch.utils.data as data_p"

### Datasets

In [None]:
# Below demonstrates how to construct a dataset; it can be as simple as just overriding 
# 3 methods: __init__, __len__, __getitem__, plus any other helper functions. 
# __getitem__ MUST be overridden (the base class raises NotImplementedError); __len__ is needed by DataLoader's default samplers. 

import torch.utils.data as data_p

class SampleDataset(data_p.Dataset): # every custom dataset subclasses torch.utils.data.Dataset
    """Dataset that pairs each stored data element with the target stored at the same index."""

    def __init__(self, data, target):
        """
        Store the data and the targets so they can be looked up by index later.

        data: the "sequential data"; each single element has dimension [sequence_length, feature_size].
        target: the targets matching "data" index by index; has dimension [sequence_length]
        """
        super().__init__()
        self.target = target
        self.sequential_data = data

    def __len__(self):
        """
        Must be provided by the subclass, since the superclass doesn't implement it.
        Returns how many data-target pairs (or data pieces, for unsupervised learning) this dataset stores.
        """
        num_samples = len(self.sequential_data)
        return num_samples

    def __getitem__(self, index):
        """
        Must be provided by the subclass, since the superclass doesn't implement it.
        Returns the (data, target) pair (or just data, for unsupervised learning) stored at "index".

        index: the INDEX of the data piece to extract
        """
        sample = self.sequential_data[index]
        label = self.target[index]
        return sample, label

Elaborations: 
1. Do not underestimate the flexibility of dataset initialization methods. To read data, there are several ways: 
    - Directly load as tensor and feed as input parameters (above code)
    - Provide "PATH" to the data folder, and pass in "PATH" as parameter, instead of "data" or "target". <br> Then inside "\_\_init__", use "read()" method and further processing to convert data into appropriate format. 
2. Initialization of Dataset can also provide additional functionalities. 
    - All vision datasets allows inputting a sequence of preprocessing methods (see "torchvision.transforms" for more details) for images; 
3. "\_\_getitem__" method can actually perform preprocessing on data pieces before returning for training. 
    - e.g: vision data pieces could be applied with "torchvision.transforms" before returning. 
4. "\_\_getitem__" doesn't have restriction on the format of returned data, so long as follow-up code extracts returned data pieces appropriately. 

In general, explore how you could load data in various ways, and play with those data pieces in whichever way you like before returning them. 

### Dataloaders

In [None]:
# Demonstration of dataloader method
# do not run the code below
import math

# "data" and "target" are placeholders that are not defined in this notebook (hence "do not run").
dataset = SampleDataset(data, target)

# train-test split
train_test_factor = 0.8 # 80% training, 20% testing; 
train_size = math.ceil(len(dataset) * train_test_factor)
split_dataset = data_p.random_split(dataset, [train_size, len(dataset) - train_size]) # explanation of this method is in markdown part below

train_dataloader = data_p.DataLoader(split_dataset[0], batch_size=50) # This is how dataloaders are initialized, in simplest ways. 
valid_dataloader = data_p.DataLoader(split_dataset[1], batch_size=1)

# the following lines of code shows how to extract data from DataLoaders containing "SampleDataset" format data. 
# pay attention to "index", and realize "sample" is the batched return value of "__getitem__" from "SampleDataset". 
# The batch is unpacked into NEW names so that "data" / "target" (used to build "dataset" above) are not
# overwritten by the last batch, which would make re-running this cell build the dataset from a single batch.
for index, sample in enumerate(train_dataloader):
    batch_data, batch_target = sample[0], sample[1]

data_p.DataLoader(dataset, batch_size=1, shuffle=None, sampler=None, num_workers=0, ...)
> Initializes a "DataLoader" class for given "dataset". 
- dataset: subclasses of class "data_p.Dataset"
- shuffle: if 'True', data will be reshuffled at every __epoch__. Otherwise (the default) data is not shuffled at all and is served in dataset order. 
- sampler: a subclass of "data_p.Sampler", will explain later. 
- num_workers: number of subprocesses used for data loading; '0' means data is loaded in the main process. With '>0', batches are prepared in parallel worker processes, which is useful when data loading/preprocessing is the bottleneck. 

<br>

data_p.random_split(dataset, lengths) -> List[dataset]
> Randomly split the given dataset into a list of multiple __non-overlapping__ subsets of "data_p.Dataset" object. 
- dataset: a subclass of "data_p.Dataset"
- lengths: 
    1. if a list of integers summing up to "len(dataset)", will split "dataset" into "len(lengths)" many datasets, each with length in "lengths". 
    2. if a list of non-negative fractions summing up to "1", will split "dataset" into proportional segments. 

<br>

data_p.Sampler(data_source, ...)
> Initialize a data sampler. It's the abstract class of all Samplers. 
- "\_\_iter__": the method all Sampler subclasses must override, to provide a way of iterating over the indices of "data_source"
- "\_\_len__": optional; when provided, it returns the length of the returned iterator (how many indices the sampler yields). 

## Training & Validating
This section will mainly consist of code and explanation comments. Additional references will be listed at end of code. 

### Hyperparameters Explanation

In [None]:
import math

NUM_BATCHES = 200 # NOTE(review): named as a batch count but originally annotated "batch_size" - confirm which quantity is meant
EPOCHS = 100 # number of total training epochs;
TRAIN_TEST_SPLIT = 0.9 # ratio for dividing the dataset into training and testing sets
EARLY_STOP_THRESHOLD = 10 # number of consecutive epochs without improvement tolerated before early stopping

lr_milestones = [17, 40, 75] # epochs at which the learning rate decays
lr_decay_gamma = 0.5 # learning rate decay ratio: the factor the learning rate is multiplied by at each milestone
loss_fn = torch.nn.MSELoss() # loss function, here is mean squared error

device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available, otherwise fall back to the CPU

Key words explanation: 
1. Epoches: determines how many total iterations over entire training dataset is required. 
    - Each epoch iterates through all training data batch by batch; for every batch, the loss is calculated and backpropagated, and the weights are updated. 
2. Early Stopping: a technique for avoid overfitting. 
    - If the validation loss hasn't decreased for __EARLY_STOP_THRESHOLD__ consecutive epochs, training stops and the model weights with the lowest validation loss are adopted as the final weights of the trained model. 
3. Learning rate decay: during training, as loss becomes lower and lower, learning rate also need to decrease for more slight improvements of weights, and to avoid overshooting local minimum. See image below for why learning rate decay is necessary: as training loss becomes closer and closer to a minimum, the required adjustments to weights should also be smaller and smaller.  
    - lr_milestones: determines "epoches" where learning rate should decay. 
    - lr_decay_gamma: new learning rate will change at lr_milestones, by multiplying with "lr_decay_gamma". 
<center><img src="reference_img/lr_decay.jpg" width=400></center>


### Training

#### Training procedure

In [None]:
def train(model, train_dataloader, valid_dataloader, device, save_path="../model/model_state.pt"):
    """
    Train "model" with Adam, milestone-based learning rate decay and early stopping, then save the best weights.

    Reads the notebook-level settings EPOCHS, EARLY_STOP_THRESHOLD, lr_milestones, lr_decay_gamma and loss_fn,
    and calls validate() once per epoch.

    model: the nn.Module to train; it is moved to "device".
    train_dataloader: iterable of batches, where sample[0] is the model input and sample[1] is the target.
    valid_dataloader: passed on unchanged to validate().
    device: the device the model and every batch are moved to.
    save_path: file receiving the state dict with the lowest validation loss; defaults to the
               previously hard-coded location.
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # set up optimizer methods. Here Adam method is used

    # learning rate scheduler:
    lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                lr_milestones, gamma=lr_decay_gamma)

    def snapshot_weights():
        # state_dict() returns references to the live parameter tensors, which keep changing as training
        # continues; cloning is what makes this a real snapshot of the current weights.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    # early stopping:
    best_state = snapshot_weights()
    lowest_validation_loss = math.inf
    stop_count = 0 # for checking with EARLY_STOP_THRESHOLD

    for epoch in range(EPOCHS):
        # validate() switches the model to eval mode, so train mode must be restored at every epoch.
        model.train()
        total_loss = 0.0
        num_batches = 0

        for sample in train_dataloader:
            device_sample, device_target = sample[0].to(device), sample[1].to(device) # feed data and target to device

            # reset gradients for EVERY batch; otherwise gradients of earlier batches accumulate into this update.
            optimizer.zero_grad()
            result = model(device_sample) # acquire model result
            loss = loss_fn(result, device_target) # acquire loss
            loss.backward() # weight update is done batch-wise;
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
        lr_schedule.step() # check learning rate and decide to decrease learning rate or not
        # average over the batches actually seen; max() avoids dividing by zero for an empty dataloader.
        average_loss = total_loss / max(num_batches, 1)
        print("epoch {}: average training loss is ({})".format(epoch, average_loss))

        # validation & early stopping; float() accepts both a plain number and a one-element tensor
        validation_loss = float(validate(model, valid_dataloader, device))
        model_saved = False
        if validation_loss < lowest_validation_loss:
            lowest_validation_loss = validation_loss
            best_state = snapshot_weights()
            stop_count = 0
            model_saved = True
        else:
            stop_count += 1
            if stop_count >= EARLY_STOP_THRESHOLD:
                print("reached early stopping threshold, training stopped")
                break
        print("early_stopping_watcher: stop count: {}; model saved: {}\n".format(stop_count, model_saved))
    # save model
    torch.save(best_state, save_path) # save the weights with the lowest validation loss

Summarized procedure: 
1. check device setting for "model" and "data". 
2. Setting up optimization method and learning rate scheduling. 
3. In epoch "for-loop": iterate over batches of datasets, calculate loss and perform gradient updates FOR EACH batch!!!
4. Validate model using validation datasets
5. Update learning rate, collect total loss, determine early stopping. 

#### Optimizers: 
For a complete reference of each optimizer, visit: <br>
https://pytorch.org/docs/stable/optim.html
<br><br>
"import torch.optim as optim"

optim.Optimizer()
> Abstract class inherited by all below optimizers

<br>

optim.Adam(params, lr=0.001, eps=1e-08, weight_decay=0, ...)
> Initializes adam optimizer
- params: usually "model.parameters()", the initial/already updated weights of the model. 
- lr: learning rate
- eps: numeric stable terms for denominators
- weight_decay: L2 penalty, for weight regularizer, a method preventing overfitting

<br>

optim.SGD(params, lr, momentum=0, dampening=0, weight_decay=0)
> Initializes optimizer adopting stochastic gradient descent method, potentially with momentum. 

> Momentum: the formula below shows its effect: previous step's gradient also contributes to current step's gradient, to prevent the situation shown in 3rd image below. 

<center><img src="reference_img/sgdmomentum.png" width=300>  <img src="reference_img/sgdmomevec.png" width=500>  <img src="reference_img/sgdmomeloss.png" width=400></center>

- momentum: "beta" in above formula
- dampening: dampens the contribution of the current gradient to the momentum buffer: the gradient is scaled by "(1 - dampening)" before being added. (a ratio between 0 and 1)

<br>

optim.Adadelta/Adagrad/AdamW/SparseAdam/Adamax/ASGD/LBFGS/NAdam/RAdam/RMSprop/Rprop

#### Learning rate scheduler:

optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=-1, verbose=False)
> update learning rate by multiplying previous learning rate by a given function "lr_lambda". 
- optimizer: a subclass of optim.Optimizer()
- lr_lambda: a __function__ computing a multiplicative factor for current epoch. 
    > input of function: current epoch(an integer)
- last_epoch: the index of __previous__ epoch; "-1" indicates training hasn't started yet. 
- verbose: if "True", prints out learning rate update information for each update. 

<br>

optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda, last_epoch=-1, verbose=False)
> Quite similar to LambdaLR. 

<br>

optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1, verbose=False)
> decays learning rate at given "step_size" periods by multiplying with "gamma". 
- step_size: an integer, representing periodic benchmark; when reached, will update current learning rate by multiplying with "gamma". 

<br>

optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1, verbose=False)
> compared with "StepLR", "milestones" can be a list of INCREASING integers, and updates of learning rate doesn't need to be periodic. 

<br>

optim.lr_scheduler.LinearLR(optimizer, start_factor=0.3333333333333333, end_factor=1.0, total_iters=5, last_epoch=-1, verbose=False)
> Multiplies the base learning rate by a factor that changes __linearly__ from "start_factor" to "end_factor" over the first "total_iters" epochs; afterwards the factor stays at "end_factor".  

### Validation

In [None]:
NUM_OUTPUT = 6 # number of output positions for which in-threshold predictions are counted
VALIDATION_THRESHOLD = 0.25 # a prediction is "correct" when it lies within +/- this distance of the actual value

def validate(model, valid_dataloader, device):
    """
    Evaluate "model" on the validation data and return the average validation loss.

    if the predicted price is within a certain range, will treat the prediction as correct.
    threshold is the hyperparameter VALIDATION_THRESHOLD set above.

    The model is switched to eval mode and is NOT switched back; callers that continue training must
    call model.train() again.

    model: the nn.Module to evaluate.
    valid_dataloader: iterable of batches, where sample[0] is the model input and sample[1] is the target.
        NOTE(review): the model output is compared against the FIRST target of each batch only, which
        presumably expects batch_size=1 and a model returning one un-batched vector - confirm against the model.
    device: the device every batch is moved to.
    return: the summed per-batch loss divided by the number of batches (same type as loss_fn's output).
    raises ValueError: if "valid_dataloader" yields no batches.
    """
    model.eval()
    correct_prediction = [0] * NUM_OUTPUT # all updates for correct predictions are used to calculate accuracy.
    total_predictions = 0
    total_loss = 0
    num_batches = 0

    with torch.no_grad(): # no gradients are needed for validation, which saves memory.
        for sample in valid_dataloader:

            # chunk below acquires the result of prediction and target
            device_sample, device_target = sample[0].to(device), sample[1].to(device)
            result = model(device_sample)
            prediction = result
            actual = device_target[0]
            total_predictions += 1

            # chunk below is used for updating data for calculating validation accuracy
            # for this particular model and task, a prediction is considered "correct" if the predicted result
            # falls within +/- VALIDATION_THRESHOLD of the actual value.
            for period in range(len(prediction)):
                actual_low = actual[period] - VALIDATION_THRESHOLD
                actual_high = actual[period] + VALIDATION_THRESHOLD
                if actual_low <= prediction[period] <= actual_high:
                    correct_prediction[period] += 1

            # chunk below updates loss
            loss = loss_fn(prediction, actual)
            total_loss += loss
            num_batches += 1

    if num_batches == 0:
        raise ValueError("valid_dataloader yielded no batches; cannot compute a validation loss")

    # validation accuracy per output position: in-threshold predictions over total predictions
    accuracy = [round(count / total_predictions, 2) for count in correct_prediction]
    # divide by the number of batches, NOT by the last enumerate index (which is one too small and is
    # zero when there is exactly one batch).
    average_loss = total_loss / num_batches
    print("average validation loss: " + str(average_loss.item()))
    print("fraction of in-threshold predictions: " + str(accuracy))
    print("number of total predictions: " + str(total_predictions))

    # the average validation loss is the score used by the caller (e.g. for early stopping)
    return average_loss

Summary of validation step: 
1. "with torch.no_grad()": reduce GPU usage, as gradient update is not required. 
2. no longer updates gradients, so no optimizers. 
3. validation loss's calculation method is the same as training loss calculation. 
4. calculating validation accuracy is actually optional. Usually accuracy is calculated as the number of correct predictions over total predictions. <br> Depending on the tasks, there could be several ways of evaluating whether one prediction is correct or not: 
    - Regression (the code above): a threshold allowing predictions to be deviating around a reasonable range. 
    - Classification: the likelihood of the correct class is the highest among all other classes' predictions. (take "argmax")
    - Other custom methods to determine whether a prediction is correct or not. 