In [1]:
import torch

## Parameter Access

In [3]:
# Instantiating model for sake of parameters access
model = torch.nn.Sequential(torch.nn.Linear(20, 2), torch.nn.ReLU(), torch.nn.Linear(2,1))

In [5]:
model[0], model[1] # Accessing first layer

(Linear(in_features=20, out_features=2, bias=True), ReLU())

In [7]:
model[0].state_dict() # State dict returns layer weights and biases

OrderedDict([('weight',
              tensor([[-0.0157,  0.0521, -0.0347,  0.1497, -0.0891, -0.1076, -0.1529,  0.1386,
                        0.0511, -0.0278, -0.1195,  0.0907, -0.1497,  0.1675,  0.0779, -0.0553,
                        0.0068, -0.0675, -0.1974,  0.2047],
                      [-0.0798,  0.1234, -0.0981, -0.0773, -0.1976,  0.0849,  0.1805, -0.0683,
                       -0.1438,  0.1512,  0.1640, -0.2036,  0.1930, -0.1198, -0.0527, -0.1737,
                       -0.0323, -0.1602,  0.1441,  0.1902]])),
             ('bias', tensor([-0.1828, -0.0875]))])

In [8]:
# All model parameters by default require gradient
model[0].weight # Returns a Parameter object: parameter object stores state of parameter (information for gradients, etc.)

Parameter containing:
tensor([[-0.0157,  0.0521, -0.0347,  0.1497, -0.0891, -0.1076, -0.1529,  0.1386,
          0.0511, -0.0278, -0.1195,  0.0907, -0.1497,  0.1675,  0.0779, -0.0553,
          0.0068, -0.0675, -0.1974,  0.2047],
        [-0.0798,  0.1234, -0.0981, -0.0773, -0.1976,  0.0849,  0.1805, -0.0683,
         -0.1438,  0.1512,  0.1640, -0.2036,  0.1930, -0.1198, -0.0527, -0.1737,
         -0.0323, -0.1602,  0.1441,  0.1902]], requires_grad=True)

In [9]:
model[0].weight.data # Parameter data attribute store tensor itself

tensor([[-0.0157,  0.0521, -0.0347,  0.1497, -0.0891, -0.1076, -0.1529,  0.1386,
          0.0511, -0.0278, -0.1195,  0.0907, -0.1497,  0.1675,  0.0779, -0.0553,
          0.0068, -0.0675, -0.1974,  0.2047],
        [-0.0798,  0.1234, -0.0981, -0.0773, -0.1976,  0.0849,  0.1805, -0.0683,
         -0.1438,  0.1512,  0.1640, -0.2036,  0.1930, -0.1198, -0.0527, -0.1737,
         -0.0323, -0.1602,  0.1441,  0.1902]])

In [11]:
dir(model[0].weight) # Massive object consisting of a lot of different data operations
# Make sure to name layers something which is integer, as the layer keys have an attempted concat with the model repr and integer names will result in an error

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rfloordiv__',
 '__rmul__',
 '__rpow__',
 '__rshi

In [22]:
# Models are hierarchially nested, as shown in example below
def block1():
    return torch.nn.Sequential(torch.nn.Linear(5,2), torch.nn.ReLU())

def block2(input_net: torch.nn.Sequential):
    net = torch.nn.Sequential()
    for i in range(4):
        # Can add any module or block to a sequential NN (for a custom Neural Network, will have to add the modification to the execution flow ourselves)
        # Adding named module to neural net
        net.add_module(f'nested_block {i}', input_net)
    
    # Abstraction: instead of separate layer functionality we treat this identically to adding any Neural Block
    net.add_module('final layer', torch.nn.Linear(2,1)) # First arg is a module as a modules added to the ._modules dict must be named
    return net
 
nested_model = torch.nn.Sequential(block2(block1())) 
nested_model.add_module('model_output', torch.nn.Softmax()) # Default name in modules dict is index in str form, but can add custom name

In [26]:
nested_model # Model is hierarchial: executed sequentially, yet block structure maintained in parameter access

Sequential(
  (0): Sequential(
    (nested_block 0): Sequential(
      (0): Linear(in_features=5, out_features=2, bias=True)
      (1): ReLU()
    )
    (nested_block 1): Sequential(
      (0): Linear(in_features=5, out_features=2, bias=True)
      (1): ReLU()
    )
    (nested_block 2): Sequential(
      (0): Linear(in_features=5, out_features=2, bias=True)
      (1): ReLU()
    )
    (nested_block 3): Sequential(
      (0): Linear(in_features=5, out_features=2, bias=True)
      (1): ReLU()
    )
    (final layer): Linear(in_features=2, out_features=1, bias=True)
  )
  (model_output): Softmax(dim=None)
)

In [33]:
nested_model[0][1][0] # Can obtain a specific layer by subindexing hierarchially
# First sequential NN, second block, first layer = Linear

Linear(in_features=5, out_features=2, bias=True)

In [36]:
nested_model[1] # Final layer of outermost model is 1 and can be accessed as such

Softmax(dim=None)

## Initialization

Importing NN from previous notebook

In [108]:
# The use of torch.nn.Module is to define arbitrary architectures with control flow, one such attempted below:
class NonSequentialNN(torch.nn.Module):
    """A NonSequential NN"""
    def __init__(self, *dims):
        """Instantiates all layers with given dimensions"""
        super().__init__() # Call to parent constructor
        self.layers = [torch.nn.Linear, torch.nn.ReLU, torch.nn.Linear, torch.nn.ReLU, torch.nn.Dropout, torch.nn.Linear, torch.nn.ReLU]
        k = -1 # Separate indexer to keep track of current layer
        for i in range(len(self.layers) - 1):
            k += 1
            # Activation and Dropout need not be initialized with dimensions
            if self.layers[i] != torch.nn.Dropout and self.layers[i] != torch.nn.ReLU:
                self._modules[str(i)] = self.layers[i](dims[k], dims[k+1])
            else:
                # If not a layer then functional assignment
                self._modules[str(i)] = self.layers[i]()
                k-=1
            
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Forward propagation with control flow"""
        for i in range(len(self._modules)):
            X = self._modules[str(i)](X)
        return X
    
def build_nonseq_nn(lr:float):
    model = NonSequentialNN(7, 5, 3, 1)
    model.apply(initialize_parameters) # Initializing parameters for all Linear layers
    loss = torch.nn.MSELoss()
    trainer = torch.optim.AdamW(model.parameters(), lr)
    return model, trainer, loss

def train_nonseq_nn(X: torch.Tensor, labels: torch.Tensor, lr:float, epochs:int):
    model, trainer, loss = build_nonseq_nn(lr)
    for epoch in range(epochs):
        # Initializing graident to 0
        trainer.zero_grad()
        cost = loss(model(X), labels)
        if epoch % 100 == 0:
            print("epoch: ", epoch, ", cost: ", cost)
        cost.backward()
        # Stepping along gradient and updating weights
        trainer.step()
    return model

def initialize_parameters(layer: torch.nn.Module):
    """
    Applicable layer level function that can be applied by a NN to initialize each Linear Layer with Xavier weights 
    + constant bias
    """
    # Initializing all torch.nn.Linear layers with xavier variance maintaining weights + uniform constants
    if isinstance(layer, torch.nn.Linear):
        # Initializing from distribution used to maintain variance so as to avoid exploding and vanishing gradients
        torch.nn.init.xavier_uniform_(layer.weight)
        torch.nn.init.constant(layer.bias, 0.2)
            

In [113]:
X = torch.rand(120, 7) * 12 # Values up to about 20
# Linear fn with Gaussian noise
labels = X @ torch.Tensor([i * 0.1 for i in range(7)]) + 15  + torch.normal(0, 0.3, size = (1, 120))
labels

tensor([[24.7618, 24.5149, 23.3156, 28.6803, 28.5994, 27.4192, 34.0341, 21.8394,
         27.3328, 28.6167, 29.5413, 30.7348, 32.3054, 30.2460, 26.5944, 25.1440,
         28.5584, 27.9645, 24.2513, 30.7692, 29.0718, 31.2432, 33.7478, 31.9613,
         26.9352, 23.7830, 31.6831, 28.2437, 30.9579, 25.8252, 25.8471, 24.7871,
         26.7193, 32.2231, 28.9167, 27.4429, 29.5729, 28.5647, 30.5918, 28.1465,
         30.1989, 30.6434, 36.2119, 30.8035, 27.1705, 32.6814, 30.7025, 28.9919,
         28.8582, 26.5958, 23.7464, 29.6078, 27.9791, 27.5476, 24.2762, 27.1946,
         27.3742, 26.3235, 26.4626, 26.8952, 28.8788, 32.4711, 33.3996, 25.0261,
         29.5017, 28.1793, 33.0077, 30.2728, 32.5918, 24.2421, 28.8471, 32.3291,
         25.1473, 28.3657, 34.2052, 23.9359, 28.8678, 20.9647, 26.9096, 26.8324,
         26.9209, 25.4982, 30.1717, 26.4333, 28.6541, 29.8631, 26.9784, 25.5272,
         31.8813, 22.4839, 29.3475, 26.0065, 31.6725, 32.3125, 30.6244, 32.9005,
         23.5230, 23.6957, 2

In [110]:
model = train_nonseq_nn(X, labels, 0.05, 10000) # Being properly initialized, this model works very well! Variance maintained and gradients flow smoothly

  torch.nn.init.constant(layer.bias, 0.2)


epoch:  0 , cost:  tensor(612.8626, grad_fn=<MseLossBackward>)
epoch:  100 , cost:  tensor(185.6455, grad_fn=<MseLossBackward>)
epoch:  200 , cost:  tensor(92.9905, grad_fn=<MseLossBackward>)
epoch:  300 , cost:  tensor(74.8376, grad_fn=<MseLossBackward>)
epoch:  400 , cost:  tensor(59.4538, grad_fn=<MseLossBackward>)
epoch:  500 , cost:  tensor(40.0095, grad_fn=<MseLossBackward>)
epoch:  600 , cost:  tensor(35.1682, grad_fn=<MseLossBackward>)
epoch:  700 , cost:  tensor(25.8079, grad_fn=<MseLossBackward>)
epoch:  800 , cost:  tensor(22.0676, grad_fn=<MseLossBackward>)
epoch:  900 , cost:  tensor(18.1518, grad_fn=<MseLossBackward>)
epoch:  1000 , cost:  tensor(16.6361, grad_fn=<MseLossBackward>)
epoch:  1100 , cost:  tensor(15.1271, grad_fn=<MseLossBackward>)
epoch:  1200 , cost:  tensor(15.2100, grad_fn=<MseLossBackward>)
epoch:  1300 , cost:  tensor(13.2052, grad_fn=<MseLossBackward>)
epoch:  1400 , cost:  tensor(13.1474, grad_fn=<MseLossBackward>)
epoch:  1500 , cost:  tensor(12.510

In [111]:
model(torch.Tensor([1,2,3,4,5,6,7])), torch.Tensor([1,2,3,4,5,6,7]).dot(torch.Tensor([.1,.2,.3,.4,.5,.6,.7])) + 15

(tensor([27.0731], grad_fn=<AddBackward0>), tensor(29.))

In [112]:
# Not perfect, but not so far off!
# Note: Neural Network seems to have a lot of trouble with understanding what a negative value is, much worse training performance when negatives included