In [1]:
import torch

In [2]:
# 3x5 tensor of random values. No seed is set in the cells shown here,
# so the numbers below change on every run.
t1 = torch.rand((3,5))
t1

tensor([[0.2504, 0.5160, 0.5584, 0.7188, 0.1282],
        [0.5198, 0.8539, 0.3556, 0.7066, 0.2145],
        [0.3135, 0.6192, 0.7683, 0.2074, 0.5995]])

In [3]:
# `(5)` is just the int 5 (not a 1-tuple), so both lines make the same call:
# t2 and t3 are two independent random 1-D tensors of length 5.
t2 = torch.rand((5))
t3 = torch.rand(5)
t2

tensor([0.3582, 0.6703, 0.4028, 0.3830, 0.5069])

In [4]:
# torch.dot only accepts 1-D tensors. For equal-shaped tensors the `*` operator is the
# elementwise (Hadamard) product, not matrix multiplication.
t2.dot(t3)

tensor(1.2591)

## Other Linear Algebra Utils

In [5]:
# Matrix-vector product: takes the dot product of t2 with each row of t1, giving one value
# per row (e.g. one similarity score per training example when each row is an example).
t1.mv(t2)

tensor([1.0008, 1.2811, 1.2202])

In [6]:
# `*` is the elementwise (Hadamard) product; true matrix multiplication is requested explicitly with mm.
# (3x5) times (5x3) gives a 3x3 matrix. The right-hand matrix is sampled inline, so the result changes per run.
t1.mm(torch.rand((3,5)).T)

tensor([[1.0704, 1.2861, 1.6624],
        [1.1280, 1.3769, 1.9571],
        [1.3348, 1.4462, 1.9983]])

### Norms

In [7]:
# A norm measures a vector's magnitude (e.g. L1 sums absolute values, L2 is the Euclidean length).
# It is always non-negative, and zero only for the zero vector.
# The default norm is L2: sqrt(sum of squared entries) - a distance, not a squared distance.
# For a matrix such as t1 this is taken over all entries (the Frobenius norm).
torch.norm(t1)

# Many optimization objectives are written with norms, as the distance between prediction and target
# (a squared norm gives the summed squared error over all examples in one call)

tensor(2.0783)

In [8]:
# L1 norm: sum of absolute values. Less sensitive to outliers than L2, because one extreme
# entry increases the norm linearly rather than quadratically.
torch.sum(torch.abs(t1))

tensor(7.3302)

In [9]:
# Frobenius norm: square root of the sum of squared entries of a matrix (the L2 norm of the flattened matrix).
# Its square is the total squared error when the entries are per-example residuals.
torch.norm(torch.ones(5,5))
# 25 entries equal to 1, so the norm is sqrt(25 * 1^2) = 5

tensor(5.)

In [10]:
# Reduce over axes 2 and 1 (the two innermost), keeping axis 0:
# each of the 6 blocks sums its 3 * 2 ones, giving 6.
torch.sum(torch.ones(6,3,2), axis = (2,1))

tensor([6., 6., 6., 6., 6., 6.])

In [11]:
torch.ones((6,3,2))

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]]])

## Calculus

What follows is a reflection of my understanding of the matrix chain rule.

In [12]:
# Squared L2 norm of a vector x: ||x||^2 = x^T x  (the norm itself is sqrt(x^T x))

In [13]:
# Multivariable chain rule (y depends on u1, u2, ..., each of which is a function of x):
# dy/dx = dy/du1 * du1/dx + dy/du2 * du2/dx + ...
# Each term is the rate of change (ROC) of y w.r.t. one intermediate variable, scaled by
# the ROC of that variable w.r.t. x; summing the terms over every intermediate
# variable gives the total ROC of y w.r.t. x.

In [14]:
# Sum across vectors = averaged vector  = final rate of change

## Automatic Differentiation

In [42]:
# Pass a float (4.0) so the tensor is floating point: integer tensors cannot require gradients.
t4 = torch.arange(4.0)

In [43]:
# requires_grad_ is the in-place setter that switches on gradient tracking for t4.
# A tensor is array-like (similar to a NumPy array, but not one); in addition it can
# accumulate the gradient of a later result with respect to itself in its .grad attribute.
t4.requires_grad_(True)

tensor([0., 1., 2., 3.], requires_grad=True)

In [44]:
print(t4.grad) # Default gradient is None

None


In [45]:
y = 2 * torch.dot(t4, t4)
y # d(x^T x)/dx = 2x, so the gradient of y = 2 * x^T x should be 4x

tensor(28., grad_fn=<MulBackward0>)

Because `t4` requires a gradient, `y` now records the function that produced it (its `grad_fn`), so backward differentiation can be done automatically.

In [46]:
# Backpropagate from y: fills t4.grad with dy/dt4. The graph is freed afterwards unless
# retain_graph=True is passed, which is needed to call backward on the same graph again.
y.backward()

In [47]:
t4.grad # Gradient of y w.r.t. t4, i.e. 4 * t4 as predicted above

tensor([ 0.,  4.,  8., 12.])

In [55]:
# grad_fn is a read-only attribute recording the operation that produced a tensor.
# t4 is a leaf created directly by the user, so its grad_fn is None (hence no output here).
t4.grad_fn

In [56]:
y.grad_fn # MulBackward0: the last operation that built y was the multiplication by 2

<MulBackward0 at 0x7fbf72e32400>

In [57]:
# Gradients accumulate across backward() calls, so reset to 0 before differentiating a new function
t4.grad.zero_()

tensor([0., 0., 0., 0.])

In [59]:
y = t4.sum()

In [60]:
y # grad_fn is now SumBackward0 because y was produced by sum()

tensor(6., grad_fn=<SumBackward0>)

In [61]:
y.backward()

In [62]:
t4.grad # All-ones vector: every entry enters the sum (x + y + z + w) with coefficient 1

tensor([1., 1., 1., 1.])

In [64]:
# Reset the accumulated gradient, then build a non-scalar y: the elementwise square of t4
t4.grad.zero_()
y = t4 * t4

In [65]:
# y is a vector, so backward() needs a `gradient` argument of the same shape. Passing ones
# is equivalent to y.sum().backward(), i.e. differentiating the scalar sum of y's entries.
y.backward(torch.ones(len(t4)))

In [66]:
t4.grad # 2 * t4: each entry x_i appears only in its own term x_i^2, whose derivative is 2 * x_i
# Ones were passed to backward() because we want the gradient of the sum of y's
# entries (x^2 + y^2 + z^2 + ... has derivatives 2x, 2y, 2z, ...)

tensor([0., 2., 4., 6.])

### Detaching Computation

We can detach a variable from the computation graph, forcing it to be treated as a constant so that differentiation is done only w.r.t. the remaining variables.

In [75]:
t4.grad.zero_()
y = (t4 * t4).sum()

In [80]:
r = y.detach() # Drops y's computational history, so r is treated as a constant

In [77]:
z =  r*t4

In [79]:
z.sum().backward() 

In [81]:
# y is a non-leaf tensor (the result of an operation), so autograd does not retain its .grad:
# this evaluates to None and PyTorch warns about reading .grad on a non-leaf tensor.
# Note that it is r, not y, that was detached.
y.grad

  y.grad


In [91]:
t4.grad == y.sum() # d(sum(r * t4))/dt4 = r for every entry; r is a constant equal to y (already a scalar, so y.sum() == y)

tensor([True, True, True, True])

In [95]:
# Both factors are detached, so x carries no graph back to t4
x = t4.detach() * t4.detach()

In [96]:
t4.grad # Unchanged: the detached copies are constants, and no backward() was run for x

tensor([14., 14., 14., 14.])

**Note: the gradient stored on a variable is the gradient of the downstream function (the one `backward()` was called on) w.r.t. that variable**

In [140]:
# Reset gradients so the next backward() starts from zero (gradients accumulate across calls).
# x was built from detached tensors, so on a fresh kernel x.grad is still None and calling
# .zero_() on it would raise AttributeError - only clear it when it exists.
if x.grad is not None:
    x.grad.zero_()
t4.grad.zero_()
# requires_grad_ (trailing underscore) is the in-place setter; the plain `requires_grad`
# attribute only reports whether gradients are tracked for the tensor.
x.requires_grad_(True)

tensor([0., 1., 4., 9.], requires_grad=True)

In [141]:
z = t4 * x
z.sum().backward() # Fills t4.grad and x.grad with each input's contribution to sum(z)
# backward() without arguments only works on scalar outputs, hence the sum

In [144]:
t4.grad == x # d(sum(t4 * x))/dt4 = x: each input's gradient is the other factor of the product

tensor([True, True, True, True])

In [146]:
x.grad == t4 # d(sum(t4 * x))/dx = t4: only the t4 factor of each x_i term survives differentiation

tensor([True, True, True, True])

In [135]:
x

tensor([0., 1., 4., 9.], requires_grad=True)

In [136]:
t4

tensor([0., 1., 2., 3.], requires_grad=True)

In [139]:
z

tensor([ 0.,  1.,  8., 27.], grad_fn=<MulBackward0>)

In [179]:
# Multivar example
# NOTE(review): execution counts are out of order, so which y is live here is unclear - confirm.
y.requires_grad_(True) # Requests gradient tracking on y (a no-op if y already requires grad)
x.grad.zero_() # Clear x's accumulated gradient before the next backward()

tensor([0., 0., 0., 0.])

In [191]:
# Rebind t4 to a fresh leaf tensor (its .grad starts out as None again) and track gradients on it
t4 = torch.arange(4.0)
t4.requires_grad_(True)

tensor([0., 1., 2., 3.], requires_grad=True)

In [192]:
y = t4 * t4
y.requires_grad_(True) # No-op: y already requires grad because it was computed from t4
z = 2 * y * y # z = 2 * t4^4. Calling backward more than once on this graph would need retain_graph=True

In [193]:
z.sum().backward()

In [195]:
t4.grad # dz/dt4 = 8 * t4^3 (e.g. 8 * 27 = 216 for t4 = 3)

tensor([  0.,   8.,  64., 216.])

In [234]:
# Gradient through Python control flow.
# func adds inp to arange(4) three times, then multiplies the result by inp:
# func(inp) = (arange(4) + 3 * inp) * inp
def func(inp: torch.Tensor) -> torch.Tensor:
    """Return ``(torch.arange(4) + 3 * inp) * inp``, built with a loop and a branch.

    Autograd records whichever operations Python actually executes, so the
    plain ``for`` and ``if`` statements below are differentiated like any
    other code.

    Args:
        inp: Tensor that broadcasts against a length-4 vector.

    Returns:
        The accumulated tensor.
    """
    acc = torch.arange(4)
    print(acc)  # show the integer starting point before inp is mixed in
    for _ in range(3):
        acc = acc + inp
    if True:  # always-taken branch, kept to illustrate control flow
        acc *= inp
    return acc

In [235]:
# Random leaf tensor of length 4 with gradient tracking enabled at creation.
# torch.rand already returns a tensor, so wrapping it in the legacy
# torch.Tensor(...) constructor was redundant.
inp = torch.rand(4, requires_grad=True)
inp

tensor([0.8480, 0.0274, 0.7218, 0.5064], requires_grad=True)

In [236]:
z = func(inp)

tensor([0, 1, 2, 3])


In [237]:
z.grad_fn # MulBackward0: the last op was the final multiply in z = (t + 3 * inp) * inp

<MulBackward0 at 0x7fbf72ce5a30>

In [238]:
z.sum().backward() # Gradient for sum of cost

In [239]:
inp.grad # d/dinp sum((t + 3 * inp) * inp) = t + 6 * inp, with t = arange(4) (e.g. 0 + 6 * 0.8480 = 5.088 for the first entry)

tensor([5.0881, 1.1643, 6.3310, 6.0381])

## Probability

## Documentation

In [241]:
dir(torch.Tensor)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__cuda_array_interface__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rfloordiv__',
 '__r

In [242]:
help(torch.Tensor) # More detailed documentation

Help on class Tensor in module torch:

class Tensor(torch._C._TensorBase)
 |  Method resolution order:
 |      Tensor
 |      torch._C._TensorBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __abs__ = abs(...)
 |  
 |  __array__(self, dtype=None)
 |  
 |  __array_wrap__(self, array)
 |      # Wrap Numpy array again in a suitable tensor when done, to support e.g.
 |      # `numpy.sin(tensor) -> tensor` or `numpy.greater(tensor, 0) -> ByteTensor`
 |  
 |  __contains__(self, element)
 |      Check if `element` is present in tensor
 |      
 |      Args:
 |          element (Tensor or scalar): element to be checked
 |              for presence in current tensor"
 |  
 |  __deepcopy__(self, memo)
 |  
 |  __dir__(self)
 |      Default dir() implementation.
 |  
 |  __floordiv__(self, other)
 |  
 |  __format__(self, format_spec)
 |      Default object formatter.
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __ipow__(self, other)
 |  
 |  __iter__(self)
 |  