# PyTorch basics

In [2]:
import torch

In [9]:
# tensors: creation and basic attributes

# 3x3 tensor sampled uniformly from [0, 1)
x = torch.rand((3, 3))
print(x)

# constant-filled tensors (all zeros / all ones)
zeros = torch.zeros((2, 2))
ones = torch.ones((2, 2))

# build a tensor from a nested Python list (one inner list per row)
tensor_from_list = torch.tensor([
    [1, 2],
    [3, 4],
])

# every tensor carries its shape, element dtype and the device it lives on
for label, value in (
    ("Shape:", x.shape),      # dimensions of the tensor
    ("Data type:", x.dtype),  # data type of the elements
    ("Device:", x.device),    # CPU or GPU where the tensor resides
):
    print(label, value)

tensor([[0.6032, 0.3480, 0.1462],
        [0.7424, 0.0804, 0.2987],
        [0.4658, 0.3211, 0.5640]])
Shape: torch.Size([3, 3])
Data type: torch.float32
Device: cpu


In [11]:
# tensor operations

# element-wise addition: both operands have 2 elements, so the result has 2 elements
a = torch.tensor([2, 3])
b = torch.tensor([5, 6])
print(a + b)  # tensor([7, 9])

# element-wise multiplication: each position is multiplied independently
print(a * b)  # tensor([10, 18])

# matrix multiplication: (2x3) times (3x4) gives (2x4)
c = torch.rand((2, 3))
d = torch.rand((3, 4))
## matrix product: entry [i, j] is the dot product of row i of c with column j of d
print(c.mm(d))  # result is a (2x4) matrix
print(c @ d)    # the @ operator is torch.matmul; same result as above

# reshaping tensors
x = torch.rand((3, 3))
# view() lays the same 9 elements out as a single row; -1 lets torch infer the 9
x_reshaped = x.view(1, -1)
print(x)
print(x_reshaped)  # one row holding all 9 values, shape (1x9)

# stack: joins equally-shaped tensors along a new leading dimension
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])
stacked_1d = torch.stack((a, b), dim=0)  # two [3] vectors -> [2, 3]
print(stacked_1d)
print(stacked_1d.size())

c = torch.tensor([[1, 2], [3, 4]])
d = torch.tensor([[5, 6], [7, 8]])
stacked_2d = torch.stack((c, d), dim=0)  # two [2, 2] matrices -> [2, 2, 2]
print(stacked_2d)
print(stacked_2d.size())

tensor([7, 9])
tensor([10, 18])
tensor([[0.4629, 0.1539, 0.4662, 0.5405],
        [1.0802, 0.8159, 1.4383, 0.9259]])
tensor([[0.4629, 0.1539, 0.4662, 0.5405],
        [1.0802, 0.8159, 1.4383, 0.9259]])
tensor([[0.7427, 0.3080, 0.4054],
        [0.4698, 0.5344, 0.5900],
        [0.3126, 0.7473, 0.5904]])
tensor([[0.7427, 0.3080, 0.4054, 0.4698, 0.5344, 0.5900, 0.3126, 0.7473, 0.5904]])
tensor([[1, 2, 3],
        [4, 5, 6]])
torch.Size([2, 3])
tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
torch.Size([2, 2, 2])


In [72]:
# gradient tracking

# tensors created with requires_grad=True have their operations recorded,
# so backward() on the scalar `out` can fill x.grad with d(out)/dx
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()

# Gradient of out with respect to each element x_ij
# out = mean(z) = (1/4) * sum_ij 3 * (x_ij + 2)^2
# d(out)/d(x_ij) = (3/4) * 2 * (x_ij + 2) = (3/4) * (2 * x_ij + 4)
# at x_ij = 1 every entry of the gradient is 4.5
out.backward()
print(x.grad)

# inside torch.no_grad() operations are not recorded for autograd
model = torch.nn.Linear(in_features=10, out_features=1)
input_data = torch.randn((1, 10))
with torch.no_grad():
    output = model(input_data)  # computed without gradient tracking
print("Model output:", output)


tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])
Model output: tensor([[0.2039]])


In [3]:
# torch.nn: building blocks for neural networks

# fully connected (dense) layer: torch.nn.Linear(in_features, out_features)
# Input (x) -> Linear layer -> Output (y = xW^T + b)
layer = torch.nn.Linear(in_features=1, out_features=1)   # 1 input -> 1 output
layer = torch.nn.Linear(in_features=10, out_features=5)  # 10 inputs -> 5 outputs (rebinds `layer`)

# weights: shape is (out_features, in_features)
# row k holds the 10 coefficients that produce output feature k
print(layer.weight.size())

# biases: shape is (out_features,), one offset per output feature
print(layer.bias.size())

torch.Size([5, 10])
torch.Size([5])


In [None]:
# embedding layer
# a lookup table: each integer token id selects one vector of length embed_dim
# the table itself has shape [vocab_size, embed_dim]
vocab_size = 20000  # tokenizer token count
embed_dim = 768     # length of each embedding vector
layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

# batch of 3 sequences, each holding 2 token ids
input_token_ids = [[1, 2], [4, 5], [7, 8]]
output_tensor = layer(torch.tensor(input_token_ids, dtype=torch.long))
print(output_tensor.size())  # [3, 2, 768] = [batch_size, sequence_length, embedding_dim]

torch.Size([3, 2, 768])


In [None]:
# dropout layer
# a freshly constructed module is in training mode, where each element is zeroed
# with probability p and the surviving elements are scaled by 1 / (1 - p)
layer = torch.nn.Dropout(p=0.5)  # Dropout with probability of 0.5
# call the module itself instead of .forward(): __call__ runs registered hooks,
# while calling forward() directly silently skips them
print(layer(torch.randn(4, 4)))  # Apply dropout to a tensor

In [None]:
# convolutional layers
# each layer gets its own name so the 2D layer is not overwritten before it is inspected
conv2d_layer = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3)  # 2D convolution
conv1d_layer = torch.nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3)  # 1D convolution

# weight shape is (out_channels, in_channels, *kernel_size) for these settings
print(conv2d_layer.weight.shape)  # torch.Size([1, 1, 3, 3])
print(conv1d_layer.weight.shape)  # torch.Size([1, 1, 3])

# keep `layer` bound to the 1D convolution, as it was before
layer = conv1d_layer

In [None]:
# pooling layers
# max pooling over 2x2 windows; stride defaults to the kernel size and is spelled out here
layer = torch.nn.MaxPool2d(kernel_size=2, stride=2)

In [None]:
# recurrent layers: 10 input features per step, hidden state of size 20
# num_layers=1 is the default, written out explicitly
torch.nn.RNN(input_size=10, hidden_size=20, num_layers=1)   # RNN layer
torch.nn.LSTM(input_size=10, hidden_size=20, num_layers=1)  # LSTM layer
torch.nn.GRU(input_size=10, hidden_size=20, num_layers=1)   # GRU layer

In [None]:
# activation functions

# softmax: normalises along dim 1 so the values along that dimension sum to 1
torch.nn.Softmax(dim=1)

# sigmoid: maps each element into (0, 1)
torch.nn.Sigmoid()

# relu: max(0, x) element-wise (inplace=False is the default, written out explicitly)
torch.nn.ReLU(inplace=False)

# tanh: maps each element into (-1, 1)
torch.nn.Tanh()

In [5]:
# loss functions
# mean squared error loss
torch.nn.MSELoss()
# cross entropy loss - multi-class (one correct class per sample, given as a class index)
# input: logits (before softmax), target (class index 0,1,2...)
# processing: logits -> log-softmax -> negative log-likelihood of the target class
loss_fn = torch.nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 1.0, 0.1]])  # shape [1, 3], 3 classes
target = torch.tensor([0])  # correct class = 0
loss = loss_fn(logits, target)
print(f"torch.nn.CrossEntropyLoss loss: {loss}")

# binary cross entropy on probabilities - single 0/1 label or multi-label
# input: probabilities (values already passed through a sigmoid), target (0 or 1)
# processing: probabilities -> loss (no activation is applied inside the loss)
loss_fn = torch.nn.BCELoss(reduction="mean")  # "mean" is the default reduction
pred = torch.tensor([0.9])    # probability, i.e. post-sigmoid
target = torch.tensor([1.0])  # ground-truth label
loss = loss_fn(input=pred, target=target)
print(f"torch.nn.BCELoss: {loss}")

# binary cross entropy on raw scores - single 0/1 label or multi-label
# input: logits (before sigmoid), target (0 or 1)
# processing: logits -> sigmoid -> BCE, both steps performed inside the loss
loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")  # "mean" is the default reduction
logits = torch.tensor([0.0, 2.0, -2.0])   # shape [3]
targets = torch.tensor([0.0, 1.0, 0.0])   # shape [3]
loss = loss_fn(input=logits, target=targets)
print(f"torch.nn.BCEWithLogitsLoss: {loss}")

torch.nn.CrossEntropyLoss loss: 0.4170299470424652
torch.nn.BCELoss: 0.10536054521799088
torch.nn.BCEWithLogitsLoss: 0.3156677782535553


In [None]:
# optimizers