In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# Tensor Basics

In [None]:
# Creating tensors
x = torch.tensor([1, 2, 3])                    # 1D tensor built from a Python list
y = torch.tensor([[1, 2], [3, 4], [5, 6]])     # 2D tensor from nested lists: 3 rows x 2 columns
z = torch.zeros(2, 3)                          # Zeros: shape (2, 3)
a = torch.ones(2, 3)                           # Ones: shape (2, 3)
b = torch.randn(2, 3)                          # Random normal samples: shape (2, 3)
c = torch.arange(0, 10, 2)                     # Like range(): 0,2,4,6,8
d = torch.linspace(0, 1, 5)                    # 5 evenly spaced points from 0 to 1 (both ends included)

# Common attributes (inspected on the 3x2 integer tensor `y`)
print(f"Shape: {y.shape}")                     # torch.Size([3, 2])
print(f"Size: {y.size()}")                     # Same as shape
print(f"Dim: {y.dim()}")                       # Number of dimensions (2)
print(f"Dtype: {y.dtype}")                     # Data type (int64: built from Python ints)
print(f"Device: {y.device}")                   # Where the data lives (CPU/GPU)

Shape: torch.Size([3, 2])
Size: torch.Size([3, 2])
Dim: 2
Dtype: torch.int64
Device: cpu


# Tensor Operations

In [None]:
# Element-wise arithmetic on two float vectors of equal length
x = torch.tensor([1.0, 2.0, 3.0])
y = torch.tensor([4.0, 5.0, 6.0])

add = torch.add(x, y)                          # Same as x + y
sub = torch.sub(x, y)                          # Same as x - y
mul = torch.mul(x, y)                          # Same as x * y (element-wise, not a dot product)
div = torch.div(x, y)                          # Same as x / y (true division)
pow = torch.pow(x, 2)                          # Same as x ** 2

# Matrix operations on two 2x2 integer matrices
A = torch.tensor([[1, 2], [3, 4]])
B = torch.tensor([[5, 6], [7, 8]])

matmul = A.matmul(B)                           # Matrix product, method form
matmul_alt = A @ B                             # Same product via the @ operator
dot = x.dot(y)                                 # Inner product of the two 1D vectors

# Reduction operations
tensor = torch.tensor([[1, 2], [3, 4]])        # Integer (int64) 2x2 matrix

sum_all = tensor.sum()                         # Sum all elements -> scalar tensor
sum_dim0 = tensor.sum(dim=0)                   # Sum along dimension 0 (collapses rows)
# mean() and std() are only implemented for floating-point/complex tensors and
# raise a RuntimeError on int64 input, so cast to float before reducing.
mean = tensor.float().mean()                   # Mean of all elements
std = tensor.float().std()                     # Standard deviation of all elements
max_val, max_idx = tensor.max(dim=1)           # Per-row max values and their column indices
min_val = tensor.min()                         # Min value over all elements

# Reshaping (CRITICAL for deep learning)
tensor = torch.arange(12)                      # 1D tensor holding 0..11

view = tensor.view((3, 4))                     # Reinterpret as 3x4
reshape = torch.reshape(tensor, (3, 4))        # Function form of the more flexible reshape
transpose = tensor.view((3, 4)).transpose(0, 1)   # Swap the two axes -> 4x3
permute = tensor.view((3, 4)).permute((1, 0))  # General axis reordering (here: same as transpose)
squeeze = torch.squeeze(torch.randn(1, 3, 1, 4))  # Drop every size-1 dim -> (3, 4)
unsqueeze = torch.unsqueeze(tensor, 0)         # Insert a new dim at position 0 -> (1, 12)

# Concatenation and stacking
x1 = torch.tensor([1, 2, 3])
x2 = torch.tensor([4, 5, 6])

cat = torch.cat((x1, x2))                      # Join along the existing dim 0 -> shape (6,)
stack = torch.stack((x1, x2))                  # Join along a new dim 0 -> shape (2, 3)

# Neural Network Layers

In [None]:
# Linear (Fully Connected): maps the last dimension from in_features to out_features
linear = nn.Linear(in_features=10, out_features=5)
x = torch.randn(32, 10)  # batch_size=32, features=10
output = linear(x)       # shape: (32, 5)

# Convolutional

# 1D convolution input layout:
# batch_size, num_features/channels, seq_length (e.g. number of days in a time series)
x = torch.randn(32, 10, 64)
# kernel_size: moving window size
# padding: number of zeros added to each side
# in_channels: input features/channels
# out_channels: output features/channels
conv1d = nn.Conv1d(in_channels=10, out_channels=20, kernel_size=3, padding=1)
# Apply the layer so the sample input above is actually used;
# kernel_size=3 with padding=1 keeps the sequence length unchanged.
output = conv1d(x)       # shape: (32, 20, 64)

conv2d = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
x = torch.randn(32, 3, 64, 64)  # (batch, channels, height, width)
output = conv2d(x)               # (32, 16, 64, 64)

# Recurrent
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(32, 5, 10)  # (batch, seq_len, features) because batch_first=True
output, (hn, cn) = lstm(x)  # output: (32, 5, 20); hn/cn: (num_layers, batch, hidden) = (2, 32, 20)

# Normalization layers (constructed only; not applied in this cell)
batch_norm = nn.BatchNorm1d(num_features=64)
layer_norm = nn.LayerNorm(normalized_shape=64)
instance_norm = nn.InstanceNorm1d(num_features=64)

# Dropout (Regularization)
dropout = nn.Dropout(p=0.5)  # 50% dropout

# Activation Functions

In [None]:
x = torch.tensor([-1.0, 0.0, 1.0, 2.0])

# Functional style (preferred): stateless calls applied directly to a tensor
relu = F.relu(input=x)
sigmoid = x.sigmoid()
tanh = x.tanh()
softmax = F.softmax(input=x, dim=0)
leaky_relu = F.leaky_relu(input=x, negative_slope=0.01)

# Module style: layer objects that can be stored on a model and called later
relu_layer = nn.ReLU()
sigmoid_layer = nn.Sigmoid()
tanh_layer = nn.Tanh()
softmax_layer = nn.Softmax(1)  # normalises along dim 1


# Loss Functions

In [None]:
# Regression losses: compare real-valued predictions against targets
y_pred = torch.tensor([0.5, 0.8, 1.2])
y_true = torch.ones(3)

mse_loss = F.mse_loss(input=y_pred, target=y_true)            # Mean Squared Error
l1_loss = F.l1_loss(input=y_pred, target=y_true)              # L1 / Mean Absolute Error
smooth_l1 = F.smooth_l1_loss(input=y_pred, target=y_true)     # Smooth L1 (Huber-style) loss

# Classification losses: raw logits for 2 samples over 3 classes
logits = torch.tensor([[2.0, 1.0, 0.1], [0.5, 2.0, 0.3]])
targets = torch.tensor([0, 1])  # One class index per sample

ce_loss = F.cross_entropy(input=logits, target=targets)       # Cross Entropy (most common)
# Binary variant: uses only the first logit column and needs float targets
bce_loss = F.binary_cross_entropy_with_logits(
    input=logits[:, 0],
    target=targets.to(torch.float32),
)

# Custom loss
def custom_loss(pred, target):
    """Return mean squared error plus 0.01 times the mean absolute prediction."""
    residual = pred - target
    squared_error = (residual ** 2).mean()
    magnitude_penalty = 0.01 * pred.abs().mean()
    return squared_error + magnitude_penalty

# Optimizers

In [None]:
# Model whose parameters the optimizers below are attached to
model = nn.Linear(in_features=10, out_features=1)

# Different optimizers (each one gets its own handle on the same parameters)
adam = optim.Adam(params=model.parameters(), lr=1e-3)
sgd = optim.SGD(params=model.parameters(), lr=1e-2, momentum=0.9)
rmsprop = optim.RMSprop(params=model.parameters(), lr=1e-3)
adamw = optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-2)

# Learning rate scheduler: multiply Adam's lr by gamma every step_size scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer=adam, step_size=10, gamma=0.1)

# Training Loop Template

In [None]:
def train_one_epoch(model, dataloader, optimizer, device='cpu',
                    criterion=F.cross_entropy, max_grad_norm=1.0, log_interval=100):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; switched to training mode via ``model.train()``.
        dataloader: Iterable yielding ``(data, target)`` batches. It does not
            need to implement ``len()``; batches are counted while iterating.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.
        criterion: Callable ``(output, target) -> scalar loss``. Defaults to
            ``F.cross_entropy``, the loss this function previously hard-coded.
        max_grad_norm: Gradient-norm clipping threshold; ``None`` disables clipping.
        log_interval: Print the batch loss every this many batches; a falsy
            value (``None`` or ``0``) disables logging.

    Returns:
        float: Mean of the per-batch losses, or ``0.0`` when the dataloader
        yields no batches (previously this raised ``ZeroDivisionError``).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch_idx, (data, target) in enumerate(dataloader):
        # Move to device
        data, target = data.to(device), target.to(device)

        # Forward pass (gradients from the previous batch are cleared first)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)

        # Backward pass
        loss.backward()

        # Gradient clipping (prevents exploding gradients)
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

        # Update weights
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        if log_interval and batch_idx % log_interval == 0:
            print(f'Batch {batch_idx}, Loss: {batch_loss:.4f}')

    # Guard against an empty dataloader instead of dividing by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Classic Model Templates

In [None]:
# Template 1: MLP
class SimpleMLP(nn.Module):
    """Three-layer fully connected network with ReLU activations.

    Dropout (p=0.3) is applied once, right after the first hidden activation.
    The final layer's output is returned without any activation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map inputs with ``input_dim`` features to ``output_dim`` raw outputs."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# Template 2: CNN 
class SimpleCNN(nn.Module):
    """Two conv/pool stages followed by a two-layer classifier head.

    Args:
        num_classes: Number of output logits.
        input_size: Spatial size of the 3-channel input images, either a single
            int for square images or a ``(height, width)`` pair. The default of
            32 reproduces the previously hard-coded ``64 * 8 * 8`` flattened size.

    Raises:
        ValueError: If the input is too small to survive the two 2x2 poolings.
    """

    def __init__(self, num_classes=10, input_size=32):
        super().__init__()
        if isinstance(input_size, int):
            height, width = input_size, input_size
        else:
            height, width = input_size
        # The padded 3x3 convolutions keep the spatial size; each 2x2 max-pool
        # halves it (rounding down), so two pools divide each side by 4.
        pooled_h, pooled_w = height // 4, width // 4
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(
                f"input_size {input_size!r} is too small; each side must be at least 4"
            )

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * pooled_h * pooled_w, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits for a batch shaped (batch, 3, height, width)."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # Flatten all dimensions except batch
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Template 3: Transformer 
class SimpleTransformer(nn.Module):
    """Transformer-encoder classifier for sequences of feature vectors.

    Each time step is projected to ``d_model``, passed through a stack of
    encoder layers, averaged over the sequence axis and classified.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, num_classes=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        # batch_first=True so inputs are (batch, seq_len, features)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, batch_first=True),
            num_layers,
        )
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, seq_len, features)."""
        tokens = self.embedding(x)
        encoded = self.transformer(tokens)
        pooled = encoded.mean(dim=1)  # average over the sequence axis
        return self.classifier(pooled)


# Dataloader & Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Method 1: TensorDataset (quick and dirty) - wraps in-memory tensors directly
features = torch.randn(1000, 10)
labels = torch.randint(low=0, high=2, size=(1000,))
dataset = TensorDataset(features, labels)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Method 2: Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label by index."""

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset length is taken from the features container.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.labels[idx]
        return sample, label

# Method 3: Advanced Dataset (with transforms)
class AdvancedDataset(Dataset):
    """Dataset backed by a NumPy file, with an optional per-sample transform.

    Args:
        data_path: Path handed to ``np.load``; the loaded object is indexed
            along its first axis to produce samples.
        transform: Optional callable applied to each sample before it is
            returned. ``None`` means samples are returned unchanged.
    """

    def __init__(self, data_path, transform=None):
        self.data = np.load(data_path)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        # Compare against None rather than relying on truthiness: a callable
        # that defines __len__ or __bool__ can be falsy and would otherwise
        # be skipped silently.
        if self.transform is not None:
            sample = self.transform(sample)

        return sample

# Common transforms
from torchvision import transforms

# Pipeline applied in order: convert to tensor, normalise, then random flip
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),    # single-channel mean / std
        transforms.RandomHorizontalFlip(p=0.5),  # p=0.5 is the library default
    ]
)


# Device Management

In [None]:
# Detect device: use the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Move tensors/model to device
x = torch.randn(10, 10).to(device=device)
model = SimpleMLP(10, 20, 2).to(device=device)

# Multi-GPU (if available): wrap the model so batches are split across GPUs
if torch.cuda.device_count() >= 2:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(module=model)

# Clear GPU memory (if needed): releases cached blocks held by the allocator
torch.cuda.empty_cache()

# Model Saving & Loading

In [None]:
model = SimpleMLP(10, 20, 2)
# Rebind `adam` to THIS model's parameters. The optimizer created in the
# "Optimizers" cell tracks a different nn.Linear model, so checkpointing its
# state next to this model's weights would not allow training to resume.
adam = optim.Adam(model.parameters(), lr=0.001)

# Save entire model (easy but less flexible)
torch.save(model, 'model.pth')
# Loading a whole pickled module requires weights_only=False; recent PyTorch
# releases default to weights_only=True and refuse arbitrary classes.
# SECURITY: this unpickles arbitrary code - only load files you trust.
loaded_model = torch.load('model.pth', weights_only=False)

# Save only state_dict (recommended)
torch.save(model.state_dict(), 'model_state.pth')

# Load state_dict into a freshly constructed model of the same architecture.
# A state_dict holds only tensors and plain containers, so the restricted
# unpickler (weights_only=True) is sufficient and safer.
new_model = SimpleMLP(10, 20, 2)
new_model.load_state_dict(torch.load('model_state.pth', weights_only=True))

# Save checkpoint (for resuming training)
checkpoint = {
    'epoch': 10,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': adam.state_dict(),
    'loss': 0.05,
}
torch.save(checkpoint, 'checkpoint.pth')

# Load checkpoint and restore both the model and the optimizer state
checkpoint = torch.load('checkpoint.pth', weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
adam.load_state_dict(checkpoint['optimizer_state_dict'])

# Gradient & Autodiff

In [None]:
# Enable gradient tracking on a leaf tensor
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# Manual gradient computation
y = x.sum()
y.backward()  # Fills x.grad with dy/dx (all ones for a plain sum)
print(f"Gradients: {x.grad}")

# Disable gradient for inference.
# `x` above is a 3-element vector and does not match the model's input layer,
# so run inference on one real batch from the dataloader used for training below.
sample_batch = next(iter(dataloader))[0]
model.eval()  # put dropout and similar layers into inference behaviour
with torch.no_grad():
    output = model(sample_batch)
    # No gradient tracking here

# Gradient accumulation (for large batches)
accumulation_steps = 4  # number of batches whose gradients are summed per update
# `optimizer` was referenced here without being defined anywhere in the notebook.
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
optimizer.zero_grad()  # start from clean gradients
pending = 0  # batches accumulated since the last optimizer step
for data, target in dataloader:
    output = model(data)
    loss = F.cross_entropy(output, target)

    # Scale loss so the summed gradient approximates one large-batch gradient
    loss = loss / accumulation_steps

    loss.backward()
    pending += 1

    if pending == accumulation_steps:
        optimizer.step()
        optimizer.zero_grad()
        pending = 0

# Flush a trailing partial group so its gradients are not silently dropped.
# Its loss was still scaled by 1/accumulation_steps, so this update is smaller.
if pending:
    optimizer.step()
    optimizer.zero_grad()

# Debugging & Profiling

In [None]:
# Check for NaN/inf anywhere in a tensor
x = torch.tensor([1.0, float('nan'), 3.0])
print(f"Has NaN: {x.isnan().any()}")
print(f"Has Inf: {x.isinf().any()}")

# Gradient checking: report the mean gradient of every parameter that has one
for name, param in model.named_parameters():
    grad = param.grad
    if grad is None:
        # No backward pass has populated this parameter yet
        continue
    print(f"{name} gradient mean: {grad.mean():.6f}")
    if grad.isnan().any():
        print(f"⚠️ NaN gradient in {name}!")

# Model summary
def count_parameters(model):
    """Return the total element count of all parameters with requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

n_trainable = count_parameters(model)
print(f"Total parameters: {n_trainable:,}")

# Memory usage, converted from bytes to gigabytes (1e9 bytes)
allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"GPU Memory allocated: {allocated_gb:.2f} GB")
reserved_gb = torch.cuda.memory_reserved() / 1e9
print(f"GPU Memory cached: {reserved_gb:.2f} GB")


# Useful Utilities

In [None]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), then configures cuDNN for
    deterministic algorithm selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# One-hot encoding: class indices -> rows with a single 1
labels = torch.tensor([0, 2, 1, 0])
one_hot = F.one_hot(labels, 3)

# Create masks: True where a token is real, False where it is zero padding
sequence = torch.tensor([[1, 2, 0, 0], [1, 2, 3, 0]])
mask = sequence.ne(0)

# Top-k predictions: the 3 largest logits per row and their class indices
logits = torch.randn(10, 5)  # 10 samples, 5 classes
topk_values, topk_indices = logits.topk(3, dim=1)

# Batch matrix multiplication: one independent matmul per batch entry
batch1 = torch.randn(32, 10, 20)  # (batch, n, m)
batch2 = torch.randn(32, 20, 30)  # (batch, m, p)
result = batch1.bmm(batch2)       # (batch, n, p)

# Other Training Techniques

In [None]:
# Learning rate scheduling
def lr_scheduling(opt=None, train_fn=None, num_epochs=100):
    """Run ``num_epochs`` training epochs under a cosine-annealing LR schedule.

    Args:
        opt: Optimizer whose learning rate is annealed. When ``None``, the
            notebook-level name ``optimizer`` is used, as in the original template.
        train_fn: Zero-argument callable that trains for one epoch. When
            ``None``, the notebook-level name ``train`` is used.
        num_epochs: Number of epochs to run; also used as the schedule's ``T_max``.

    Raises:
        NameError: If an argument is omitted and the matching notebook-level
            name has not been defined.
    """
    # Fall back to the global names the template originally relied on.
    opt = optimizer if opt is None else opt
    train_fn = train if train_fn is None else train_fn

    scheduler = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=num_epochs)
    for _ in range(num_epochs):
        train_fn()
        scheduler.step()  # advance the schedule once per epoch

# Gradient accumulation
def grad_accumulation(net=None, loader=None, opt=None, loss_fn=None, steps=None):
    """Train for one pass over ``loader``, stepping the optimizer every ``steps`` batches.

    Each batch loss is divided by ``steps`` so the accumulated gradient
    approximates the gradient of one batch ``steps`` times larger.

    Args:
        net: Model to train. Defaults to the notebook-level ``model``.
        loader: Iterable of ``(data, target)`` batches. Defaults to ``dataloader``.
        opt: Optimizer for ``net``. Defaults to ``optimizer``.
        loss_fn: Callable ``(output, target) -> scalar loss``. Defaults to ``criterion``.
        steps: Batches to accumulate per update. Defaults to ``accumulation_steps``.

    Raises:
        NameError: If an argument is omitted and its notebook-level fallback is undefined.
        ValueError: If ``steps`` is smaller than 1.
    """
    # Fall back to the global names the template originally relied on.
    net = model if net is None else net
    loader = dataloader if loader is None else loader
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn
    steps = accumulation_steps if steps is None else steps
    if steps < 1:
        raise ValueError("steps must be at least 1")

    opt.zero_grad()
    pending = 0  # batches accumulated since the last optimizer step
    for data, target in loader:
        output = net(data)
        loss = loss_fn(output, target) / steps
        loss.backward()
        pending += 1

        if pending == steps:
            opt.step()
            opt.zero_grad()
            pending = 0

    # Apply a trailing partial group instead of leaving its gradients unused.
    if pending:
        opt.step()
        opt.zero_grad()

# Mixed precision training (faster on modern GPUs)
def mixed_precision(net=None, loader=None, opt=None, loss_fn=None):
    """Train for one pass over ``loader`` using autocast and gradient scaling.

    The forward pass and loss run inside ``autocast()``; the loss is scaled by
    a ``GradScaler`` before ``backward()``, and the scaler performs the
    optimizer step and updates its scale factor.

    Args:
        net: Model to train. Defaults to the notebook-level ``model``.
        loader: Iterable of ``(data, target)`` batches. Defaults to ``dataloader``.
        opt: Optimizer for ``net``. Defaults to ``optimizer``.
        loss_fn: Callable ``(output, target) -> scalar loss``. Defaults to ``criterion``.

    Raises:
        NameError: If an argument is omitted and its notebook-level fallback is undefined.
    """
    # NOTE(review): recent PyTorch releases emit deprecation warnings for the
    # torch.cuda.amp entry points in favour of torch.amp - confirm against the
    # installed version before migrating.
    from torch.cuda.amp import autocast, GradScaler

    # Fall back to the global names the template originally relied on.
    net = model if net is None else net
    loader = dataloader if loader is None else loader
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn

    scaler = GradScaler()

    for data, target in loader:
        opt.zero_grad()

        # Forward pass and loss under automatic mixed precision
        with autocast():
            output = net(data)
            loss = loss_fn(output, target)

        scaler.scale(loss).backward()  # backward on the scaled loss
        scaler.step(opt)               # optimizer step routed through the scaler
        scaler.update()                # adjust the scale factor for the next iteration
