# PyTorch .to(device)

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Performance Optimization: Efficiently Moving Data Between CPU and GPU

You might be wondering if there's a way to speed up data transfer. Here's the answer: non-blocking transfers can make a difference in performance, especially when loading large batches of data from CPU to GPU.

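The `non_blocking=True` argument requests an asynchronous copy, which helps minimize data-loading bottlenecks. Note that a host-to-GPU copy can only run asynchronously when the source tensor lives in pinned (page-locked) memory; from ordinary pageable memory the transfer behaves like a regular blocking copy.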

In [3]:
# Create a large random tensor of shape (10000, 128)
data = torch.randn((10000, 128))

In [4]:
# Moving data asynchronously
# non_blocking=True can only overlap the copy with host work when the source
# tensor is in pinned (page-locked) host memory, so pin it first when the
# target is a CUDA device. `data` itself is left untouched on the CPU.
dataset = (data.pin_memory() if device.type == "cuda" else data).to(device, non_blocking=True)

In [5]:
# Consider a model with multiple layers
class MyModel(nn.Module):
    """Two linear layers (128 -> 256 -> 4) with a ReLU in between."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=128, out_features=256)
        self.activ = nn.ReLU()
        self.fc2 = nn.Linear(in_features=256, out_features=4)

    def forward(self, x):
        """Apply fc1, the ReLU activation, then fc2, and return the result."""
        hidden = self.activ(self.fc1(x))
        return self.fc2(hidden)


# Build the model, then place its parameters on the dynamically selected device
model = MyModel()
model = model.to(device)

In [6]:
# Ensuring data and model are on the same device
# `data` was created on the CPU and never moved (only the copy `dataset` was),
# so this check fails whenever the model sits on a GPU. The failure is the
# point of the demonstration: catch and display it so that
# "Restart Kernel -> Run All" can continue past this cell.
try:
    assert data.device== model.fc1.weight.device, "Data and model are on different devices!"
except AssertionError as err:
    print(f"AssertionError: {err}")

AssertionError: Data and model are on different devices!

In [7]:
# The transferred copy (`dataset`) must share the device of the model's weights
assert model.fc1.weight.device == dataset.device, "Data and model are on different devices!"

In [8]:
# Print the device of every parameter tensor registered on the model
for _name, weight in model.named_parameters():
    print(weight.device)

cuda:0
cuda:0
cuda:0
cuda:0


In [9]:
# Define DataLoader
# Batches must come from pinned (page-locked) host memory for the
# non_blocking=True copies below to run asynchronously; only request
# pinning when the target device is a GPU.
dataloader= DataLoader(data, batch_size=64, pin_memory=(device.type == "cuda"))

# Within your training loop
for Xb in dataloader:
    Xb= Xb.to(device, non_blocking=True)
    # Continue with training steps

# Practical Applications and Case Study

In [10]:
import torch.optim as optim

# Define a simple CNN model
class SimpleCNN(nn.Module):
    """A single 3x3 convolution + ReLU followed by a linear classifier.

    Args:
        in_channels: Number of channels of the input images (default 3).
        num_classes: Number of output logits (default 10).
        image_size: Side length of the square input images (default 32).

    Raises:
        ValueError: If ``image_size`` is smaller than the convolution kernel.

    With the default arguments the classifier receives
    32 * 30 * 30 = 28800 flattened features.
    """

    CONV_CHANNELS = 32
    KERNEL_SIZE = 3

    def __init__(self, in_channels=3, num_classes=10, image_size=32):
        super().__init__()
        if image_size < self.KERNEL_SIZE:
            raise ValueError(
                f"image_size must be at least {self.KERNEL_SIZE}, got {image_size}"
            )
        self.conv1 = nn.Conv2d(in_channels, self.CONV_CHANNELS, kernel_size=self.KERNEL_SIZE)
        self.activ = nn.ReLU()
        # An unpadded, stride-1 convolution shrinks each spatial side by
        # (kernel_size - 1), so the flattened feature count follows from the input size.
        conv_side = image_size - (self.KERNEL_SIZE - 1)
        self.fc1 = nn.Linear(self.CONV_CHANNELS * conv_side * conv_side, num_classes)

    def forward(self, x):
        """Return class logits for a batch of images."""
        x = self.activ(self.conv1(x))
        # Keep the batch dimension, flatten everything else
        x = torch.flatten(x, 1)

        return self.fc1(x)


# Loss function, model on the selected device, and an optimizer over its parameters
criterion = nn.CrossEntropyLoss()
model = SimpleCNN()
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)

Now, let's put it all together with a real-world example. Let's walk through how to efficiently manage device transfers in an image classification task on CIFAR-10, focusing on .to(device) best practices.

In [11]:
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Set up transformations and load CIFAR-10
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Training split, downloaded into ./data when it is not already there
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Define DataLoader with pin_memory
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    pin_memory=True,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:18<00:00, 9.07MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


By adding non_blocking=True, you enable asynchronous operations, reducing the time your code spends waiting for data transfers. This approach works particularly well when used alongside pin_memory=True in DataLoaders.

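Pinning memory can speed up data transfer from CPU to GPU: pinned (page-locked) host memory cannot be paged out by the operating system, so the GPU can read it directly instead of going through an intermediate staging copy.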

In [12]:
# Training loop
for epoch in range(5):
    model.train()
    # Use batch-specific names: rebinding `data` here would overwrite the CPU
    # tensor created earlier in the notebook and silently change what the
    # earlier cells see when they are re-run.
    for images, labels in train_loader:
        # Move the batch to the selected device; train_loader was built with
        # pin_memory=True, which is what lets non_blocking copies overlap.
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Forward pass
        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch, not an epoch average
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 1.8261
Epoch 2, Loss: 0.4843
Epoch 3, Loss: 0.9790
Epoch 4, Loss: 0.7607
Epoch 5, Loss: 0.9187


# When to Use .detach().cpu()

Sometimes, you need to retrieve data from the GPU for logging, monitoring, or evaluation without affecting the original computation graph. That's where .detach().cpu() comes in handy.

Using .detach().cpu() lets you safely move data back to the CPU without altering the model's gradients or creating unnecessary device conflicts. This approach is ideal for device-agnostic code since it keeps data accessible on the CPU, making it easier to work with in mixed device environments.

In [13]:
# Create a gradient-tracking tensor and place it on the selected device
grad_source = torch.randn(10, requires_grad=True)
tensor = grad_source.to(device)

# Detach from the autograd graph first, then copy to the CPU for inspection
detached = tensor.detach()
cpu_tensor = detached.cpu()
print(cpu_tensor)

tensor([ 1.3866, -1.2000, -1.2601,  0.6263, -0.4033, -0.4472,  0.6083, -0.7221,
        -0.0756,  0.2771])


In [None]:
# https://medium.com/biased-algorithms/mastering-pytorch-to-device-an-advanced-guide-for-efficient-device-management-0290b086f17e