In [1]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

# Datasets

In [2]:
# Create some data in numpy.
n_observations = 100
n_features     = 20

# Draw from a seeded, local Generator so that Restart Kernel -> Run All
# reproduces exactly the same samples, without touching NumPy's global
# random state.
SEED = 0
rng  = np.random.default_rng(SEED)

# Standard-normal samples: one row per observation, one column per feature.
data = rng.standard_normal((n_observations, n_features))

In [3]:
# Build a PyTorch tensor holding the same values as the NumPy array.
data_tensor = torch.tensor(data)

# Report container type, dimensions and element dtype for both objects.
# NumPy exposes the dimensions through the .shape attribute; .dtype tells how
# the values stored inside the variable are represented.
print('Numpy data:', type(data), data.shape, data.dtype, '', sep='\n')

# On the torch side the dimensions are queried with the .size() method.
print('Tensor data:', type(data_tensor), data_tensor.size(), data_tensor.dtype, '', sep='\n')

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64

Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64



In [4]:
# Sometimes the element type has to be converted.
# Single-precision floats (same as calling .float()):
data_tensor_2 = torch.tensor(data).to(torch.float32)
print(data_tensor_2.dtype)

# 64-bit integers, a.k.a. "long" (same as calling .long()).
# The fractional part of every value is discarded by this cast.
data_tensor_3 = torch.tensor(data).to(torch.int64)
print(data_tensor_3.dtype)

torch.float32
torch.int64


In [7]:
# Show the original array, its float32 tensor and its int64 tensor,
# with one blank line between consecutive objects.
print(data, data_tensor_2, data_tensor_3, sep='\n\n')

[[-0.29269996 -0.76547053  0.38155091 ... -1.68711303 -0.96522114
  -1.72128114]
 [-0.24806663  0.05221672 -0.94100489 ... -0.19540531  0.15984295
  -0.20650638]
 [ 0.55057475 -1.78516519  0.44988201 ... -1.06474119  1.30921841
   0.85388237]
 ...
 [-0.74095305  0.79021878  0.13239614 ... -0.59160213 -0.17878413
   0.36875579]
 [-0.10260695  0.25591158 -0.01255142 ...  0.12455077  0.19941097
  -0.59145312]
 [-1.15987477  0.56582832 -0.17253258 ...  0.52156693  1.16167555
   0.14769202]]

tensor([[-0.2927, -0.7655,  0.3816,  ..., -1.6871, -0.9652, -1.7213],
        [-0.2481,  0.0522, -0.9410,  ..., -0.1954,  0.1598, -0.2065],
        [ 0.5506, -1.7852,  0.4499,  ..., -1.0647,  1.3092,  0.8539],
        ...,
        [-0.7410,  0.7902,  0.1324,  ..., -0.5916, -0.1788,  0.3688],
        [-0.1026,  0.2559, -0.0126,  ...,  0.1246,  0.1994, -0.5915],
        [-1.1599,  0.5658, -0.1725,  ...,  0.5216,  1.1617,  0.1477]])

tensor([[ 0,  0,  0,  ..., -1,  0, -1],
        [ 0,  0,  0,  ...,  0,  

In [10]:
# Wrap the tensor in a PyTorch Dataset.
# Note: the raw NumPy array `data` cannot be passed here, because it is not a
# tensor; the converted `data_tensor` has to be used instead.
data_set = TensorDataset(data_tensor)

# .tensors is a tuple holding the tensors handed to the constructor. Only the
# data was passed (no labels yet), so the tuple has exactly one element.
for item in (data_set.tensors, len(data_set.tensors), data_set.tensors[0]):
    print(item, '\n')

(tensor([[-0.2927, -0.7655,  0.3816,  ..., -1.6871, -0.9652, -1.7213],
        [-0.2481,  0.0522, -0.9410,  ..., -0.1954,  0.1598, -0.2065],
        [ 0.5506, -1.7852,  0.4499,  ..., -1.0647,  1.3092,  0.8539],
        ...,
        [-0.7410,  0.7902,  0.1324,  ..., -0.5916, -0.1788,  0.3688],
        [-0.1026,  0.2559, -0.0126,  ...,  0.1246,  0.1994, -0.5915],
        [-1.1599,  0.5658, -0.1725,  ...,  0.5216,  1.1617,  0.1477]],
       dtype=torch.float64),) 

1 

tensor([[-0.2927, -0.7655,  0.3816,  ..., -1.6871, -0.9652, -1.7213],
        [-0.2481,  0.0522, -0.9410,  ..., -0.1954,  0.1598, -0.2065],
        [ 0.5506, -1.7852,  0.4499,  ..., -1.0647,  1.3092,  0.8539],
        ...,
        [-0.7410,  0.7902,  0.1324,  ..., -0.5916, -0.1788,  0.3688],
        [-0.1026,  0.2559, -0.0126,  ...,  0.1246,  0.1994, -0.5915],
        [-1.1599,  0.5658, -0.1725,  ...,  0.5216,  1.1617,  0.1477]],
       dtype=torch.float64) 



In [16]:
# Second attempt, this time with labels.
# Rounding up evenly spaced values between 0.01 and 4 gives the class ids
# 1., 2., 3. and 4.; the result is reshaped from a flat list of numbers into
# a column vector with one row per observation.
labels = (
    torch.linspace(start=0.01, end=4, steps=n_observations)
    .ceil()
    .reshape(-1, 1)
)

# Build a dataset that pairs every observation with its label.
data_set = TensorDataset(data_tensor, labels)
print(
    len(data_set),               # Number of observations
    len(data_set.tensors),       # Two tensors: (data, labels)
    data_set.tensors[0].size(),  # [Observations, Features]
    data_set.tensors[1].size(),  # [Observations, 1]
    sep='\n',
)

100
2
torch.Size([100, 20])
torch.Size([100, 1])


# DataLoaders

In [20]:
# Create a DataLoader that serves the dataset in mini-batches.
batch_size  = 25
data_loader = DataLoader(dataset=data_set, batch_size=batch_size)

# The loader keeps a reference to the complete dataset it wraps.
print(data_loader.dataset.tensors[0].shape)

# Sizes of each batch.
# The loop variables are deliberately not called `data`/`label`: `data` is the
# NumPy array created at the top of the notebook, and rebinding it to a batch
# tensor here would silently corrupt any earlier cell that is re-run later.
for batch_data, batch_labels in data_loader:
    print(f'BATCH INFO: {batch_data.size()} | {batch_labels.size()}\n')

# Inspect the labels; each batch is printed as a row vector (.T).
for _, batch_labels in data_loader:
    print(batch_labels.T, '\n')

torch.Size([100, 20])
BATCH INFO: torch.Size([25, 20]) | torch.Size([25, 1])

BATCH INFO: torch.Size([25, 20]) | torch.Size([25, 1])

BATCH INFO: torch.Size([25, 20]) | torch.Size([25, 1])

BATCH INFO: torch.Size([25, 20]) | torch.Size([25, 1])

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]]) 

tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]]) 

tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]]) 

tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]]) 



In [21]:
# Try again with shuffling. The reordering happens while iterating, so every
# pass over the loader yields the observations in a new random order.
data_loader = DataLoader(dataset=data_set, batch_size=batch_size, shuffle=True)

# Inspect the labels.
# The features of each batch are not needed here, and the loop variable must
# not be called `data`: that name is the NumPy array created at the top of the
# notebook, and rebinding it would break earlier cells on re-run.
for _, batch_labels in data_loader:
    print(batch_labels.T, '\n')

tensor([[3., 1., 2., 2., 1., 3., 4., 2., 1., 2., 2., 2., 4., 4., 1., 1., 3., 3.,
         3., 4., 4., 2., 2., 1., 1.]]) 

tensor([[3., 4., 3., 1., 3., 4., 3., 3., 2., 2., 4., 1., 2., 2., 3., 1., 2., 1.,
         3., 4., 2., 3., 1., 3., 2.]]) 

tensor([[3., 2., 2., 4., 3., 2., 3., 3., 3., 3., 2., 2., 1., 1., 4., 2., 2., 1.,
         4., 1., 4., 4., 1., 4., 4.]]) 

tensor([[4., 4., 3., 4., 3., 4., 4., 4., 1., 3., 1., 1., 4., 1., 4., 2., 1., 3.,
         2., 1., 1., 4., 3., 2., 1.]]) 



In [23]:
# To get only one batch (e.g., for testing): create a fresh iterator over the
# loader and take just its first (data, labels) pair.
batch_iterator = iter(data_loader)
dat, labs = next(batch_iterator)

# Labels of that batch, transposed into a row for compact display.
labs.T

tensor([[1., 4., 4., 3., 2., 4., 3., 3., 1., 1., 3., 4., 4., 3., 4., 4., 3., 4.,
         2., 1., 1., 1., 2., 1., 2.]])