# PyTorch Data Loaders

Data loaders feed a model during training by splitting the training dataset into batches. This notebook breaks down how they work.

In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np

In [49]:
import polars as pl
import yfinance as yf
import re

# Download one year of daily SPLG prices from Yahoo Finance (requires network access).
prices = yf.download("SPLG", start='2023-01-01', end='2024-01-01')

# Convert to polars: reset_index() turns the pandas index into a regular
# column, and every row is tagged with the ticker symbol.
df = (
    pl.from_pandas(prices.reset_index())
    .with_columns(pl.lit("SPLG").alias("Ticker"))
)

# Normalise the headers: keep the text before the first comma and strip every
# character that is neither a word character nor whitespace.
# NOTE(review): presumably the incoming headers are stringified tuples such as
# "('Close', 'SPLG')" -- confirm against the installed yfinance version.
df.columns = [re.sub(r"[^\w\s]","",header.split(",")[0]) for header in df.columns]

# Number of leading rows used as the training sample.
N_TRAIN = 100

# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor
# constructor; the resulting values and dtype (float32) are the same.
x_train = torch.tensor(
    df.select("Close", "High", "Low", "Open").to_numpy()[:N_TRAIN],
    dtype=torch.float32,
)
y_train = torch.tensor(df["Volume"].to_list()[:N_TRAIN], dtype=torch.float32)

print(x_train.shape)
print(y_train.shape)

  prices = yf.download("SPLG", start='2023-01-01', end='2024-01-01')
[*********************100%***********************]  1 of 1 completed

torch.Size([100, 4])
torch.Size([100])





In [50]:
# Number of dimensions of the feature and target tensors, one per line.
print(x_train.dim(), y_train.dim(), sep="\n")

2
1


In [51]:
# Wrap only the features; indexing the dataset returns a 1-tuple holding that row.
x_tensor_dataset = TensorDataset(x_train)
x_tensor_dataset[0]

(tensor([43.0921, 43.7176, 42.7553, 43.4866]),)

In [52]:
# Pair features with targets; each item is a (feature_row, target) tuple.
tensor_dataset = TensorDataset(x_train, y_train)
tensor_dataset[0]

(tensor([43.0921, 43.7176, 42.7553, 43.4866]), tensor(3688300.))

In [53]:
# Index the raw tensors directly to see the first feature row and its target.
x_train[0], y_train[0]

(tensor([43.0921, 43.7176, 42.7553, 43.4866]), tensor(3688300.))

In [54]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label.

    Args:
        features: Indexable, sized collection of samples (for example a tensor
            whose first dimension is the sample axis).
        labels: Indexable, sized collection of targets, one per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # __len__ reports len(labels) only, so a length mismatch would
        # otherwise go unnoticed until indexing fails or rows are skipped.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair selected by ``idx``.

        ``idx`` is forwarded unchanged to both containers, so any index they
        accept (an int, a slice, ...) is accepted here as well.
        """
        return self.features[idx], self.labels[idx]

In [55]:
# Build the custom dataset from the training tensors; len() is answered by CustomDataset.__len__.
custom_dataset = CustomDataset(x_train, y_train)
len(custom_dataset)

100

In [56]:
# Slice the dataset: shows the first five feature rows together with their five labels.
custom_dataset[0:5]

(tensor([[43.0921, 43.7176, 42.7553, 43.4866],
         [43.4289, 43.6598, 42.9959, 43.3423],
         [42.9285, 43.1979, 42.8611, 43.1883],
         [43.9004, 44.0351, 42.9285, 43.2845],
         [43.8812, 44.5452, 43.8619, 44.1506]]),
 tensor([3688300., 4335600., 4449300., 2160500., 4251700.]))

## DataLoader

In [57]:
# Number of samples grouped into each batch.
batch_size = 10

# Features-only loader: dataset order is kept (no shuffling) and batches are
# loaded with four workers.
x_train_dataloader = DataLoader(
    x_tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

# Bare expression so the notebook shows the loader's repr.
x_train_dataloader

<torch.utils.data.dataloader.DataLoader at 0xffff34369690>

In [60]:
# Iterate over every batch of the features-only loader but print only the
# first one: its length, its type and its contents.
for i, batch in enumerate(x_train_dataloader):
    if i == 0:
        print(len(batch))
        print(type(batch))
        print(batch)

1
<class 'list'>
[tensor([[43.0921, 43.7176, 42.7553, 43.4866],
        [43.4289, 43.6598, 42.9959, 43.3423],
        [42.9285, 43.1979, 42.8611, 43.1883],
        [43.9004, 44.0351, 42.9285, 43.2845],
        [43.8812, 44.5452, 43.8619, 44.1506],
        [44.1987, 44.1987, 43.7080, 43.8042],
        [44.7569, 44.7569, 44.2854, 44.3623],
        [44.9205, 45.0841, 44.3912, 44.8820],
        [45.1033, 45.1514, 44.5163, 44.5452],
        [44.9974, 45.2669, 44.9301, 45.0744]])]


In [61]:
# Number of samples grouped into each batch.
batch_size = 10

# Loader over (features, target) pairs: dataset order is kept (no shuffling)
# and batches are loaded with four workers.
train_dataloader = DataLoader(
    tensor_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

# Bare expression so the notebook shows the loader's repr.
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0xffff3436b190>

In [65]:
# Iterate over every batch of the features-only loader but print only the
# first one: its length, its type and its contents.
# NOTE(review): this loops over x_train_dataloader, not train_dataloader --
# confirm which loader this cell is meant to inspect.
for i, batch in enumerate(x_train_dataloader):
    if i == 0:
        print(len(batch))
        print(type(batch))
        print(batch)

1
<class 'list'>
[tensor([[43.0921, 43.7176, 42.7553, 43.4866],
        [43.4289, 43.6598, 42.9959, 43.3423],
        [42.9285, 43.1979, 42.8611, 43.1883],
        [43.9004, 44.0351, 42.9285, 43.2845],
        [43.8812, 44.5452, 43.8619, 44.1506],
        [44.1987, 44.1987, 43.7080, 43.8042],
        [44.7569, 44.7569, 44.2854, 44.3623],
        [44.9205, 45.0841, 44.3912, 44.8820],
        [45.1033, 45.1514, 44.5163, 44.5452],
        [44.9974, 45.2669, 44.9301, 45.0744]])]


In [66]:
# Iterate over every batch of the paired loader, unpacking each batch into
# features (x) and targets (y), and print only the first batch.
for i, (x, y) in enumerate(train_dataloader):
    if i == 0:
        print(len(x))
        print(type(x))
        print(x)
        print(y)

10
<class 'torch.Tensor'>
tensor([[43.0921, 43.7176, 42.7553, 43.4866],
        [43.4289, 43.6598, 42.9959, 43.3423],
        [42.9285, 43.1979, 42.8611, 43.1883],
        [43.9004, 44.0351, 42.9285, 43.2845],
        [43.8812, 44.5452, 43.8619, 44.1506],
        [44.1987, 44.1987, 43.7080, 43.8042],
        [44.7569, 44.7569, 44.2854, 44.3623],
        [44.9205, 45.0841, 44.3912, 44.8820],
        [45.1033, 45.1514, 44.5163, 44.5452],
        [44.9974, 45.2669, 44.9301, 45.0744]])
tensor([3688300., 4335600., 4449300., 2160500., 4251700., 2303400., 8119200.,
        3996900., 3308800., 4826600.])
