# Synthetic Regression Data



In [1]:
%matplotlib inline
import random
import torch
from d2l import torch as d2l

$$\mathbf{y} = \mathbf{X} \mathbf{w} + b + \boldsymbol{\epsilon}$$

In [3]:
class SyntheticRegressionData(d2l.DataModule):
    """Synthetic dataset for linear regression with additive Gaussian noise.

    Features are sampled from a standard normal distribution and labels are
    computed as ``y = X w + b + eps``, where ``eps`` is standard normal noise
    scaled by ``noise``.

    Args:
        w: Weight tensor; ``len(w)`` sets the number of feature columns and
            it is reshaped to a column vector before the product.
        b: Bias added to every label.
        noise: Scale (standard deviation) of the label noise.
        num_examples: Number of rows to generate.
        batch_size: Minibatch size read by the data loaders.
    """

    def __init__(self, w, b, noise=0.01, num_examples=1000,
                 batch_size=8):
        super().__init__()
        # Kept ahead of any new local variable. Later cells read
        # self.num_examples / self.batch_size, which are presumably set
        # here from the constructor arguments -- TODO confirm against d2l.
        self.save_hyperparameters()
        num_features = len(w)
        # Draw order matters for reproducibility: features first, then noise.
        self.X = torch.randn(num_examples, num_features)
        epsilon = noise * torch.randn(num_examples, 1)
        weight_column = w.reshape((-1, 1))
        self.y = self.X @ weight_column + b + epsilon

# Ground-truth parameters: w = [2, -3.4] and b = 4.2. noise, num_examples and
# batch_size keep their defaults (0.01, 1000 and 8).
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)

Each row of `data.X` (the features) is a vector in $\mathbb{R}^2$, and each row of `data.y` (the labels) is a scalar.

In [4]:
# Inspect the first example: one feature row and its corresponding label.
print('features:', data.X[0],'\nlabel:', data.y[0])

features: tensor([-0.6172,  0.9769]) 
label: tensor([-0.3432])


Define the `train_dataloader` method. It reads the batch size, the feature
matrix, and the label vector from the data object, shuffles the example
indices, and yields minibatches of size `batch_size`.

In [6]:
@d2l.add_to_class(SyntheticRegressionData)
def train_dataloader(self):
    """Yield shuffled ``(features, labels)`` minibatches, one pass over the data.

    Each call draws a new random ordering of the example indices. Every
    batch holds ``self.batch_size`` examples except possibly the last one.
    """
    order = list(range(self.num_examples))
    random.shuffle(order)
    for start in range(0, self.num_examples, self.batch_size):
        # List slicing clamps at the end of the list, so the final batch is
        # simply shorter when the data does not divide evenly.
        picked = torch.tensor(order[start:start + self.batch_size])
        yield self.X[picked], self.y[picked]

# Pull only the first minibatch and check its shapes; features and labels are
# indexed with the same batch indices, so their leading dimensions match.
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)

X shape: torch.Size([8, 2]) 
y shape: torch.Size([8, 1])


Alternatively, use the framework's built-in data-loading API (`torch.utils.data.DataLoader`) instead of writing the iterator by hand.

In [8]:
def tensorloader(tensors, batch_size, shuffle):
    """Build a PyTorch ``DataLoader`` over tensors that share a first dimension.

    Args:
        tensors: Iterable of tensors (e.g. ``(features, labels)``); they are
            unpacked into a ``TensorDataset``.
        batch_size: Number of examples per minibatch.
        shuffle: Whether the loader reshuffles the examples on each pass.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding one batch per tensor.
    """
    data_utils = torch.utils.data
    paired = data_utils.TensorDataset(*tensors)
    return data_utils.DataLoader(paired, batch_size, shuffle=shuffle)
@d2l.add_to_class(SyntheticRegressionData)
def train_dataloader(self):
    """Return a shuffling loader over the stored features and labels.

    This re-registers ``train_dataloader`` on the class, so it takes the
    place of the hand-written generator defined earlier in the notebook.
    """
    features_and_labels = (self.X, self.y)
    return tensorloader(features_and_labels, self.batch_size, shuffle=True)

# Fetch one minibatch from the DataLoader-backed implementation; the cell
# displays the resulting [features, labels] pair.
next(iter(data.train_dataloader()))

[tensor([[-1.1457,  0.8551],
         [-1.8158, -0.6051],
         [-0.2431, -0.2181],
         [ 0.0175,  0.4087],
         [-0.1382,  1.1670],
         [ 0.0506,  0.7525],
         [-0.4986, -0.0107],
         [ 0.7206, -0.4952]]),
 tensor([[-0.9905],
         [ 2.6317],
         [ 4.4656],
         [ 2.8253],
         [-0.0310],
         [ 1.7274],
         [ 3.2372],
         [ 7.3350]])]

In [9]:
# Number of minibatches per pass: 1000 examples / batch size 8 = 125.
len(data.train_dataloader())

125