# Making a PyTorch Dataset

In [3]:
#@title # Run the following cell to install the necessary libraries for this practical. { display-mode: "form" } 
#@markdown Don't worry about what's in this collapsed cell

!pip install -q torch
!pip install -q pandas
print('Downloading BostonHousing.csv...')
!wget https://s3-eu-west-1.amazonaws.com/aicore-portal-public-prod-307050600709/practicals_files/de5ba7fa-5835-4f4b-87c9-58d0d0bf086a/BostonHousing.csv -q


Downloading BostonHousing.csv...


#### 1. Import `torch` and `pandas`.


In [2]:
# Run this cell to import the relevant dependencies
import torch
from numpy import genfromtxt
import numpy as np

#### 2. Define a class called `BostonHousingDataset` that inherits from `torch.utils.data.Dataset`.

#### 3. Inside the class constructor, read in the dataset csv file using `numpy.genfromtxt`.
You will need to add an argument to set the delimiter to comma, and a second argument to set the skip_header parameter. View the docs to determine how to do this.

#### 4. Assign two attributes, `self.X` and `self.Y`, and assign them to your features and labels.
The labels are in the final column of the data, all the other columns are features. Convert the data to torch tensor format as you assign them, and set the datatype to float32. You can look at the docs for `torch.tensor()` for help.

#### 5. Now define the second crucial method of the dataset class: `__getitem__`.
This needs to take in an index of your dataset and return the features and label corresponding to that index.

#### 6. Finally, define the `__len__` method, which defines how your dataset responds to the len() method in python.
It should print the number of rows in your dataset when called.




In [13]:
data = genfromtxt('BostonHousing.csv', delimiter = ',', skip_header=1)
print(data)
# X = data[:,-1]
# print(X)

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 3.9690e+02 4.9800e+00 2.4000e+01]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 3.9690e+02 9.1400e+00 2.1600e+01]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 3.9283e+02 4.0300e+00 3.4700e+01]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 3.9690e+02 5.6400e+00 2.3900e+01]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 3.9345e+02 6.4800e+00 2.2000e+01]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 3.9690e+02 7.8800e+00 1.1900e+01]]


In [24]:
class BostonHousingDataset(torch.utils.data.Dataset):
    
    def __init__(self):
        #TODO - Read in 'BostonHousing.csv' using the `genfromtxt` method
        data = genfromtxt('BostonHousing.csv', delimiter = ',', skip_header=1)
        #TODO - Select features and assign to `self.X`, converting to torch tensor of dtype float32
        self.X = torch.from_numpy(data[:,:-1])
        self.X.to(torch.float32)
        #TODO - Select labels and assign to `self.y`, converting to torch tensor of dtype float32
        self.y = torch.from_numpy(data[:,-1])
        self.y.to(torch.float32)
    
    def __len__(self):
        #TODO - define the __len__ method.
        return len(self.X)

    def __getitem__(self,idx):
        #TODO - Define the __getitem__ method so that it returns the features and labels associated with a given index
        return self.X[idx], self.y[idx]

#TODO - Create an instance of the `BostonHousingDataset` class
dataset = BostonHousingDataset()
#TODO - Confirm correct feature and label slicing, and data type, by printing their shape.
# print('shape of dataset, ' , (dataset))
print('shape of fature, ' , dataset.X.shape)
print('shape of label, ' , dataset.y.shape)
#TODO - Confirm that len method works
print('len of fature, ' , len(dataset.X))
print('len of label, ' , len(dataset.y))
#TODO - Confirm that __getitem__ method works, and that the features have a batch dimension.
print(dataset[0])
dataset.__getitem__(0)


shape of fature,  torch.Size([506, 13])
shape of label,  torch.Size([506])
len of fature,  506
len of label,  506
(tensor([6.3200e-03, 1.8000e+01, 2.3100e+00,        nan, 5.3800e-01, 6.5750e+00,
        6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02, 1.5300e+01, 3.9690e+02,
        4.9800e+00], dtype=torch.float64), tensor(24., dtype=torch.float64))


(tensor([6.3200e-03, 1.8000e+01, 2.3100e+00,        nan, 5.3800e-01, 6.5750e+00,
         6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02, 1.5300e+01, 3.9690e+02,
         4.9800e+00], dtype=torch.float64), tensor(24., dtype=torch.float64))


#### 7. Finally, let's load our data into a dataloader as if we were going to perform minibatch optimisation. 

Create an instance of your BostonHousingDatset class, and pass it as an argument to an instance of the `DataLoader` class (found in `torch.utils.data`). Specify a batch size of 4 and set shuffle to `True`, and call the instance `train_loader`.

In [None]:
# TODO - Create an instance of the `Dataloader` class, passing arguments to dataset, batch size and shuffle parameters.
class Dataloader()





#### 8. We can now test our dataloader by running the command `next(iter(train_loader)`

Print the result.

In [None]:
# TODO - Get an example output from the dataloader.
# TODO - Confirm that features have dimensionality = batch_size x 1 x n_features
# TODO - Confirm labels have dimensionality =  batch_size