In [47]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd 
import numpy as np 
import os
import torch
from torch import nn
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader

# Deep Neural Networks Laboratory

For this example, we will use the Wine Quality dataset ( https://archive.ics.uci.edu/dataset/186/wine+quality ) from the UCI ML Repository.

The dataset allows for a classification task on the quality of the wine given a list of attributes.

In [48]:
wine_quality = fetch_ucirepo(id=53)
x = wine_quality['data']['features']
y= wine_quality['data']['targets']

In [49]:
x.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [50]:
y.head()

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


In [51]:
label_encoder = LabelEncoder()
y['class'] = label_encoder.fit_transform(y)
y.head()

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['class'] = label_encoder.fit_transform(y)


Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0


Check if the number of rows in x is equal to the number of targets.

In [52]:
assert(x.shape[0] == y.shape[0])

Let's normalize the dataset.

In [53]:
x = (x - x.mean()) / x.std()
x

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,-0.897674,1.028611,-1.336794,-1.308593
1,-1.139200,-0.124540,-1.336794,-1.308593
2,-1.380727,0.336720,-1.393470,-1.308593
3,-1.501490,0.106090,-1.280118,-1.308593
4,-1.018437,1.259242,-1.336794,-1.308593
...,...,...,...,...
145,1.034539,-0.124540,0.816888,1.443121
146,0.551486,-1.277692,0.703536,0.918985
147,0.793012,-0.124540,0.816888,1.050019
148,0.430722,0.797981,0.930239,1.443121


Since we are using Pytorch, we might want to convert the dataset we are using to a Pytorch Dataset.

This will allow us to use Dataloaders with several useful features.

A custom Dataset class must implement three functions: __init__, __len__, and __getitem__. 

In [54]:
class GenericDataset(Dataset):
    def __init__(self, targets_file, data_file, transform=None, target_transform=None):
        self.targets_file = pd.read_csv(targets_file)
        self.data_dir = pd.read_csv(data_file)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.targets_file)

    def __getitem__(self, idx):
        return self.data_dir.iloc[idx].to_numpy(dtype=np.float32), self.targets_file.iloc[idx].item()

First off, let's save our x and y in a folder.

In [55]:
folder = os.path.join('data', 'iris')
os.makedirs(folder, exist_ok=True)
x.to_csv(os.path.join(folder, 'data.csv'), index=False)
y.to_csv(os.path.join(folder, 'targets.csv'), index=False)

In [56]:
WineDataset = GenericDataset(targets_file=os.path.join(folder, 'targets.csv'), data_file=os.path.join(folder, 'data.csv'))

In [57]:
WineDataset

<__main__.GenericDataset at 0x2b33d43ae50>

Now we can use Pytorch's dataloaders on this dataset.

In [58]:
from torch.utils.data import DataLoader

train_size = int(0.8 * len(WineDataset))
test_size = len(WineDataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(WineDataset, [train_size, test_size])

In [59]:
train_dataset

<torch.utils.data.dataset.Subset at 0x2b33c949670>

Now we can iterate over the train and test datasets.

In [66]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [67]:
x

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,-0.897674,1.028611,-1.336794,-1.308593
1,-1.139200,-0.124540,-1.336794,-1.308593
2,-1.380727,0.336720,-1.393470,-1.308593
3,-1.501490,0.106090,-1.280118,-1.308593
4,-1.018437,1.259242,-1.336794,-1.308593
...,...,...,...,...
145,1.034539,-0.124540,0.816888,1.443121
146,0.551486,-1.277692,0.703536,0.918985
147,0.793012,-0.124540,0.816888,1.050019
148,0.430722,0.797981,0.930239,1.443121


In [68]:
next(iter(train_dataloader))

[tensor([[ 0.6722,  0.1061,  0.9869,  0.7880],
         [ 0.7930, -0.1245,  0.9869,  0.7880],
         [ 0.5515, -1.7390,  0.3635,  0.1328],
         [-1.1392,  0.1061, -1.2801, -1.4396],
         [ 0.5515, -1.2777,  0.6469,  0.3948],
         [-0.4146, -1.5083,  0.0234, -0.1293],
         [ 0.6722, -0.5858,  1.0436,  1.1811],
         [-0.5354,  1.4899, -1.2801, -1.3086],
         [ 2.4837,  1.7205,  1.4970,  1.0500],
         [ 1.1553, -0.5858,  0.5902,  0.2638],
         [ 0.3100, -0.3552,  0.5335,  0.2638],
         [-1.1392,  0.1061, -1.2801, -1.4396],
         [ 0.4307, -0.5858,  0.5902,  0.7880],
         [ 0.1892, -0.1245,  0.5902,  0.7880],
         [ 1.2761,  0.1061,  0.7602,  1.4431],
         [-0.5354, -0.1245,  0.4202,  0.3948],
         [-0.0523, -0.8164,  0.1935, -0.2603],
         [ 1.0345,  0.1061,  1.0436,  1.5742],
         [-0.2939, -1.2777,  0.0801, -0.1293],
         [-1.3807,  0.3367, -1.2234, -1.3086],
         [ 0.3100, -1.0471,  1.0436,  0.2638],
         [ 1.