# Tabular data

In [7]:
import csv
import torch
import numpy as np

# File name
wine_path = 'data/winequality-white.csv'
# Data Type is float, delimiter is ';' as the file uses it to separate data
# This also skips row 1 as that is the row containing the headers
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";", skiprows=1)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [8]:
# Check that all the data has been read
col_list = next(csv.reader(open(wine_path), delimiter=';'))
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

Every row in the table is independent from the others and order doesn't matter. No column encoded information on what rows came before and what rows came after, and as such is a flat table.

In [9]:
# convert from NumPy array to PyTorch Tensor
wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.type()

(torch.Size([4898, 12]), 'torch.FloatTensor')

In [10]:
# strip the categorical column from the tensor
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [11]:
# Save just the categorical column from the tensro
target = wineq[:, -1] 
target, target.shape

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [12]:
# Quantitative Labeling of Categorical Values
target = wineq[:, -1].long()
target

tensor([6, 6, 6,  ..., 6, 7, 6])

In [13]:
# Qualitative Labeling of Categorical Values
target_onehot = torch.zeros(target.shape[0], 10)
# scatter_ takes 3 inputs:
# 1) The dimension along which the following two arguments are specified
# 2) A column tensor indicating the indices of the elements to scatter
#   - unsqueeze - add an extra dummy_dimension to match dimensions
# 3) A tensor containing the elements to scatter or a single scalar to scatter (1, in this case)
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [15]:
# calculate the mean
data_mean = torch.mean(data, dim=0)
# calculate the std
data_var = torch.var(data, dim=0)
# normalize the data
data_normalized = (data - data_mean) / torch.sqrt(data_var)
data_normalized

tensor([[ 1.7209e-01, -8.1764e-02,  2.1325e-01,  ..., -1.2468e+00,
         -3.4914e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7991e-02,  ...,  7.3992e-01,
          1.3467e-03, -8.2418e-01],
        [ 1.4756e+00,  1.7448e-02,  5.4378e-01,  ...,  4.7502e-01,
         -4.3677e-01, -3.3662e-01],
        ...,
        [-4.2042e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3131e+00,
         -2.6152e-01, -9.0544e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0048e+00,
         -9.6250e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7502e-01,
         -1.4882e+00,  1.0448e+00]])

In [16]:
# We could determine which rows in target correspond to a score less than or equal to 3
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [17]:
# advanced indexing - use a binary tensor to index the data tensor
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

## Data Exploration
At first glance, the bad wines seem to have higher total sulfur dioxide, among other differences. You could use a threshold on total sulfur dioxide as a crude criterion for discriminating good wines from bad ones.

In [37]:
# extract the three groups of data into their own slices
bad_data = data[torch.le(target, 3)]
mid_data = data[torch.gt(target, 3) & torch.lt(target, 7)] 
good_data = data[torch.ge(target, 7)]

# Calculate the mean of each slice
bad_mean = torch.mean(bad_data, dim=0) 
mid_mean = torch.mean(mid_data, dim=0) 
good_mean = torch.mean(good_data, dim=0)

# enumerate the three slices
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))ormat(i, *args))

0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [38]:
# midpoint of the total sulfur dioxide
total_sulfur_threshold = 141.83
# isolate the data with just the total sulfur dioxide (col 6)
total_sulfur_data = data[:,6]
# get the indexes in which the total sulfur dioxide column is below the mid-point
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [39]:
# get the data that is labeled as the best wine
actual_indexes = torch.gt(target, 5)
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

Because you have about 500 more good wines than your threshold predicted, you already have hard evidence that the threshold isn’t perfect

In [42]:
# compare est with act to determine how well you did
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

You correctly match 2018 records! That resulted in a 74% accuracy in your single variable prediction.
Unfortunately, the correct score is only 61%.
This shows that making a predicition only using a single variable creates a bad prediction method.
This can be overcome using a layered Deep Learning model, but it is a good example of how to use PyTorch to perform Data Exploration with Tableu data

# Time series

In [54]:
bikes_path = 'data/hour-fixed.csv'
# Convert date strings to numbers corresponding to the day of the month in column 1.
bikes_numpy = np.loadtxt(bikes_path,dtype=np.float32,delimiter=",",skiprows=1,converters={1: lambda x: float(x[8:10])})
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [55]:
# 17,520 hours, 17 columns
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [56]:
# reshape the data to have three axes (day, hour, and then your 17 columns)
# -1 means the row is inferred, and results in 730
# bikes.shape[1] = 17
# 24 * 17 = 408 -> the number of strides for for row 1
daily_bikes = bikes.view(-1, 24, bikes.shape[1])
# Row 1 is the day, row 2 is the hour, and row 3 is the data
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [57]:
# In order to get the proper NxCxL order, you must trasnpose (day, 17 columns, hour)
daily_bikes = daily_bikes.transpose(1, 2)
# think of this as each page is the day, and that page contains a table of the hourly data for the day
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

## Data Exploration
limit yourself to the first day for now

In [59]:
# initialize a zero-filled matrix with a number of rows equal to the number of hours in the day
# and a number of columns equal to the number of weather levels
first_day = bikes[:24].long()
weather_onehot = torch.zeros(first_day.shape[0], 4)
first_day[:,9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [60]:
# complete the one-hot encoding
# You’re decreasing the values by 1 in index because the weather situation ranges from 1 to 4,
# whereas indices are 0-based.
weather_onehot.scatter_(dim=1, index=first_day[:,9].unsqueeze(1) - 1, value=1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [61]:
# concatenate the one-hot data with the whole data for the one day
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [62]:
# Alternatively you could have used the daily_bikes
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [64]:
# then scatter
daily_weather_onehot.scatter_(1, daily_bikes[:,9,:].long().unsqueeze(1) - 1, 1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [65]:
# concatenate
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [66]:
# Since this is ordinal data, we could normalize it
daily_bikes[:, 9, :] = (daily_bikes[:, 9, :] - 1.0) / 3.0

In [68]:
# multiple ways to rescale the variables, depending on your NN needs
temp = daily_bikes[:, 10, :]
temp_min = torch.min(temp)
temp_max = torch.max(temp)
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - temp_min) / (temp_max - temp_min)

In [70]:
temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - torch.mean(temp)) / torch.std(temp)

# Text

In [71]:
jane_path = 'data/1342-0.txt'
with open(jane_path, encoding='utf8') as f:
    text = f.read()

In [75]:
# split your text into a list of lines and pick an arbitrary line to focus on
lines = text.split('\n')
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [76]:
# Create a tensor that can hold the total number of one-hot encoded characters for the whole line
# 128 hardcoded due to the limits of ASCII
letter_tensor = torch.zeros(len(line), 128)
letter_tensor.shape

torch.Size([70, 128])

In [77]:
for i, letter in enumerate(line.lower().strip()):
    # The text uses directional double quotes, which aren’t valid ASCII, so screen them out here.
    letter_index = ord(letter) if ord(letter) < 128 else 0
    letter_tensor[i][letter_index] = 1

In [78]:
def clean_words(input_str):
    """
    This converts all letters to lower case, replaces any new lines with
    a space, and splits on the words.
    Then it removes the punctuation for all words in the list
    Input:
    * input_str: a string of ascii characters, representing a sentence
    Output:
    * returns a clean list of words based on the input sentence
    """
    punctuation = '.,;:"!?”“_-'
    word_list = input_str.lower().replace('\n',' ').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [80]:
# build a mapping of words to indexes in your encoding
# Sort the cleansed words of the whole text file
word_list = sorted(set(clean_words(text)))
# create a key:value pair dictionary using key=word and value=index
word2index_dict = {word: i for (i, word) in enumerate(word_list)}
# 7261 words, impossible shows up 3394 times
len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [82]:
# create a zero tensor to hold the dictionary for the one liner
word_tensor = torch.zeros(len(words_in_line), len(word2index_dict))
# create key;value pairs for the one liner
for i, word in enumerate(words_in_line):
    # populate the records in the dictionary
    word_index = word2index_dict[word]
    word_tensor[i][word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))
# one sentence of length 11 in an encoding space of size 7261—the number of words in your dictionary
print(word_tensor.shape)

0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])


# Images
img is a NumPy array-like object with three dimensions: two spatial dimensions (width and height) and a third dimension corresponding to the channels red, green, and blue. (W x H x C)

In [83]:
import imageio
img_arr = imageio.imread('data/bobby.jpg')
img_arr.shape

(720, 1280, 3)

PyTorch modules that deal with image data require tensors to be laid out as C x H x W (channels, height, and width, respectively).

In [85]:
img = torch.from_numpy(img_arr)
out = torch.transpose(img, 0, 2)

In [87]:
# you can preallocate a tensor of appropriate size and fill it with images loaded from a directory
batch_size = 100
# N(100) x C(3) x H(256) x W(256)
batch = torch.zeros(100, 3, 256, 256, dtype=torch.uint8)

In [90]:
# load all png images from an input directory and store them in the tensor
import os
data_dir = 'data/image-cats/'
filenames = [name for name in os.listdir(data_dir) if os.path.splitext(name) == '.png']
for i, filename in enumerate(filenames):
    img_arr = imageio.imread(filename)
    batch[i] = torch.transpose(torch.from_numpy(img_arr), 0, 2)

A typical thing that you’ll want to do is cast a tensor to floating-point and normalize the values of the pixels

In [92]:
# One possibility is to divide the values of pixels by 255
batch = batch.float()
batch /= 255.0

In [93]:
# Another possibility is to compute mean and standard deviation of the input data and scale
# it so that the output has zero mean and unit standard deviation across each channel
n_channels = batch.shape[1]
for c in range(n_channels):
    mean = torch.mean(batch[:, c])
    std = torch.std(batch[:, c])
    batch[:, c] = (batch[:, c] - mean) / std

# Volumetric data

In [95]:
import imageio
dir_path = "data/images-lungs/" 
# Load a sample CT scan by using the volread function in the imageio module
# which takes a directory as argument and assembles all DICOM files in a 
# series in a NumPy 3D array
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

Reading DICOM (examining files):1/99 files (1.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data):74/99  (74.799/99  (100.0%)


(99, 512, 512)

the layout is different from what PyTorch expects, due to the lack of channel information

In [97]:
vol = torch.from_numpy(vol_arr).float()
vol = torch.transpose(vol, 0, 2)
# make room for the channel dimension by using unsqueeze
vol = torch.unsqueeze(vol, 0)
# C x H x W x N
vol.shape
# At this point, you could assemble a 5D data set by stacking multiple volumes along the batch direction
# This example is just with a single volume.

torch.Size([1, 512, 512, 99])

# Audio

In [101]:
import scipy.io.wavfile as wavfile
wave_path = 'data/1-100038-A-14.wav'
# returns two outputs, namely the sampling frequency and the waveform as a 16-bit integer 1D array
freq, waveform_arr = wavfile.read(wave_path)
freq, waveform_arr

(44100, array([ -388, -3387, -4634, ...,  2289,  1327,    90], dtype=int16))

It's a single 1D array, which tells us that it's a mono recording - we'd have two waveforms (two channels) if the sound were stereo.

In [102]:
waveform = torch.from_numpy(waveform_arr).float()
waveform.shape

torch.Size([220500])

For architectures based on filtering the 1D signal with cascades of learned filter banks, such as convolutional networks, we would need to lay out the tensor as `N x C x L`, where `N` is the number of sounds in a dataset, `C` the number of channels and `L` the number of samples in time.

Conversely, for architectures that incorporate the notion of temporal sequences, just as recurrent networks we mentioned for text, data needs to be laid out as `L x N x C` - sequence length comes first. Intuitively, this is because the latter architectures take one set of `C` values at a time - the signal is not considered as a whole, but as an individual input changing in time.

In [106]:
from scipy import signal
# Convert the audio file into a spectrogram
f_arr, t_arr, sp_arr = signal.spectrogram(waveform_arr, freq)
# Convert the resulting NumPy frequency array into a Tensor
sp_mono = torch.from_numpy(sp_arr)
sp_mono.shape

torch.Size([129, 984])

In [105]:
# convert the three spectrograms into Tensors
sp_left = sp_right = sp_arr
sp_left_t = torch.from_numpy(sp_left)
sp_right_t = torch.from_numpy(sp_right)
sp_left_t.shape, sp_right_t.shape

(torch.Size([129, 984]), torch.Size([129, 984]))

In [107]:
# Stack the Tensors into a single spectrogram
sp_t = torch.stack((sp_left_t, sp_right_t), dim=0)
sp_t.shape

torch.Size([2, 129, 984])

If we want to build a dataset to use as input for a network, we will stack multiple spectrograms representing multiple sounds in a dataset along the first dimension, leading to a `N x C x F x T` tensor.

Such tensor is indistinguishable from what we would build for a dataset set of images, where `F` is represents rows and `T` columns of an image. Indeed, we would tackle a sound classification problem on spectrograms with the exact same networks.

# Video

In [120]:
import imageio
video_path = 'data/cockatoo.mp4'
# create a reader instance for the video
reader = imageio.get_reader(video_path)
meta = reader.get_meta_data()
meta

{'plugin': 'ffmpeg',
 'nframes': inf,
 'ffmpeg_version': '4.2.2 built with Apple clang version 11.0.0 (clang-1100.0.33.8)',
 'codec': 'h264',
 'pix_fmt': 'yuv444p',
 'fps': 20.0,
 'source_size': (1280, 720),
 'size': (1280, 720),
 'duration': 14.0}

In [131]:
n_channels = 3
n_frames = meta['nframes']
video = torch.empty(n_channels, n_frames, *meta['size']
ideo.shape
#

0

In [127]:
# iterate over the reader and set the values for all three channels into in the proper `i`-th time slice
for i, frame_arr in enumerate(reader):
    frame = torch.from_numpy(frame_arr).float()
    n_channels = 3
    n_frames = meta['nframes']
    video = torch.empty(n_channels, n_frames, (1280, 720))

video.shape

In the above, we iterate over individual frames and set each frame in the `C x T x H x W` video tensor, after transposing the channel. We can then obtain a batch by stacking multiple 4D tensors or pre-allocating a 5D tensor with a known batch size and filling it iteratively, clip by clip, assuming clips are trimmed to a fixed number of frames.

Equating video data to volumetric data is not the only way to represent video for training purposes. This is a valid strategy if we deal with video bursts of fixed length. An alternative strategy is to resort to network architectures capable of processing long sequences and exploiting short and long-term relationships in time, just like for text or audio.
// We'll see this kind of architectures when we take on recurrent networks.

This next approach accounts for time along the batch dimension. Hence, we'll build our dataset as a 4D tensor, stacking frame by frame in the batch:


In [None]:
time_video = torch.empty(n_frames, n_channels, *meta['size'])

for i, frame in enumerate(reader):
    frame = torch.from_numpy(frame).float()
    time_video[i] = torch.transpose(frame, 0, 2)

time_video.shape