TODO: create B2 cell for loading datasets into B2 storage, and link to it from here
    for example this dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

In [238]:
import conda
conda.__version__

'4.4.11'

## 4.1.1 Tabular Data

In [239]:
from torchvision import models
import torch

In [240]:
# CSV writer from: https://stackoverflow.com/questions/45978295/saving-a-downloaded-csv-file-using-python
import requests
import csv
import os.path

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

data_dir = 'data'
file_name = 'winequality-white.csv'
data = os.path.join(data_dir, file_name)

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

request = requests.get(url)

if not os.path.exists(data):
    with open(data, 'w') as file:
        writer = csv.writer(file)
        reader = csv.reader(request.text.splitlines())

        for row in reader:
            writer.writerow(row)

In [241]:
import io
import numpy as np

wineq_numpy = np.loadtxt(data,
                         dtype=np.float32, delimiter=";",
                         skiprows=1)

wineq_numpy 

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [242]:
# vectorize/map info: https://stackoverflow.com/questions/9236926/concatenating-two-one-dimensional-numpy-arrays
wineq_header = np.genfromtxt(data,
                             dtype=np.dtype('U'),
                             delimiter=";",
                             autostrip=True,
                             max_rows=1)

wineq_header = np.array([header.strip('"') for header in wineq_header])

wineq_header

array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'], dtype='<U20')

In [243]:
wineq_header.shape

(12,)

In [244]:
wineq_numpy.shape 

(4898, 12)

In [245]:
wineq = torch.from_numpy(wineq_numpy)

wineq.type() 

'torch.FloatTensor'

In [246]:
wineq.shape

torch.Size([4898, 12])

In [247]:
data = wineq[:, :-1]
target = wineq[:, -1].long()

In [248]:
target_onehot = torch.zeros(target.shape[0], 10) 
target_onehot.scatter_(1, target.unsqueeze(1), 1.0) 


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      1     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 4898x10]

In [249]:
data_mean = torch.mean(data, dim=0)
data_var = torch.var(data, dim=0)

In [250]:
data_norm = (data - data_mean) / torch.sqrt(data_var)

In [251]:
data_bad = data[torch.le(target, 3).long()]
data_good = data[torch.ge(target, 7).long()]

In [252]:
import itertools

# string formating: https://docs.python.org/3/tutorial/inputoutput.html
print("{:<22} {:>10}".format("column", "mean"))
for i in itertools.zip_longest(wineq_header[:-1], torch.mean(data_bad, dim=0).numpy()):
    print("{:<22} {:10.4f}".format(i[0], i[1]))

column                       mean
fixed acidity              6.9971
volatile acidity           0.2701
citric acid                0.3599
residual sugar            20.6227
chlorides                  0.0450
free sulfur dioxide       44.8734
total sulfur dioxide     169.8448
density                    1.0010
pH                         3.0012
sulphates                  0.4501
alcohol                    8.8033


In [253]:
print("{:<22} {:>10}".format("column", "mean"))
for i in itertools.zip_longest(wineq_header[:-1], torch.mean(data_good, dim=0).numpy()):
    print("{:<22} {:10.4f}".format(i[0], i[1]))

column                       mean
fixed acidity              6.8486
volatile acidity           0.2765
citric acid                0.3557
residual sugar            16.5668
chlorides                  0.0459
free sulfur dioxide       38.2911
total sulfur dioxide     161.7762
density                    0.9995
pH                         3.0649
sulphates                  0.4586
alcohol                    8.9518


In [254]:
residual_sugar = data[:,3]

average_residual_sugar_bad = residual_sugar[torch.le(target, 5)].mean()

average_residual_sugar_good = residual_sugar[torch.gt(target, 5)].mean()

residual_sugar_threshold = 0.5 * (average_residual_sugar_good + average_residual_sugar_bad)

print('avr res sugar bad: {:12.4f}'.format(average_residual_sugar_bad))
print('avr res sugar good: {:11.4f}'.format(average_residual_sugar_good))
print('threshold: {:20.4f}'.format(residual_sugar_threshold))

avr res sugar bad:       7.0545
avr res sugar good:      6.0577
threshold:               6.5561


In [255]:
predicted_quality = torch.lt(residual_sugar, residual_sugar_threshold)

known_quality = torch.gt(target, 5)

n_matches = torch.sum(known_quality * predicted_quality)

n_matches

1989

## 4.1.2 Time Series

Data from: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

In [256]:
# zip download from: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url?noredirect=1
import requests, zipfile, io

data_dir = 'data/bikes'
bike_file = 'hour.csv'
bike_data = os.path.join(data_dir, bike_file)

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

bike_zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"

req = requests.get(bike_zip_url)
zipfile = zipfile.ZipFile(io.BytesIO(req.content))
zipfile.extractall(data_dir)


In [257]:
import numpy as np

bikes_numpy = np.loadtxt(bike_data,
                         dtype=np.float32,
                         delimiter=",",
                         skiprows=1,
                         converters={1: lambda x: float(x[8:10])}
                        )
bikes = torch.from_numpy(bikes_numpy)
bikes


     1.0000      1.0000      1.0000  ...       3.0000     13.0000     16.0000
     2.0000      1.0000      1.0000  ...       8.0000     32.0000     40.0000
     3.0000      1.0000      1.0000  ...       5.0000     27.0000     32.0000
                ...                   ⋱                   ...                
 17377.0000     31.0000      1.0000  ...       7.0000     83.0000     90.0000
 17378.0000     31.0000      1.0000  ...      13.0000     48.0000     61.0000
 17379.0000     31.0000      1.0000  ...      12.0000     37.0000     49.0000
[torch.FloatTensor of size 17379x17]

In [258]:
bikes_header = np.genfromtxt(bike_data,
                             dtype=np.dtype('U'),
                             delimiter=",",
                             autostrip=True,
                             max_rows=1)

bikes_header = np.array([header.strip('"') for header in bikes_header])

bikes_header

array(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday',
       'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'casual', 'registered', 'cnt'], dtype='<U10')

In [259]:
bikes_header_nc = np.loadtxt(bike_data,
                             dtype=np.dtype('U'),
                             delimiter=",",
                             )

bikes_header_nc

array([['instant', 'dteday', 'season', ..., 'casual', 'registered',
        'cnt'],
       ['1', '2011-01-01', '1', ..., '3', '13', '16'],
       ['2', '2011-01-01', '1', ..., '8', '32', '40'],
       ...,
       ['17377', '2012-12-31', '1', ..., '7', '83', '90'],
       ['17378', '2012-12-31', '1', ..., '13', '48', '61'],
       ['17379', '2012-12-31', '1', ..., '12', '37', '49']], dtype='<U10')

In [260]:
_, sorted_row_idxs = torch.sort(bikes[:, 0], dim=0)

bikes = bikes[sorted_row_idxs]

In [261]:
bikes.stride()

(17, 1)

In [262]:
daily_bikes = bikes.view(-1, 24, bikes.shape[1])

RuntimeError: invalid argument 2: size '[-1 x 24 x 17]' is invalid for input with 295443 elements at /Users/soumith/minicondabuild3/conda-bld/pytorch_1518385717421/work/torch/lib/TH/THStorage.c:37

In [263]:
bikes.transpose(1, 2)

RuntimeError: dimension out of range (expected to be in range of [-2, 1], but got 2)

In [264]:
bikes.shape

torch.Size([17379, 17])

In [265]:
daily_bikes.shape

NameError: name 'daily_bikes' is not defined

## 4.1.3 Text

### Character-level encoding

In [266]:
# CSV writer from: https://stackoverflow.com/questions/45978295/saving-a-downloaded-csv-file-using-python
import requests
import os.path

text_url = "http://www.gutenberg.org/files/1342/1342-0.txt"

data_dir = 'data'
text_filename = '1342-0.txt'
text_path = os.path.join(data_dir, text_filename)

request = requests.get(text_url)

text_data = request.text

if not os.path.exists(text_path):
    with open(text_path, 'w') as file:
        file.write(text_data)
        file.close


In [267]:
with open('data/1342-0.txt') as f:
    text_data = f.read()

In [268]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,l'"
n_letters = len(all_letters)

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if
                  unicodedata.category(c) != 'Mn' and c in all_letters)

In [269]:
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,l'"

In [270]:
n_letters

57

In [271]:
unicode_to_ascii("sl0.")

'sl.'

In [272]:
lines = text_data.split('\n')
line = lines[200]
#line = '“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [273]:
import torch

tensor = torch.zeros(len(line), n_letters)
tensor.shape

torch.Size([70, 57])

In [274]:
for i, letter in enumerate(line.lower().strip()):
    letter_index = all_letters.find(letter)
    tensor[i][letter_index] = 1

In [275]:
tensor = torch.unsqueeze(tensor, 1)
tensor.shape

torch.Size([70, 1, 57])

### Word-level encoding

In [276]:
all_words = text_data.lower().replace('\n', ' ').split()
punctuation = '.,;:"!?”“_-'

all_words = {word.strip(punctuation): i for (i, word) in
            enumerate(all_words)}

len(all_words)

7261

In [277]:
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [278]:
lines = text_data.split('\n')

line = lines[200]

words_in_line = [word.strip(punctuation) for word in line.lower().split(' ')]

words_in_line

['impossible',
 'mr',
 'bennet',
 'impossible',
 'when',
 'i',
 'am',
 'not',
 'acquainted',
 'with',
 'him']

In [279]:
tensor = torch.zeros(len(words_in_line), len(all_words))
tensor.shape

torch.Size([11, 7261])

In [280]:
for i, word in enumerate(words_in_line):
    word_index = all_words[word]
    tensor[i][word_index] = 1

IndexError: index 116803 is out of range for dimension 0 (of size 7261)

In [281]:
tensor = tensor.unsqueeze(1)
tensor.shape

torch.Size([11, 1, 7261])

## 4.1.4 Audio

In [282]:
import scipy.io.wavfile as wavfile
import os

audio_dir = 'data/esc-50/'
audio_file = '1-100038-A-14.wav'
audio = os.path.join(audio_dir, audio_file)

freq, waveform_arr = wavfile.read(audio)

FileNotFoundError: [Errno 2] No such file or directory: 'data/esc-50/1-100038-A-14.wav'

In [106]:
freq

44100

In [107]:
waveform_arr

array([ -388, -3387, -4634, ...,  2289,  1327,    90], dtype=int16)

In [108]:
waveform = torch.from_numpy(waveform_arr).float()

In [109]:
from scipy import signal

f_arr, t_arr, sp_arr = signal.spectrogram(waveform_arr, freq)

In [31]:
sp = torch.from_numpy(sp_arr)
sp


 4.3517e+00  1.4044e+00  2.7865e-04  ...   1.3261e-01  8.4661e-03  7.7487e+00
 4.4579e+01  3.3186e+00  5.3582e+00  ...   2.7802e+01  1.2889e+01  1.6912e+01
 9.5455e+01  2.9964e+01  7.6881e+01  ...   2.8018e+01  2.5155e+01  1.6094e+02
                ...                   ⋱                   ...                
 2.3361e-06  1.3716e-06  1.1413e-05  ...   3.5789e-06  4.2816e-06  6.9388e-06
 2.1429e-06  1.4071e-06  9.2230e-07  ...   7.9787e-07  1.8314e-06  1.0062e-06
 5.5598e-06  3.1114e-06  1.1163e-05  ...   6.2336e-07  6.3296e-07  2.6184e-06
[torch.FloatTensor of size 129x984]

## 4.1.5 Images

In [283]:
!conda install -y -c conda-forge imageio

Solving environment: done


  current version: 4.4.11
  latest version: 4.5.0

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /Users/danimad/anaconda3

  added / updated specs: 
    - imageio


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2018.1.18          |           py36_0         143 KB  conda-forge
    anaconda-custom            |   py36ha4fed55_0           6 KB
    conda-4.3.34               |           py36_0         515 KB  conda-forge
    openssl-1.0.2n             |                0         3.3 MB  conda-forge
    imageio-2.3.0              |           py36_0         3.3 MB  conda-forge
    ca-certificates-2018.1.18  |                0         141 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         7.3 MB

The followi

In [4]:
import imageio
import os

data_dir = 'data'
img_file = 'dog.jpg'
image = os.path.join(data_dir, img_file)

img_arr = imageio.imread(image)

img_arr.shape

(720, 1280, 3)

In [7]:
import torch

img = torch.from_numpy(img_arr)
out = torch.transpose(img, 0, 2)