# Data

## Load Data From Anywhere

Install the required packages

In [None]:
%%capture
%pip install datasets libsvmdata

Define the imports

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

import sklearn.datasets as skdata
from sklearn.preprocessing import LabelEncoder
import libsvmdata
import tensorflow as tf
import tensorflow_datasets as tfds
import datasets

### Simple dataset

In [None]:
def load_data(dataset_id):
    """Return a synthetic ``(x, y)`` pair for the requested dataset.

    Only ``'my_dataset'`` is recognised. For it, ``x`` is a ``(100, 10)``
    array of standard-normal draws and ``y`` holds 100 integer labels drawn
    from ``{0, 1}``. Both are sampled from NumPy's global random state, so
    seed it beforehand if reproducible output is needed.

    Raises:
        ValueError: if ``dataset_id`` is not a known identifier.
    """
    # Guard clause: reject identifiers we do not know how to build.
    if dataset_id != 'my_dataset':
        raise ValueError(f'Unknown dataset_id: {dataset_id}')

    features = np.random.randn(100, 10)
    labels = np.random.randint(0, 2, 100)
    return features, labels

In [None]:
# Seed NumPy's global RNG first: load_data draws from it, so this makes the
# synthetic dataset reproducible across runs.
np.random.seed(1337)

dataset_id = 'my_dataset'
x, y = load_data(dataset_id)

# Sanity check: features and labels should agree on the number of samples.
x.shape, y.shape


### Scikit-learn

Web: https://scikit-learn.org/stable/datasets.html

Sklearn tabular

In [None]:
# "load" - load small standard datasets
# "fetch" - download and load a dataset

iris = skdata.load_iris()  # small tabular dataset
x, y = iris.data, iris.target

In [None]:
x[:5,:], y[:5]

Sklearn text

In [None]:
news = skdata.fetch_20newsgroups() # text dataset
x, y = news.data, news.target

In [None]:
x[:2]

In [None]:
y[:2]

### LibSVM

Web: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/

In [None]:
# Beware of the format of the data! Here, sparse data is returned
x, y = libsvmdata.fetch_libsvm('a9a', normalize=True, verbose=True)

In [None]:
type(x)

In [None]:
x = x.toarray()

In [None]:
x[:2,:5]

In [None]:
y

In [None]:
# Encode the labels as consecutive integers, since libsvm returns something
# like [-1, 1] or [1, 2, 3, ...]. LabelEncoder is imported in the imports cell
# at the top of the notebook.
y = LabelEncoder().fit_transform(y)

In [None]:
y

### TensorFlow DataSets

Web: https://www.tensorflow.org/datasets/catalog/overview

In [None]:
tf.config.set_visible_devices([], device_type='GPU')  # "conceal" GPUs from TFDS

# Load MNIST together with its metadata (`info`).
data, info = tfds.load(
    name='mnist',
    split=['train', ],  # a list of splits -> the result is a list, one entry per split
    batch_size=-1,  # -1 asks TFDS for the whole split as a single batch
    shuffle_files=False,
    as_supervised=True,  # each entry is an (input, label) pair
    with_info=True,
)

In [None]:
# Beware of the format of the data! Here, tensors are returned
data = tfds.as_numpy(data)
# `split` was passed as a one-element list, so the train split sits at index 0;
# with as_supervised=True that entry unpacks into (inputs, labels).
x, y = data[0]

In [None]:
x[0].shape

In [None]:
plt.imshow(x[0], cmap='gray')

### HuggingFace Datasets

Web: https://huggingface.co/datasets

In [None]:
data = datasets.load_dataset(
    'lizziepikachu/starwars_planets',
    # cache_dir=data_folder,
)

In [None]:
data

In [None]:

df = data['train'].to_pandas()
df.head()

In [None]:
# Target: the 'population' column; features: every remaining column.
y = df['population'].to_numpy()
x = df.drop(columns=['population']).to_numpy()

In [None]:
x[:5]

### Summary
- data loading signatures are more or less the same;
- always check the type and range of the returned values;
- more often than not everything can be converted to NumPy arrays.

## DataLoader Class

Needed for
* Batching: load multiple samples at once to optimize GPU usage;
* Shuffling: prevent the model from overfitting to the order in which samples are presented;
* Streaming: load large datasets efficiently from disk;
* Preprocessing on the Fly: apply transformations while loading.

### Simple Dataloader

In [None]:
class SimpleDataLoader:
    """Minimal in-memory mini-batch iterator over paired samples and targets.

    Args:
        x: Samples. Must support ``len()`` and indexing with an integer index
            array (e.g. a NumPy array).
        y: Targets aligned with ``x``: same length, same indexing support.
        batch_size: Positive number of samples per batch. The last batch is
            smaller when ``len(x)`` is not a multiple of ``batch_size``.
        shuffle: If True, samples are visited in a new random order on every
            pass. Defaults to False (batches follow the original order).
        seed: Optional seed for the shuffling generator. Only matters when
            ``shuffle`` is True.

    Raises:
        ValueError: if ``batch_size`` is not positive, or if ``x`` and ``y``
            differ in length.
    """

    def __init__(self, x, y, batch_size, shuffle=False, seed=None):
        # Fail early: a negative step would make range() silently yield no
        # batches, and a zero step would only blow up once iteration starts.
        if batch_size < 1:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same length, got {len(x)} and {len(y)}'
            )

        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n = len(x)
        self.indices = np.arange(self.n)
        self._rng = np.random.default_rng(seed)

    def __len__(self):
        """Return the number of batches in one full pass (ceil division)."""
        return (self.n + self.batch_size - 1) // self.batch_size

    # makes this class iterable
    def __iter__(self):
        """Yield successive ``(x_batch, y_batch)`` pairs covering all samples."""
        if self.shuffle:
            # Fresh permutation per pass, so every epoch sees a new order.
            self.indices = self._rng.permutation(self.n)

        for start in range(0, self.n, self.batch_size):
            batch_indices = self.indices[start:start + self.batch_size]
            yield self.x[batch_indices], self.y[batch_indices]

What is this `yield` in Python? A function that contains `yield` is a **generator function**: calling it returns a **generator**, a special kind of iterator whose execution can be paused at each `yield` and resumed later.

Difference Between `return` and `yield`:
- `return` **stops** the function and returns a value.
- `yield` **pauses** the function and lets it resume later.

In [None]:
batch_size = 4
dataloader = SimpleDataLoader(x, y, batch_size)

# Peek at the first few batches only; `enumerate` supplies the batch counter
# so it does not have to be maintained by hand.
max_batches = 2
for i, (x_batch, y_batch) in enumerate(dataloader):
    if i == max_batches:
        break

    # do something with the batch
    # (assumes the first feature column holds the planet name -- TODO confirm)
    print(f'Planets in the batch: {x_batch[:,0]}')

### TFDS DataLoader

Signature: https://www.tensorflow.org/datasets/api_docs/python/tfds/load

In [None]:
# Hide all GPUs from TensorFlow so it does not grab GPU resources
tf.config.set_visible_devices([], device_type='GPU')

# Train set as a pipeline object: nothing is batched in this cell; batching,
# shuffling and augmentation are chained onto `train_loader` further below.
train_loader, info = tfds.load(
    "cifar10",
    split="train",
    as_supervised=True,  # elements are (image, label) pairs
    with_info=True,  # also return the dataset metadata as `info`
    # data_dir=data_dir,
)

# Dataset facts read from the metadata instead of being hard-coded.
num_classes = info.features["label"].num_classes
img_size = info.features["image"].shape
n_samples = info.splits["train"].num_examples

In [None]:
img_size, n_samples

In [None]:
x_batched, y_batched = next(iter(train_loader))

In [None]:
x_batched.shape

In [None]:
plt.imshow(x_batched)

In [None]:
def augment(image, label):
    """Apply a chain of random augmentations to one image; the label is returned unchanged.

    The steps run in a fixed order: pad/crop to 40x40, random 32x32x3 crop,
    random horizontal flip, then random brightness, contrast and saturation
    jitter.
    """
    # recommended augmentation from https://optax.readthedocs.io/en/latest/_collections/examples/cifar10_resnet.html
    padded = tf.image.resize_with_crop_or_pad(image, 40, 40)
    cropped = tf.image.random_crop(padded, [32, 32, 3])
    flipped = tf.image.random_flip_left_right(cropped)

    # Photometric jitter.
    brightened = tf.image.random_brightness(flipped, max_delta=0.2)
    contrasted = tf.image.random_contrast(brightened, 0.8, 1.2)
    augmented = tf.image.random_saturation(contrasted, 0.8, 1.2)
    return augmented, label

batch_size = 8
seed = 1337

# Pipeline order: augment every example, shuffle through a bounded buffer
# (reshuffled on each pass, seeded for reproducibility), group into fixed-size
# batches, then prefetch.
train_loader_batched = train_loader.map(augment).shuffle(
    buffer_size=10_000,  # 1/5 of the dataset
    reshuffle_each_iteration=True,
    seed=seed,
).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

In [None]:
x_batched, y_batched = next(iter(train_loader_batched))

In [None]:
x_batched.shape


In [None]:
plt.imshow(x_batched[1])

## Data Processing

Let's go back to text data, which is the focus of our project.

In [None]:
ds = tfds.load("tiny_shakespeare")

In [None]:
# train + test examples decoded and merged into one training string
text_train = "".join(
    example["text"].decode("utf-8")
    for example in ds["train"].concatenate(ds["test"]).as_numpy_iterator()
)

# the validation split gets its own single string, built the same way
text_validation = "".join(
    example["text"].decode("utf-8")
    for example in ds["validation"].as_numpy_iterator()
)

In [None]:
text_train[:100]

### Text Encoding: Character-level

In [None]:
vocab = sorted(list(set(text_train)))

In [None]:
print(vocab)

In [None]:
# Two lookup tables over the vocabulary: character -> integer id and back.
stoi = {ch: idx for idx, ch in enumerate(vocab)}
itos = dict(enumerate(vocab))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return "".join(itos[idx] for idx in l)

In [None]:
train_data = encode(text_train)

In [None]:
text_train[:15]

In [None]:
train_data[:15]

In [None]:
decode(train_data[:15])

### Text Encoding: Subword-based - Byte pair encoding (BPE)

Web: https://github.com/openai/tiktoken

In [None]:
%%capture
%pip install tiktoken

In [None]:
# Explicit imports instead of `import *`: the wildcard hid where
# `SimpleBytePairEncoding` comes from and filled the notebook namespace with
# every public name of the module. `tiktoken` itself is imported because a
# later cell calls `tiktoken.get_encoding`, which apparently only worked as a
# side effect of the wildcard.
import tiktoken
from tiktoken._educational import SimpleBytePairEncoding

# Educational (pure-Python) BPE encoder built from the cl100k_base encoding.
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")

In [None]:
enc.encode(text_train[:15])

In [None]:
enc.decode([5451])

In [None]:
aa = enc.encode("aardvark")

In [None]:
for subword in aa:
   print(enc.decode([subword]))

https://en.wikipedia.org/wiki/Aardvark#/media/File:Orycteropus_afer_175359469.jpg

Still remember the news dataset? Let's try to encode it!

In [None]:
news_all_in_one = '\n'.join(news.data)

In [None]:
import sys

def human_readable_size(size, decimal_places=2):
    """Format a byte count with a binary (1024-based) unit suffix.

    Args:
        size: Number of bytes.
        decimal_places: Digits shown after the decimal point.

    Returns:
        A string such as ``"1.50 KB"``. Units go from B up to TB; anything
        that is still >= 1024 TB is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    step = 0
    # Scale down by 1024 until the value fits the current unit or units run out.
    while step < len(units) and not size < 1024.0:
        size /= 1024.0
        step += 1

    suffix = units[step] if step < len(units) else 'PB'
    return f"{size:.{decimal_places}f} {suffix}"

# NOTE: sys.getsizeof reports the in-memory size of the Python str object,
# not the number of bytes the text would occupy once encoded (e.g. as UTF-8).
size_in_bytes = sys.getsizeof(news_all_in_one)

human_readable_size(size_in_bytes)


In [None]:
# Import the package explicitly so this cell does not rely on another cell
# having put the name `tiktoken` in scope.
import tiktoken

# Note: `enc` is rebound here to tiktoken's o200k_base encoding.
enc = tiktoken.get_encoding("o200k_base")
news_enc = enc.encode(news_all_in_one)
# Number of tokens the whole news corpus encodes to.
len(news_enc)

In [None]:
news_enc[:20]