In [None]:
!pip install -r https://raw.githubusercontent.com/datamllab/automl-in-action-notebooks/master/requirements.txt

## 8.1.1 Loading image classification dataset

In [None]:
!wget https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/mnist.tar.gz
!tar xzf mnist.tar.gz

--2021-07-25 20:09:24--  https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/mnist.tar.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/datamllab/automl-in-action-notebooks/master/data/mnist.tar.gz [following]
--2021-07-25 20:09:24--  https://raw.githubusercontent.com/datamllab/automl-in-action-notebooks/master/data/mnist.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17472747 (17M) [application/octet-stream]
Saving to: ‘mnist.tar.gz.1’


2021-07-25 20:09:25 (84.3 MB/s) - ‘mnist.tar.gz.1’ saved [17472747/17472747]



```
train/
  0/
    1.png
    21.png
    ...
  1/
  2/
  3/
  ...

test/
  0/
  1/
  ...
```

In [None]:
import os
import autokeras as ak

# Loader settings shared by the image-dataset cells.
batch_size = 32
img_height, img_width = 28, 28

# Directory that holds the `train` and `test` sub-folders read below.
parent_dir = "data"

# Load the test images as grayscale batches resized to img_height x img_width.
test_dir = os.path.join(parent_dir, "test")
test_data = ak.image_dataset_from_directory(
    test_dir,
    seed=123,
    color_mode="grayscale",
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Peek at a single batch to check the shapes and dtypes of images and labels.
first_batch = test_data.take(1)
for images, labels in first_batch:
    print(images.shape, images.dtype)
    print(labels.shape, labels.dtype)

Found 10000 files belonging to 10 classes.
(32, 28, 28, 1) <dtype: 'float32'>
(32,) <dtype: 'string'>


## 8.1.2 Splitting the loaded dataset

In [None]:
# Load every training image into one dataset of batches.
train_dir = os.path.join(parent_dir, "train")
all_train_data = ak.image_dataset_from_directory(
    train_dir,
    seed=123,
    color_mode="grayscale",
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# The first 80% of the batches (out of 60000 images) become the training set;
# everything after that point becomes the validation set.
num_train_batches = int(60000 / batch_size * 0.8)
train_data = all_train_data.take(num_train_batches)
validation_data = all_train_data.skip(num_train_batches)

Found 60000 files belonging to 10 classes.


In [None]:
# Keyword arguments shared by both loader calls; only `subset` differs
# between the training and the validation dataset.
image_split_kwargs = dict(
    validation_split=0.2,
    seed=123,
    color_mode="grayscale",
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
train_dir = os.path.join(parent_dir, "train")

train_data = ak.image_dataset_from_directory(
    train_dir, subset="training", **image_split_kwargs
)

validation_data = ak.image_dataset_from_directory(
    train_dir, subset="validation", **image_split_kwargs
)

Found 60000 files belonging to 10 classes.
Using 48000 files for training.
Found 60000 files belonging to 10 classes.
Using 12000 files for validation.


In [None]:
import tensorflow as tf

# Add a prefetch stage to each dataset. Train and validation use a fixed
# buffer size of 5; the test set passes tf.data.AUTOTUNE instead.
# NOTE(review): each name is rebound to its own `.prefetch(...)` result, so
# running this cell again wraps the already-prefetched datasets once more.
train_data = train_data.prefetch(5)
validation_data = validation_data.prefetch(5)
test_data = test_data.prefetch(tf.data.AUTOTUNE)

Then we just do one quick demo of AutoKeras to make sure the dataset works.


In [None]:
# Quick end-to-end check of the image datasets: one search trial, one epoch,
# then print whatever `evaluate` returns for the test set.
clf = ak.ImageClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=1)
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 03m 44s]
val_loss: 0.06113607808947563

Best val_loss So Far: 0.06113607808947563
Total elapsed time: 00h 03m 44s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: ./image_classifier/best_model/assets
[0.05080397054553032, 0.9833999872207642]


## 8.1.3 Loading text classification dataset
You can also load text datasets in the same way.


In [None]:
!wget https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/imdb.tar.gz
!tar xzf imdb.tar.gz

--2021-07-25 20:18:35--  https://github.com/datamllab/automl-in-action-notebooks/raw/master/data/imdb.tar.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/datamllab/automl-in-action-notebooks/master/data/imdb.tar.gz [following]
--2021-07-25 20:18:35--  https://raw.githubusercontent.com/datamllab/automl-in-action-notebooks/master/data/imdb.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29215039 (28M) [application/octet-stream]
Saving to: ‘imdb.tar.gz.1’


2021-07-25 20:18:35 (62.4 MB/s) - ‘imdb.tar.gz.1’ saved [29215039/29215039]



For this dataset, the data is already split into train and test directories.
We load them separately, and hold out 20% of the training files for validation.


In [None]:
import os
import autokeras as ak
import tensorflow as tf

# Keyword arguments shared by the training and validation loaders; only
# `subset` differs between the two calls.
text_split_kwargs = dict(
    validation_split=0.2,
    seed=123,
    max_length=1000,
    batch_size=32,
)

train_data = ak.text_dataset_from_directory(
    "imdb/train", subset="training", **text_split_kwargs
).prefetch(1000)

validation_data = ak.text_dataset_from_directory(
    "imdb/train", subset="validation", **text_split_kwargs
).prefetch(1000)

# The test directory is loaded whole: no split, seed, or explicit batch size.
test_data = ak.text_dataset_from_directory(
    "imdb/test", max_length=1000
).prefetch(1000)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
# Run one search trial for two epochs on the directory-loaded text datasets,
# then print whatever `evaluate` returns for the test set.
clf = ak.TextClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=2)
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 05m 40s]
val_loss: 0.33729812502861023

Best val_loss So Far: 0.33729812502861023
Total elapsed time: 00h 05m 40s
INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets
[0.35387808084487915, 0.8450400233268738]


## 8.1.4 Handling large datasets in a general format

In [None]:
data = [5, 8, 9, 3, 6]


def generator():
    """Yield the items of the module-level ``data`` list one at a time, in order."""
    yield from data


# Consume the generator and print each value it produces.
for x in generator():
    print(x)

5
8
9
3
6


In [None]:
# Wrap the Python generator as a tf.data.Dataset. `output_signature` declares
# each element as a scalar int32 tensor; it replaces the deprecated
# `output_types` argument of `from_generator`.
dataset = tf.data.Dataset.from_generator(
    generator,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32),
)

# Iterate the dataset and print each element as a plain NumPy value.
for x in dataset:
    print(x.numpy())

5
8
9
3
6


In [None]:
import numpy as np

# NOTE(review): this module-level `path` is not read by any of the visible
# code below (`load_data` is always called with its own joined path), so it
# looks like dead code — confirm before relying on or removing it.
path = os.path.join(parent_dir, "train")

def load_data(path):
    """Collect (file_path, class_label) pairs for the "pos" and "neg" folders.

    Args:
        path: Directory containing one sub-folder per class label.

    Returns:
        A NumPy array with one (file_path, class_label) row per file found,
        with the rows shuffled by the global NumPy random generator.
    """
    samples = np.array([
        (os.path.join(path, class_label, file_name), class_label)
        for class_label in ["pos", "neg"]
        for file_name in os.listdir(os.path.join(path, class_label))
    ])
    # Shuffle the rows in place so the two classes are interleaved.
    np.random.shuffle(samples)
    return samples

def get_generator(data):
    """Return a zero-argument generator function over the text files in `data`.

    Args:
        data: Iterable of (file_path, class_label) pairs.

    Returns:
        A function that, each time it is called, yields one
        (file_text, class_label) tuple per pair, reading each file only when
        its item is requested.
    """
    def data_generator():
        for file_path, class_label in data:
            # `with` closes the file even if reading fails, and the explicit
            # encoding keeps decoding independent of the platform default.
            # NOTE(review): assumes the text files are UTF-8 encoded — confirm.
            with open(file_path, "r", encoding="utf-8") as text_file:
                text = text_file.read()
            yield text, class_label
    return data_generator
    
# The IMDB reviews live under "imdb/" (the same "imdb/train" and "imdb/test"
# folders the `text_dataset_from_directory` cell reads). `parent_dir` is set
# to "data" for the image dataset, whose class folders are digits rather than
# "pos"/"neg", so it must not be used to locate the reviews.
imdb_dir = "imdb"

all_train_np = load_data(os.path.join(imdb_dir, "train"))


def np_to_dataset(data_np):
    """Build a batched, prefetched dataset of (text, label) string pairs.

    Args:
        data_np: Rows of (file_path, class_label) as returned by `load_data`.

    Returns:
        A `tf.data.Dataset` whose elements are (texts, labels) batches of up
        to 32 examples each.
    """
    return (
        tf.data.Dataset.from_generator(
            get_generator(data_np),
            # Each generated (text, label) item becomes one string tensor of
            # shape [2]; `output_signature` replaces the deprecated
            # `output_types` / `output_shapes` arguments.
            output_signature=tf.TensorSpec(shape=(2,), dtype=tf.string),
        )
        # Split the length-2 tensor back into a (text, label) pair.
        .map(lambda x: (x[0], x[1]))
        .batch(32)
        .prefetch(5)
    )


# Keep the first 80% of the shuffled training rows for training and the rest
# for validation (20000 / 5000 for the 25000 IMDB training files).
num_train = (len(all_train_np) * 4) // 5
train_data = np_to_dataset(all_train_np[:num_train])
validation_data = np_to_dataset(all_train_np[num_train:])
test_np = load_data(os.path.join(imdb_dir, "test"))
test_data = np_to_dataset(test_np)

# Inspect one batch to check the shapes of the texts and labels.
for texts, labels in train_data.take(1):
    print(texts.shape)
    print(labels.shape)

(32,)
(32,)


In [None]:
# Run one search trial for two epochs on the generator-backed text datasets,
# then print whatever `evaluate` returns for the test set.
clf = ak.TextClassifier(max_trials=1, overwrite=True)
clf.fit(train_data, validation_data=validation_data, epochs=2)
test_metrics = clf.evaluate(test_data)
print(test_metrics)

Trial 1 Complete [00h 07m 08s]
val_loss: 0.2818661630153656

Best val_loss So Far: 0.2818661630153656
Total elapsed time: 00h 07m 08s
INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets
[0.282741516828537, 0.8845199942588806]
