## IMDB Dataset (NLP)
IMDB ist eines der beliebtesten NLP-Datasets und besteht aus 25_000 Film-Reviews fürs Training und 25_000 fürs Testing. Jede Review ist entweder positiv oder negativ. Das Netzwerk soll erkennen, ob der Text einer Review positiv oder negativ ist --> Textklassifikation.

In [None]:
import numpy as np
from typing import Tuple
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization 
    # Transformiert Wörter in Integers. Jedes Wort wird durch einen Index ersetzt (gleiche Wörter -> gleicher Index)
    # Per Default alles zu lowercase
    # ... --> https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
from tensorflow.keras.models import Sequential
from tensorcross.utils import dataset_split

In [None]:
# Seed NumPy's and TensorFlow's global random number generators with the
# same fixed value (0) before any data is loaded or processed.
np.random.seed(0)
tf.random.set_seed(0)

In [None]:
# %%
class IMDB:
    """IMDB movie-review dataset prepared for two-class text classification.

    On construction the ``imdb_reviews`` dataset is loaded through
    TensorFlow Datasets, a validation part is split off the training split,
    a ``TextVectorization`` layer is adapted on the remaining training
    texts, and the train/validation/test datasets are vectorized, one-hot
    encoded, batched and prefetched.

    Attributes:
        num_classes: Number of target classes (2).
        batch_size: Batch size used for all prepared datasets.
        shuffle_buffer_size: Buffer size used when a dataset is shuffled.
        vocab_size: ``max_tokens`` given to the vectorization layer.
        sequence_length: ``output_sequence_length`` given to the
            vectorization layer.
        train_size / val_size / test_size: Number of samples per split,
            measured before batching.
        tval_size: Alias of ``val_size`` kept for backward compatibility
            (the attribute was originally published under this name).
        vectorization_layer: The adapted ``TextVectorization`` layer.
        vocabulary: Vocabulary returned by the vectorization layer.
        word_index: Mapping from each vocabulary entry to its index.
    """

    def __init__(self, vocab_size: int, sequence_length: int, validation_size: float = 0.33) -> None:
        """Load, split, vectorize and batch the IMDB reviews.

        Args:
            vocab_size: Passed to ``TextVectorization`` as ``max_tokens``.
            sequence_length: Passed to ``TextVectorization`` as
                ``output_sequence_length``.
            validation_size: Passed to ``dataset_split`` as
                ``split_fraction`` when splitting the training data into
                a training and a validation part.
        """
        # User-defined constants
        self.num_classes = 2
        self.batch_size = 128
        self.shuffle_buffer_size = 10_000
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        # Load dataset
        dataset = tfds.load('imdb_reviews', as_supervised=True)
        self.train_dataset, self.val_dataset = dataset_split(dataset['train'], split_fraction=validation_size)
        self.test_dataset = dataset['test']
        # Dataset attributes (sample counts, taken before batching)
        self.train_size = len(self.train_dataset)
        self.test_size = len(self.test_dataset)
        self.val_size = len(self.val_dataset)
        # Backward-compatible alias: the validation size used to be stored
        # under the misspelled name ``tval_size``.
        self.tval_size = self.val_size
        # Vectorization layer
        self.vectorization_layer = TextVectorization(
            max_tokens=self.vocab_size,
            output_mode='int',
            output_sequence_length=self.sequence_length
        )
        # Adapt on the review texts only (drop the labels); the layer derives
        # its vocabulary from the training texts it is shown here.
        text_data = self.train_dataset.map(lambda x, y: x)
        self.vectorization_layer.adapt(text_data)
        self.vocabulary = self.vectorization_layer.get_vocabulary()
        self.word_index = {word: index for index, word in enumerate(self.vocabulary)}
        # Prepare datasets; only the training data is shuffled.
        self.train_dataset = self._prepare_dataset(self.train_dataset, shuffle=True)
        self.test_dataset = self._prepare_dataset(self.test_dataset)
        self.val_dataset = self._prepare_dataset(self.val_dataset)

    def get_train_set(self) -> tf.data.Dataset:
        """Return the prepared (vectorized, one-hot, batched) training dataset."""
        return self.train_dataset

    def get_test_set(self) -> tf.data.Dataset:
        """Return the prepared (vectorized, one-hot, batched) test dataset."""
        return self.test_dataset

    def get_val_set(self) -> tf.data.Dataset:
        """Return the prepared (vectorized, one-hot, batched) validation dataset."""
        return self.val_dataset

    def _build_preprocessing(self) -> Sequential:
        """Return a ``Sequential`` model containing only the vectorization layer."""
        model = Sequential()
        model.add(self.vectorization_layer)
        return model

    def _mask_to_categorical(self, x: tf.Tensor, y: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """One-hot encode the label ``y`` as float32 and pass ``x`` through.

        Args:
            x: Feature tensor, returned unchanged.
            y: Integer class label(s); cast to int32 before encoding.

        Returns:
            The pair ``(x, one_hot(y))`` with ``num_classes`` as depth.
        """
        # ``tf.one_hot`` is asked for float32 directly, so no extra cast is needed.
        y = tf.one_hot(tf.cast(y, tf.int32), depth=self.num_classes, dtype=tf.float32)
        return x, y

    def _prepare_dataset(
        self,
        dataset: tf.data.Dataset,
        shuffle: bool = False,
        augment: bool = False
    ) -> tf.data.Dataset:
        """Vectorize, one-hot encode, optionally shuffle, batch and prefetch.

        Args:
            dataset: Dataset yielding ``(text, label)`` pairs.
            shuffle: If True, shuffle the samples (before batching) with a
                buffer of ``shuffle_buffer_size`` elements.
            augment: Accepted for signature compatibility; it has no effect
                because no augmentation is implemented for this dataset.

        Returns:
            The prepared dataset, batched with ``batch_size`` and prefetched.
        """
        preprocessing_model = self._build_preprocessing()
        dataset = dataset.map(
            map_func=lambda x, y: (preprocessing_model(x, training=False), y),
            num_parallel_calls=tf.data.AUTOTUNE
        )
        dataset = dataset.map(
            map_func=self._mask_to_categorical,
            num_parallel_calls=tf.data.AUTOTUNE
        )

        if shuffle:
            # Shuffle individual samples, not whole batches, so that batch
            # composition changes between iterations.
            dataset = dataset.shuffle(buffer_size=self.shuffle_buffer_size)

        dataset = dataset.batch(batch_size=self.batch_size)

        return dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Demo: build the dataset and look at a few vectorized training samples.
vocab_size = 20_000
sequence_length = 80
imdb_data = IMDB(vocab_size, sequence_length)
train_dataset = imdb_data.get_train_set()
# Uncomment to inspect what the vectorization layer produced:
# print(imdb_data.vocabulary)
# print(imdb_data.word_index)
# Print the token indices and the label of the first three samples
# of the first training batch.
for text_batch, label_batch in train_dataset.take(1):
    for i in range(3):
        tokens = text_batch[i].numpy()
        label = label_batch[i].numpy()
        print(tokens, label)