# Deep Learning Exam - University of Bologna - A.Y. 2023/2024

### Student: Om Mishra
### Matricola: 0001101279

## Sentence reconstruction





## Imports


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2

## Dataset

In [None]:
#Download the dataset

from datasets import load_dataset
from keras.layers import TextVectorization
import tensorflow as tf
import numpy as np
np.random.seed(42)
ds = load_dataset('generics_kb',trust_remote_code=True)['train']

#Filter row with length greater than 8.

ds = ds.filter(lambda row: len(row["generic_sentence"].split(" ")) > 8 )
corpus = [ '<start> ' + row['generic_sentence'].replace(","," <comma>") + ' <end>' for row in ds ]
corpus = np.array(corpus)

#Create a tokenizer and Detokenizer

tokenizer=TextVectorization( max_tokens=10000, standardize="lower_and_strip_punctuation", encoding="utf-8",) #con il max prende le piu frequenti. ordina i token del vocab dal piu frequente al meno frequente
tokenizer.adapt(corpus)

class TextDetokenizer:
    def __init__(self, vectorize_layer):
        self.vectorize_layer = vectorize_layer
        vocab = self.vectorize_layer.get_vocabulary()
        self.index_to_word = {index: word for index, word in enumerate(vocab)}

    def __detokenize_tokens(self, tokens):
        def check_token(t):
          if t == 3:
            s="<start>"
          elif t == 2:
            s="<end>"
          elif t == 7:
            s="<comma>"
          else:
            s=self.index_to_word.get(t, '[UNK]')
          return s

        return ' '.join([ check_token(token) for token in tokens if token != 0])

    def __call__(self, batch_tokens):
       return [self.__detokenize_tokens(tokens) for tokens in batch_tokens]


detokenizer = TextDetokenizer( tokenizer )
sentences = tokenizer( corpus ).numpy()

#Remove from corpus the sentences where any unknow word appears

mask = np.sum( (sentences==1), axis=1) >= 1
original_data = np.delete( sentences, mask , axis=0)
original_data.shape

#Shuffle the sentences

from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    def __init__(self, data, batch_size=32, shuffle=True, seed=42):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.on_epoch_end()


    def __len__(self):
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        data_batch = np.array([self.data[k] for k in indexes])
        #copy of ordered sequences
        result = np.copy(data_batch)
        #shuffle only the relevant positions for each batch
        for i in range(data_batch.shape[0]):
          np.random.shuffle(data_batch[i,1:data_batch[i].argmin() - 1])

        return data_batch , result

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.indexes)


# Make a random permutation of training and test set
np.random.seed(42)

# Shuffle all the data
shuffled_indices = np.random.permutation(len(original_data))
shuffled_data = original_data[shuffled_indices]

#split the dataset
train_generator = DataGenerator(shuffled_data[:220000])
test_generator = DataGenerator(shuffled_data[220000:])

x, y = test_generator.__getitem__(1)
x = detokenizer(x)
y = detokenizer(y)

for i in range(7):
  print("original: ", y[i])
  print("shuffled: ", x[i])
  print("\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1020868 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1020868 [00:00<?, ? examples/s]

original:  <start> ranchers clear large areas of rainforest to become pastures for their cattle <end>
shuffled:  <start> large their areas for cattle ranchers rainforest clear pastures become to of <end>


original:  <start> some earwigs have stripes on the thorax and abdomen <end>
shuffled:  <start> stripes thorax some and the earwigs on abdomen have <end>


original:  <start> magnetic manipulation can turn molecules in a liquid into computing such devices <end>
shuffled:  <start> into in magnetic such a liquid molecules can manipulation computing turn devices <end>


original:  <start> healthy wetlands means cleaner water <comma> reduced flooding and more places for recreation <end>
shuffled:  <start> reduced wetlands and recreation for water places healthy cleaner flooding <comma> means more <end>


original:  <start> market share is the percent share in sales one company controls in a particular market <end>
shuffled:  <start> company percent share one controls a sales in market is

## Model

In [None]:
from datasets import load_dataset
from keras.layers import TextVectorization
import tensorflow as tf
import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Download the dataset
ds = load_dataset('generics_kb', trust_remote_code=True)['train']

# Filter rows with length greater than 8
ds = ds.filter(lambda row: len(row["generic_sentence"].split(" ")) > 8)
corpus = ['<start> ' + row['generic_sentence'].replace(",", " <comma>") + ' <end>' for row in ds]
corpus = np.array(corpus)

# Create a tokenizer and Detokenizer
tokenizer = TextVectorization(max_tokens=10000, standardize="lower_and_strip_punctuation", encoding="utf-8")
tokenizer.adapt(corpus)

class TextDetokenizer:
    def __init__(self, vectorize_layer):
        self.vectorize_layer = vectorize_layer
        vocab = self.vectorize_layer.get_vocabulary()
        self.index_to_word = {index: word for index, word in enumerate(vocab)}

    def __detokenize_tokens(self, tokens):
        def check_token(t):
            if t == 3:
                s = "<start>"
            elif t == 2:
                s = "<end>"
            elif t == 7:
                s = "<comma>"
            else:
                s = self.index_to_word.get(t, '[UNK]')
            return s

        return ' '.join([check_token(token) for token in tokens if token != 0])

    def __call__(self, batch_tokens):
        return [self.__detokenize_tokens(tokens) for tokens in batch_tokens]

detokenizer = TextDetokenizer(tokenizer)
sentences = tokenizer(corpus).numpy()

# Remove from corpus the sentences where any unknown word appears
mask = np.sum((sentences == 1), axis=1) >= 1
original_data = np.delete(sentences, mask, axis=0)

# Shuffle the sentences
np.random.seed(42)
shuffled_indices = np.random.permutation(len(original_data))
shuffled_data = original_data[shuffled_indices]

# Define hyperparameters
max_sequence_len = original_data.shape[1]  # Get maximum sequence length from processed data
vocab_size = len(tokenizer.get_vocabulary())
embedding_dim = 128
rnn_units = 256  # Experiment with different values

# Encoder
encoder_inputs = Input(shape=(max_sequence_len,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(rnn_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_len,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(rnn_units, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention mechanism
attention = Attention()
attention_output = attention([decoder_outputs, encoder_outputs])
decoder_concat_input = Concatenate()([decoder_outputs, attention_output])

# Dense layer
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
optimizer = Adam(learning_rate=0.001)
loss = SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 28)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 28)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 28, 128)              1280000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 28, 128)              1280000   ['input_2[0][0]']             
                                                                                              

## Training

In [None]:

# Define the custom training class
class CustomTraining(tf.keras.Model):
    def train_step(self, data):
        inputs, targets = data
        encoder_input, decoder_input = inputs
        decoder_target = targets

        with tf.GradientTape() as tape:
            predictions = self([encoder_input, decoder_input], training=True)
            loss = self.compiled_loss(decoder_target, predictions, regularization_losses=self.losses)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(decoder_target, predictions)
        return {m.name: m.result() for m in self.metrics}

# Create an instance of the custom model
custom_model = CustomTraining(inputs=model.input, outputs=model.output)
custom_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Update DataGenerator class to handle sequence-to-sequence data
class Seq2SeqDataGenerator(Sequence):
    def __init__(self, data, batch_size=42, shuffle=True, seed=52):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        data_batch = np.array([self.data[k] for k in indexes])
        result = np.copy(data_batch)

        # Shuffle input sequences except for the start and end tokens
        for i in range(data_batch.shape[0]):
            np.random.shuffle(data_batch[i, 1:data_batch[i].argmin() - 1])

        # Create decoder inputs by shifting target sequences
        decoder_input_data = np.zeros(data_batch.shape)
        decoder_input_data[:, 1:] = result[:, :-1]

        return [data_batch, decoder_input_data], result

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.indexes)

# Create data generators
train_generator = Seq2SeqDataGenerator(shuffled_data[:220000])
test_generator = Seq2SeqDataGenerator(shuffled_data[220000:])

# Train the model
epochs = 20  # Adjust the number of epochs as needed
custom_model.fit(train_generator, validation_data=test_generator, epochs=epochs)

Epoch 1/15
Epoch 2/15
 63/547 [==>...........................] - ETA: 1:35 - loss: 0.2654 - accuracy: 0.9206

KeyboardInterrupt: 

## Inference and Evaluation

In [3]:
# Normal Inference


def batch_infer(input_sequence, start_token, end_token, max_length=max_sequence_len):
    # Encoding the input
    encoder_input = input_sequence
    # Initializing the decoder input as a tensor filled with zeros with shape (batch_size, max_sequence_len)
    decoder_input = np.zeros((len(encoder_input), max_length))
    # Filling the first character of decoder input with the start of sequence token
    decoder_input[:, 0] = start_token  # assuming you have defined start of sequence token (sos_token)

    stop_conditions = np.zeros(input_sequence.shape[0], dtype=bool)
    encoded_sentences = [[] for _ in range(input_sequence.shape[0])]
    decoded_sentences = [[] for _ in range(input_sequence.shape[0])]

    i = 0
    while not all(stop_conditions):
        # Make a prediction using the model
        predictions = model.predict([encoder_input, decoder_input])

        # Find the most probable word ID from softmax output
        predicted_ids = np.argmax(predictions, axis=-1)

        for idx, token in enumerate(predicted_ids):
            if not stop_conditions[idx]:
                encoded_sentences[idx].append(token)

                # Exit condition: either hit max length or find stop character.
                if token == end_token:
                    encoded_sentences[idx].pop()
                    decoded_sentences[idx] = detokenizer([encoded_sentences[idx]])[0]
                    stop_conditions[idx] = True
                elif len(encoded_sentences[idx]) == max_length:
                    decoded_sentences[idx] = detokenizer([encoded_sentences[idx]])[0]
                    print("Warning: sentence reached max length without EOS token:\n"
                          "SHUFFLED: %s\nDECODED: %s" % (detokenizer([encoder_input[idx]]),
                                                         decoded_sentences[idx]))
                    stop_conditions[idx] = True

        # Place the predicted_id to the appropriate timestep within the batch
        if i != max_length - 1:
            decoder_input[:, i + 1] = predicted_ids[:, i]

        i += 1

    return decoded_sentences


NameError: name 'max_sequence_len' is not defined

## Evaluation

In [1]:
from difflib import SequenceMatcher
import random
import numpy as np
import tensorflow as tf

def score(s, p):
    s = s.replace("<start>", "").replace("<end>", "")
    p = p.replace("<start>", "").replace("<end>", "")
    match = SequenceMatcher(None, s, p).find_longest_match(0, len(s), 0, len(p))
    return match.size / max(len(s), len(p))

def batch_infer(input_sequence, start_token, end_token, max_length):
    encoder_input = np.array(input_sequence)  # Convert to NumPy array and take encoder input part
    decoder_input = np.zeros((encoder_input.shape[0], max_length))
    decoder_input[:, 0] = start_token

    stop_conditions = np.zeros(encoder_input.shape[0], dtype=bool)
    encoded_sentences = [[] for _ in range(encoder_input.shape[0])]
    decoded_sentences = [[] for _ in range(encoder_input.shape[0])]

    i = 0
    while not all(stop_conditions) and i < max_length:
        predictions = model.predict([encoder_input, decoder_input], verbose=0)
        predicted_ids = np.argmax(predictions, axis=-1)

        for idx, token in enumerate(predicted_ids[:, i]):
            if not stop_conditions[idx]:
                encoded_sentences[idx].append(token)
                if token == end_token or len(encoded_sentences[idx]) == max_length:
                    decoded_sentences[idx] = detokenizer([encoded_sentences[idx]])[0]
                    stop_conditions[idx] = True

        if i != max_length - 1:
            decoder_input[:, i + 1] = predicted_ids[:, i]

        i += 1

    return decoded_sentences

# Ensure test generator has enough examples
test_size = len(test_generator)
num_samples = min(3000, test_size)

# Choose random examples from the test set
random_indices = random.sample(range(test_size), num_samples)
average_score = 0.0

for index in random_indices:
    input_sequence, target_sequence = test_generator.__getitem__(index)
    predictions = batch_infer(input_sequence[0], start_token_index, end_token_index, max_length=max_sequence_len)

    for i in range(len(input_sequence[0])):
        generated_sentence = predictions[i]
        target_sentence = detokenizer([target_sequence[i]])[0]
        average_score += score(target_sentence, generated_sentence)

average_score /= num_samples
print(f"Average Score over {num_samples} random examples: {average_score}")


NameError: name 'test_generator' is not defined