## Seq2Seq mapping

In [81]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [82]:
# Load the dataset
df = pd.read_csv('train.csv')

In [83]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,gloss
0,0,How are you today?,"TODAY, HOW ARE YOU?"
1,1,What's your favorite food?,"FAVORITE FOOD, WHAT?"
2,2,Where did you go on vacation?,"VACATION WHERE, YOU GO?"
3,3,Did you watch the game last night?,"GAME LAST NIGHT, YOU WATCH?"
4,4,How's work going?,"WORK, HOW GO?"


In [84]:
df.isnull().sum()

Unnamed: 0    0
text          0
gloss         0
dtype: int64

In [85]:
df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,text,gloss
count,225.0,225,225
unique,,225,225
top,,How are you today?,"TODAY, HOW ARE YOU?"
freq,,1,1
mean,73.111111,,
std,52.138304,,
min,0.0,,
25%,28.0,,
50%,62.0,,
75%,118.0,,


In [86]:
df.shape

(225, 3)

In [87]:
def remove_n(inp):
    return inp.replace("\n", "")

In [88]:
df["gloss"]=df["gloss"].apply(remove_n)

In [89]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,gloss
0,0,How are you today?,"TODAY, HOW ARE YOU?"
1,1,What's your favorite food?,"FAVORITE FOOD, WHAT?"
2,2,Where did you go on vacation?,"VACATION WHERE, YOU GO?"
3,3,Did you watch the game last night?,"GAME LAST NIGHT, YOU WATCH?"
4,4,How's work going?,"WORK, HOW GO?"


In [90]:
df["text"]=df["text"].apply(remove_n)

In [91]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,gloss
0,0,How are you today?,"TODAY, HOW ARE YOU?"
1,1,What's your favorite food?,"FAVORITE FOOD, WHAT?"
2,2,Where did you go on vacation?,"VACATION WHERE, YOU GO?"
3,3,Did you watch the game last night?,"GAME LAST NIGHT, YOU WATCH?"
4,4,How's work going?,"WORK, HOW GO?"


In [92]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [93]:
text_texts=df["text"].values
gloss_texts=df["gloss"].values
text_texts = ['<start> ' + sentence + ' <end>' for sentence in text_texts]
gloss_texts = ['<start>' + sentence + ' <end>' for sentence in gloss_texts]

In [94]:
print(text_texts)
print('-'*60)
print(gloss_texts)

['<start> How are you today? <end>', "<start> What's your favorite food? <end>", '<start> Where did you go on vacation? <end>', '<start> Did you watch the game last night? <end>', "<start> How's work going? <end>", '<start> Have you read any good books lately? <end>', '<start> What time is it? <end>', '<start> Do you have any plans for the weekend? <end>', '<start> Did you hear about the new restaurant? <end>', '<start> Are you feeling better? <end>', '<start> Can you pass me the salt, please? <end>', "<start> What's the weather like today? <end>", '<start> Do you need any help? <end>', '<start> How was your day? <end>', '<start> What do you do for fun? <end>', '<start> Are you busy right now? <end>', '<start> Have you seen that movie? <end>', "<start> What's your favorite color? <end>", '<start> Would you like some coffee? <end>', '<start> Are you going to the party? <end>', "<start> What's your favorite TV show? <end>", '<start> Did you have a good weekend? <end>', '<start> Where did

In [95]:
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(text_texts)
text_vocab_size = len(text_tokenizer.word_index) + 1
text_sequences = text_tokenizer.texts_to_sequences(text_texts)

In [96]:
gloss_tokenizer = Tokenizer()
gloss_tokenizer.fit_on_texts(gloss_texts)
gloss_vocab_size = len(gloss_tokenizer.word_index) + 1
gloss_sequences = gloss_tokenizer.texts_to_sequences(gloss_texts)

In [97]:
max_text_seq_length = max([len(seq) for seq in text_sequences])
max_gloss_seq_length = max([len(seq) for seq in gloss_sequences])

text_sequences = pad_sequences(text_sequences, maxlen=max_text_seq_length, padding='post')
gloss_sequences = pad_sequences(gloss_sequences, maxlen=max_gloss_seq_length, padding='post')

In [98]:
# Split data into training and validation sets
encoder_input_data, encoder_input_data_val, decoder_input_data, decoder_input_data_val = train_test_split(
    text_sequences, gloss_sequences, test_size=0.2)

# Create decoder target data
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
#decoder_target_data[:, -1] = gloss_tokenizer.word_index['<end>']  # Ensure you have an end token

decoder_target_data_val = np.zeros_like(decoder_input_data_val)
decoder_target_data_val[:, :-1] = decoder_input_data_val[:, 1:]
#decoder_target_data_val[:, -1] = gloss_tokenizer.word_index['<end>']  # Ensure you have an end token

In [99]:
print(encoder_input_data_val)

[[  2  59   5 ...   0   0   0]
 [  2 673  12 ...   0   0   0]
 [  2 268 153 ...   0   0   0]
 ...
 [  2  47  12 ...   0   0   0]
 [  2  15 204 ...   0   0   0]
 [  2 152 232 ...   0   0   0]]


In [100]:
print(encoder_input_data_val)

[[  2  59   5 ...   0   0   0]
 [  2 673  12 ...   0   0   0]
 [  2 268 153 ...   0   0   0]
 ...
 [  2  47  12 ...   0   0   0]
 [  2  15 204 ...   0   0   0]
 [  2 152 232 ...   0   0   0]]


In [101]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(text_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(gloss_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(gloss_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Training
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64, epochs=10,
          validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x272ca277bd0>

In [102]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam

latent_dim = 256
embedding_dim = 300  # Example size, adjust as needed

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(text_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
encoder_states = [tf.keras.layers.Concatenate()([forward_h, backward_h]), tf.keras.layers.Concatenate()([forward_c, backward_c])]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(gloss_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True)  # Adjust based on encoder output
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dropout = Dropout(0.5)(decoder_outputs)
decoder_dense = Dense(gloss_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_dropout)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64, epochs=40,
          validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x272d4278050>

In [103]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    
    # Check for the correct index of '<start>'
    start_token_index = gloss_tokenizer.word_index.get('<start>', None)
    if start_token_index is None:
        raise ValueError("'<start>' token is not in the tokenizer word index.")
    
    target_seq[0, 0] = start_token_index

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:

        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        print("Sampled Token Index:", sampled_token_index)
        print("Sampled Char:", gloss_tokenizer.index_word.get(sampled_token_index, ''))


        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = gloss_tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop character
        if (sampled_char == '<end>' or len(decoded_sentence) > max_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()


In [104]:
print(gloss_tokenizer.word_index)


{'start': 1, 'end': 2, 'you': 3, 'books': 4, 'reading': 5, 'what': 6, 'favorite': 7, 'work': 8, 'read': 9, 'important': 10, 'travel': 11, 'improve': 12, 'time': 13, 'new': 14, 'help': 15, 'benefit': 16, 'how': 17, 'people': 18, 'have': 19, 'health': 20, 'skills': 21, 'go': 22, 'good': 23, 'better': 24, 'enjoy': 25, 'more': 26, 'book': 27, 'like': 28, 'activity': 29, 'life': 30, 'keep': 31, 'tasks': 32, 'be': 33, 'stress': 34, 'make': 35, 'explore': 36, 'different': 37, 'provide': 38, 'create': 39, 'ensure': 40, 'offer': 41, 'with': 42, 'to': 43, 'show': 44, 'sleep': 45, 'outdoors': 46, 'reduce': 47, 'social': 48, 'encourage': 49, 'world': 50, 'experience': 51, 'learn': 52, 'personal': 53, 'growth': 54, 'enhance': 55, 'events': 56, 'stay': 57, 'others': 58, 'tools': 59, 'prevent': 60, 'maintain': 61, 'are': 62, 'weekend': 63, 'day': 64, 'fun': 65, 'grow': 66, 'job': 67, 'hobby': 68, 'spend': 69, 'many': 70, 'physical': 71, 'mental': 72, 'increase': 73, 'nature': 74, 'promote': 75, 'well

In [105]:
# Ensure the special tokens are included in the tokenizer
special_tokens = {'<start>': len(gloss_tokenizer.word_index) + 1, '<end>': len(gloss_tokenizer.word_index) + 2}
gloss_tokenizer.word_index.update(special_tokens)
gloss_tokenizer.index_word.update({v: k for k, v in special_tokens.items()})


In [106]:
# Example input sequence
input_seq = encoder_input_data_val[1:2]  # Take the second sample as an example
predicted_sentence = decode_sequence(input_seq)
print('Predicted Sentence:', predicted_sentence)


Sampled Token Index: 1696
Sampled Char: 
Sampled Token Index: 1015
Sampled Char: 
Sampled Token Index: 1015
Sampled Char: 
Sampled Token Index: 1175
Sampled Char: 
Sampled Token Index: 1015
Sampled Char: 
Sampled Token Index: 4946
Sampled Char: 
Sampled Token Index: 1042
Sampled Char: 
Sampled Token Index: 518
Sampled Char: clearly
Sampled Token Index: 816
Sampled Char: 
Sampled Token Index: 2274
Sampled Char: 
Sampled Token Index: 2571
Sampled Char: 
Sampled Token Index: 2571
Sampled Char: 
Sampled Token Index: 2571
Sampled Char: 
Sampled Token Index: 165
Sampled Char: wonders
Sampled Token Index: 652
Sampled Char: confidence
Sampled Token Index: 2372
Sampled Char: 
Sampled Token Index: 3546
Sampled Char: 
Sampled Token Index: 4165
Sampled Char: 
Sampled Token Index: 757
Sampled Char: 
Sampled Token Index: 719
Sampled Char: 
Sampled Token Index: 2372
Sampled Char: 
Sampled Token Index: 1137
Sampled Char: 
Sampled Token Index: 1288
Sampled Char: 
Sampled Token Index: 4863
Sampled Char:

In [107]:
for seq_index in [99,2,45,40]:
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', text_texts[seq_index: seq_index + 1])
    print('Decoded sentence:', decoded_sentence)

Sampled Token Index: 1729
Sampled Char: 
Sampled Token Index: 215
Sampled Char: essential
Sampled Token Index: 252
Sampled Char: family
Sampled Token Index: 3436
Sampled Char: 
Sampled Token Index: 1187
Sampled Char: 
Sampled Token Index: 4609
Sampled Char: 
Sampled Token Index: 463
Sampled Char: doing
Sampled Token Index: 2983
Sampled Char: 
Sampled Token Index: 314
Sampled Char: languages
Sampled Token Index: 2983
Sampled Char: 
Sampled Token Index: 2200
Sampled Char: 
Sampled Token Index: 2200
Sampled Char: 
Sampled Token Index: 4524
Sampled Char: 
Sampled Token Index: 3281
Sampled Char: 
Sampled Token Index: 1601
Sampled Char: 
Sampled Token Index: 4377
Sampled Char: 
Sampled Token Index: 2200
Sampled Char: 
Sampled Token Index: 2200
Sampled Char: 
Sampled Token Index: 2316
Sampled Char: 
Sampled Token Index: 307
Sampled Char: deep
Sampled Token Index: 1492
Sampled Char: 
Sampled Token Index: 3321
Sampled Char: 
Sampled Token Index: 78
Sampled Char: outside
Sampled Token Index: 184

In [108]:
# def calculate_accuracy(encoder_model, decoder_model, data, target_sequences):
#     total_correct_tokens = 0
#     total_tokens = 0

#     for i, input_seq in enumerate(data):
#         predicted_sequence = decode_sequence(np.expand_dims(input_seq, axis=0))
#         target_sequence = target_sequences[i]

#         predicted_tokens = predicted_sequence.split()
#         target_tokens = target_sequence.split()

#         correct_tokens = sum([1 for j in range(min(len(predicted_tokens), len(target_tokens))) if predicted_tokens[j] == target_tokens[j]])
        
#         total_correct_tokens += correct_tokens
#         total_tokens += len(target_tokens)

#     accuracy = total_correct_tokens / total_tokens if total_tokens > 0 else 0
#     return accuracy

# # Example usage
# # Example target sequences for validation data
# target_texts_val = [
#     "hello my name <end>",
#     "how are you <end>",
#     # Add more sequences here corresponding to your validation data
# ]

# accuracy = calculate_accuracy(encoder_model, decoder_model, encoder_input_data_val, target_texts_val)
# print("Token-Level Accuracy: {:.2f}%".format(accuracy * 100))


In [109]:
import numpy as np

def calculate_accuracy(encoder_model, decoder_model, data, target_sequences, reverse_gloss_word_index, max_seq_length=400):
    total_correct_tokens = 0
    total_tokens = 0
    print("len(data)=",len(data))
    print(len(target_sequences))

    # Ensure the lengths of data and target_sequences match
    # assert len(data) == len(target_sequences), "Length mismatch between data and target_sequences"

    for i in range(len(data)):
        input_seq = data[i]
        target_sequence = target_sequences[i]

        # Predict the output sequence
        predicted_sequence = decode_sequence(np.expand_dims(input_seq, axis=0))

        # Convert predicted sequence to tokens
        predicted_tokens = predicted_sequence.split()
        
        # Convert target sequence to tokens
        target_tokens = [reverse_gloss_word_index.get(idx, '') for idx in target_sequence if idx > 0]
        
        # Compare the predicted tokens with the target tokens
        correct_tokens = sum([1 for j in range(min(len(predicted_tokens), len(target_tokens))) if predicted_tokens[j] == target_tokens[j]])
        
        total_correct_tokens += correct_tokens
        total_tokens += len(target_tokens)
    
    # Calculate token-level accuracy
    accuracy = total_correct_tokens / total_tokens if total_tokens > 0 else 0
    return accuracy

# Example usage
# Make sure target_sequences_val and encoder_input_data_val have matching lengths
target_sequences_val = [
    "hello my name",
    "how are you",
    # Add more sequences here corresponding to your validation data
]

accuracy = calculate_accuracy(encoder_model, decoder_model, encoder_input_data_val, target_sequences_val, reverse_gloss_word_index)
print("Token-Level Accuracy: {:.2f}%".format(accuracy * 100))


len(data)= 45
2
Sampled Token Index: 4332
Sampled Char: 
Sampled Token Index: 3870
Sampled Char: 
Sampled Token Index: 3870
Sampled Char: 
Sampled Token Index: 4335
Sampled Char: 
Sampled Token Index: 4335
Sampled Char: 
Sampled Token Index: 826
Sampled Char: 
Sampled Token Index: 648
Sampled Char: leadership
Sampled Token Index: 242
Sampled Char: was
Sampled Token Index: 242
Sampled Char: was
Sampled Token Index: 242
Sampled Char: was
Sampled Token Index: 1251
Sampled Char: 
Sampled Token Index: 4032
Sampled Char: 
Sampled Token Index: 4189
Sampled Char: 
Sampled Token Index: 3030
Sampled Char: 
Sampled Token Index: 3458
Sampled Char: 
Sampled Token Index: 2338
Sampled Char: 
Sampled Token Index: 2338
Sampled Char: 
Sampled Token Index: 1549
Sampled Char: 
Sampled Token Index: 2338
Sampled Char: 
Sampled Token Index: 854
Sampled Char: 
Sampled Token Index: 2987
Sampled Char: 
Sampled Token Index: 3640
Sampled Char: 
Sampled Token Index: 2607
Sampled Char: 
Sampled Token Index: 3008
Sa

TypeError: '>' not supported between instances of 'str' and 'int'