## Seq2Seq mapping

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense




In [3]:
# Read the gloss/text sentence pairs from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
df.head()

Unnamed: 0,gloss,text
0,﻿MEMBERSHIP PARLIAMENT SEE MINUTE\n,﻿membership of parliament see minutes\n
1,APPROVAL MINUTE DESC-PREVIOUS SIT SEE MINUTE\n,approval of minutes of previous sitting see mi...
2,MEMBERSHIP PARLIAMENT SEE MINUTE\n,membership of parliament see minutes\n
3,VERIFICATION CREDENTIALS SEE MINUTE\n,verification of credentials see minutes\n
4,DOCUMENT RECEIVE SEE MINUTE\n,documents received see minutes\n


In [5]:
df.isnull().sum()

gloss    0
text     0
dtype: int64

In [6]:
df.describe(include="all")

Unnamed: 0,gloss,text
count,87710,87710
unique,81016,81123
top,APPLAUSE\n,applause \n
freq,595,595


In [7]:
df.shape

(87710, 2)

In [8]:
def remove_n(inp):
    """Return ``inp`` with every line-feed character deleted.

    Only line feeds are removed; all other characters (including carriage
    returns and ordinary spaces) are kept exactly as they are.
    """
    pieces = inp.split("\n")
    return "".join(pieces)

In [9]:
# Strip line feeds from every gloss sentence (element-wise via remove_n).
df["gloss"] = df["gloss"].map(remove_n)

In [10]:
df.head()

Unnamed: 0,gloss,text
0,﻿MEMBERSHIP PARLIAMENT SEE MINUTE,﻿membership of parliament see minutes\n
1,APPROVAL MINUTE DESC-PREVIOUS SIT SEE MINUTE,approval of minutes of previous sitting see mi...
2,MEMBERSHIP PARLIAMENT SEE MINUTE,membership of parliament see minutes\n
3,VERIFICATION CREDENTIALS SEE MINUTE,verification of credentials see minutes\n
4,DOCUMENT RECEIVE SEE MINUTE,documents received see minutes\n


In [11]:
# Strip line feeds from every English sentence (element-wise via remove_n).
df["text"] = df["text"].map(remove_n)

In [12]:
df.head()

Unnamed: 0,gloss,text
0,﻿MEMBERSHIP PARLIAMENT SEE MINUTE,﻿membership of parliament see minutes
1,APPROVAL MINUTE DESC-PREVIOUS SIT SEE MINUTE,approval of minutes of previous sitting see mi...
2,MEMBERSHIP PARLIAMENT SEE MINUTE,membership of parliament see minutes
3,VERIFICATION CREDENTIALS SEE MINUTE,verification of credentials see minutes
4,DOCUMENT RECEIVE SEE MINUTE,documents received see minutes


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [14]:
# Raw sentence arrays. Further down the notebook the English text feeds the
# encoder and the gloss feeds the decoder.
text_texts = df["text"].values
gloss_texts = df["gloss"].values

# Wrap every sentence in start/end markers. Both sides use the same
# "marker + space + sentence + space + marker" layout so that the markers are
# always separate tokens; without the space after '<start>' the marker would
# only be split from the first gloss word if the tokenizer happens to filter
# the '<' / '>' characters.
text_texts = ['<start> ' + sentence + ' <end>' for sentence in text_texts]
gloss_texts = ['<start> ' + sentence + ' <end>' for sentence in gloss_texts]

In [15]:
# Word-level vocabulary for the English side, learned from the marker-wrapped sentences.
# NOTE(review): Keras' default Tokenizer settings lower-case the text and filter
# '<' and '>', so the markers are most likely stored as 'start' / 'end' in
# word_index - confirm before looking them up by name.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(text_texts)

# Every sentence becomes a list of integer word ids.
text_sequences = text_tokenizer.texts_to_sequences(text_texts)

# One extra slot on top of the learned words; id 0 is used as padding later on
# (see pad_sequences and mask_zero=True in the model).
text_vocab_size = 1 + len(text_tokenizer.word_index)

In [16]:
# Word-level vocabulary for the gloss side, learned from the marker-wrapped sentences.
# NOTE(review): Keras' default Tokenizer settings lower-case the text and filter
# '<' and '>', so the markers are most likely stored as 'start' / 'end' in
# word_index - confirm before looking them up by name.
gloss_tokenizer = Tokenizer()
gloss_tokenizer.fit_on_texts(gloss_texts)

# Every gloss sentence becomes a list of integer word ids.
gloss_sequences = gloss_tokenizer.texts_to_sequences(gloss_texts)

# One extra slot on top of the learned words; id 0 is used as padding later on
# (see pad_sequences and mask_zero=True in the model).
gloss_vocab_size = 1 + len(gloss_tokenizer.word_index)

In [17]:
# The longest tokenised sentence on each side fixes the padded width.
max_text_seq_length = max(map(len, text_sequences))
max_gloss_seq_length = max(map(len, gloss_sequences))

# Right-pad every sequence up to that width so both sides become rectangular arrays.
text_sequences = pad_sequences(
    text_sequences, padding='post', maxlen=max_text_seq_length)
gloss_sequences = pad_sequences(
    gloss_sequences, padding='post', maxlen=max_gloss_seq_length)

In [18]:
# Split data into training and validation sets (80% / 20%).
# A fixed random_state keeps the split - and therefore every result computed
# from it - reproducible across kernel restarts.
encoder_input_data, encoder_input_data_val, decoder_input_data, decoder_input_data_val = train_test_split(
    text_sequences, gloss_sequences, test_size=0.2, random_state=42)

# Create decoder target data: the decoder input shifted one step to the left,
# so position t of the target holds the token that follows position t of the
# decoder input (teacher forcing). The last column is left at 0, the padding id.
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# Same shift for the validation split.
decoder_target_data_val = np.zeros_like(decoder_input_data_val)
decoder_target_data_val[:, :-1] = decoder_input_data_val[:, 1:]

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define the dimensions
# latent_dim is used below both as the embedding width and as the LSTM state size.
latent_dim = 256

# Define the encoder
# Variable-length sequences of integer token ids (the tokenised English text).
encoder_inputs = Input(shape=(None,))
# mask_zero=True treats id 0 (the value used for padding) as a masked timestep.
encoder_embedding = Embedding(text_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# Only the final hidden/cell states are carried forward; encoder_outputs is not
# used again in this cell.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder
# Variable-length sequences of integer token ids (the tokenised gloss).
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(gloss_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
# return_sequences=True yields one output per timestep; the returned states are
# discarded here because training only needs the per-step outputs.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-timestep probability distribution over the gloss vocabulary.
decoder_dense = Dense(gloss_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the Seq2Seq model
# Inputs: [encoder token ids, decoder token ids]; output: per-step softmax.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
# The sparse loss is used because the targets are integer ids, not one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Train the model
# Teacher forcing: the decoder is fed decoder_input_data and trained to emit
# decoder_target_data (the same sequences shifted by one step). Runs for a
# single epoch and reports validation loss on the held-out split.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=64, epochs=1, validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val))







<keras.src.callbacks.History at 0x2aa523af010>

In [1]:
# Necessary imports
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# The inference models are assembled from the graph built in the training cell.
# `latent_dim`, `encoder_inputs`, `encoder_states`, `decoder_inputs`,
# `decoder_embedding`, `decoder_lstm` and `decoder_dense` must therefore already
# exist in the kernel (run the training cell first). Creating fresh Input /
# Embedding / LSTM / Dense objects here would produce randomly initialised
# layers that share nothing with the trained model, and overriding
# `gloss_vocab_size` with a placeholder would desynchronise the output layer
# from the tokenizer vocabulary.

# Encoder model for inference: token ids -> final LSTM states [h, c].
# `encoder_states` already comes from the trained embedding + LSTM.
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference: the previous states are explicit inputs so the
# decoder can be advanced one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# `decoder_embedding` is the trained embedding already applied to
# `decoder_inputs`; it is reused as-is so the learned weights are kept.
decoder_embedding_inf = decoder_embedding
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding_inf, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse word index for decoding (integer id -> word).
reverse_text_word_index = {i: word for word, i in text_tokenizer.word_index.items()}
reverse_gloss_word_index = {i: word for word, i in gloss_tokenizer.word_index.items()}

max_seq_length = 400  # hard cap on the number of decoding steps

# Function to decode the sequence
def decode_sequence(input_seq):
    """Greedily decode one encoder input sequence into a gloss string.

    Args:
        input_seq: a batch holding a single padded sequence of encoder token
            ids, e.g. ``encoder_input_data[i:i + 1]``.

    Returns:
        The predicted gloss words joined by single spaces. The start and end
        markers are not part of the returned string.

    Raises:
        KeyError: if the start or end marker is missing from
            ``gloss_tokenizer.word_index``.
    """
    word_index = gloss_tokenizer.word_index
    # With the Tokenizer's default filters '<' and '>' are stripped, so the
    # markers are stored as 'start' / 'end'; the bracketed spelling is tried
    # first in case the tokenizer was configured to keep them.
    start_index = word_index.get('<start>', word_index.get('start'))
    end_index = word_index.get('<end>', word_index.get('end'))
    if start_index is None or end_index is None:
        raise KeyError("start/end markers are missing from gloss_tokenizer.word_index")

    # Encode the input as state vectors.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Target sequence of length 1, seeded with the start marker. Leaving it at
    # 0 would feed the padding id, which the embedding masks out.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_index

    decoded_tokens = []
    # Bound the loop by the number of generated tokens (not characters).
    for _ in range(max_seq_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        # Greedy choice: most probable token at the last (only) timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on the end marker, or on the padding id (0), which carries no word.
        if sampled_token_index == end_index or sampled_token_index == 0:
            break

        decoded_tokens.append(reverse_gloss_word_index.get(sampled_token_index, ''))

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the recurrent states over to the next step.
        states_value = [h, c]

    return ' '.join(decoded_tokens)

# Example of using the inference function
# The slice keeps the batch dimension: it selects the single training sequence at index 1.
input_seq = encoder_input_data[1:2]
translation = decode_sequence(input_seq)
# The value printed is the decoder's output, so it is labelled as decoded.
print('Decoded:', translation)


NameError: name 'Model' is not defined