In [60]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

In [61]:
train_data = pd.read_csv('./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv', sep='\t')
train_data

Unnamed: 0,अं,an,3
0,अंकगणित,ankganit,3
1,अंकल,uncle,4
2,अंकुर,ankur,4
3,अंकुरण,ankuran,3
4,अंकुरित,ankurit,3
...,...,...,...
44197,ह्वेनसांग,hiuentsang,1
44198,ह्वेनसांग,hsuantsang,1
44199,ह्वेनसांग,hyensang,1
44200,ह्वेनसांग,xuanzang,1


In [62]:
train_data.columns = ['native_word','latin_word','-']
train_data = train_data[['native_word','latin_word']]
train_data

Unnamed: 0,native_word,latin_word
0,अंकगणित,ankganit
1,अंकल,uncle
2,अंकुर,ankur
3,अंकुरण,ankuran
4,अंकुरित,ankurit
...,...,...
44197,ह्वेनसांग,hiuentsang
44198,ह्वेनसांग,hsuantsang
44199,ह्वेनसांग,hyensang
44200,ह्वेनसांग,xuanzang


In [63]:
input_words = train_data['native_word'].to_list()
target_words = train_data['latin_word'].to_list()

In [64]:
i=0
for word in target_words:
    target_words[i]='\t'+word+'\n'
    

In [65]:
input_characters=set()
target_characters=set()
for word in input_words:
    for char in word:
        if char not in input_characters:
            input_characters.add(char)
for word in target_words:
    for char in word:
        if char not in target_characters:
            target_characters.add(char)

In [81]:
input_characters.add(" ")
target_characters.add(" ")

In [82]:
input_words[0]
print(input_words[0])
for char in input_words[0]:
    print(char)

अंकगणित
अ
ं
क
ग
ण
ि
त


In [83]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [84]:
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_words])
max_decoder_seq_length = max([len(txt) for txt in target_words])

In [85]:
print("Number of samples:", len(input_words))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 44202
Number of unique input tokens: 64
Number of unique output tokens: 29
Max sequence length for inputs: 19
Max sequence length for outputs: 20


In [91]:
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 44202  # Number of samples to train on.

In [87]:

# the input is a tensor, the 1st dimension is the number of training examples, 2nd dimension is the maximum word length, 3rd dimension is the vocabulary 
# i think the input size is still flexible, this is just asking the number of states in worst case.
encoder_input_data = np.zeros(
    (len(input_words), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_words), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_words), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

In [88]:


# one hot encoding is done here


for i, (input_text, target_text) in enumerate(zip(input_words, target_words)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            # example:  "\t dog", this will be decoder_input, "dog \n", this should be decoder_target.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0

In [89]:
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [92]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Save model
model.save("s2s")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: s2s\assets


INFO:tensorflow:Assets written to: s2s\assets


In [21]:
model = keras.Sequential()
# Add an Embedding layer expecting input vocab of size 1000, and
# output embedding dimension of size 64.
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# Add a LSTM layer with 128 internal units.
model.add(layers.LSTM(128))

# Add a Dense layer with 10 units.
model.add(layers.Dense(10))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          64000     
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
Total params: 164,106
Trainable params: 164,106
Non-trainable params: 0
_________________________________________________________________
