<a href="https://colab.research.google.com/github/girish445ai/Recurrent_Neural_networks/blob/main/SeqtoSeq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
import pandas as pd
import keras
from keras.layers import Input, LSTM, Dense, Embedding, GRU, SimpleRNN, Dropout, Activation, dot, concatenate, TimeDistributed
from keras.models import Model

!pip3 install tensorflow -qqq
!pip3 install wandb -qqq
import wandb
!wandb login
from wandb.keras import WandbCallback

[K     |████████████████████████████████| 462 kB 4.1 MB/s 
[K     |████████████████████████████████| 1.8 MB 4.1 MB/s 
[K     |████████████████████████████████| 181 kB 22.9 MB/s 
[K     |████████████████████████████████| 144 kB 27.2 MB/s 
[K     |████████████████████████████████| 63 kB 917 kB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
!wget -nc https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

--2022-04-17 04:51:44--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.97.128, 108.177.125.128, 142.250.157.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.97.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2022-04-17 04:52:00 (124 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [None]:
!yes | tar xopf dakshina_dataset_v1.0.tar

In [None]:
# List the lexicon files for the `te` language: the train/dev/test TSV files read by this notebook
!ls dakshina_dataset_v1.0/te/lexicons

te.translit.sampled.dev.tsv   te.translit.sampled.train.tsv
te.translit.sampled.test.tsv


In [None]:
# Paths to the Dakshina lexicon TSV files in the `te` language folder, one per data split.
test_path = './dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.test.tsv'
# The "dev" file is the one bound to the validation path.
val_path = './dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.dev.tsv'
train_path = './dakshina_dataset_v1.0/te/lexicons/te.translit.sampled.train.tsv'

In [None]:
# Special symbols shared by the data-preparation helpers:
#   START_CHAR - start-of-sequence marker (a tab)
#   END_CHAR   - end-of-sequence marker (a newline)
#   BLANK_CHAR - padding symbol (a space)
START_CHAR, END_CHAR, BLANK_CHAR = '\t', '\n', ' '

## Input and output pairs 

In [None]:
def reading_data(data_path, characters = False):
    """Load (input, target) sample pairs from a tab-separated file.

    Every non-empty line is split on tabs. The second column becomes the
    input sample and the first column becomes the target sample; any further
    columns are ignored.

    Args:
        data_path: Path of the UTF-8 encoded TSV file to read.
        characters: When false (default) each sample is returned as a string;
            when true each sample is returned as a list of its characters.

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists.

    Raises:
        IndexError: If a non-empty line has fewer than two columns.
    """
    with open(data_path, "r", encoding="utf-8") as handle:
        contents = handle.read()

    # Drop empty lines (e.g. the one after the final newline) and split the rest into columns.
    rows = [record.split("\t") for record in contents.split("\n") if record != '']

    sources = [columns[1] for columns in rows]
    targets = [columns[0] for columns in rows]
    if not characters:
        return sources, targets
    return [list(word) for word in sources], [list(word) for word in targets]


### Processing the data

In [None]:
def processing_data(input, enc_timesteps, input_char_enc, target = None, dec_timesteps = None, target_char_enc = None):
    """Integer-encode and pad samples for the encoder and, optionally, the decoder.

    The results are index arrays and a one-hot tensor, not embeddings.

    Args:
        input: Iterable of input sequences (strings or lists of characters).
        enc_timesteps: Length every encoder row is padded to.
        input_char_enc: Mapping from input character to integer index; it must
            contain BLANK_CHAR, which is used for padding.
        target: Optional iterable of target sequences.
        dec_timesteps: Length every decoder row is padded to. Each row also
            holds START_CHAR and END_CHAR, so it must be at least
            ``len(sequence) + 2``.
        target_char_enc: Mapping from target character to integer index; it
            must contain START_CHAR, END_CHAR and BLANK_CHAR.

    Returns:
        A tuple ``(encoder_input, decoder_input, decoder_target)``:

        * encoder_input: array of shape (n, enc_timesteps) holding character
          indices padded on the right with the BLANK_CHAR index.
        * decoder_input: array of shape (n, dec_timesteps) laid out as
          START_CHAR, characters, END_CHAR, BLANK_CHAR padding.
        * decoder_target: float32 one-hot array of shape
          (n, dec_timesteps, len(target_char_enc)); step t encodes
          decoder_input at step t + 1 and the last step encodes BLANK_CHAR.

        The two decoder entries are None unless target, dec_timesteps and
        target_char_enc are all given. With no samples the index arrays have
        shape (0,), as NumPy builds them from an empty list.

    Raises:
        ValueError: If a sequence does not fit into its timestep budget.
        KeyError: If a character or special symbol is missing from its mapping.
    """
    blank_in = input_char_enc[BLANK_CHAR]
    encoder_rows = []
    for string in input:
        # A longer row would make the array ragged or wider than requested.
        if len(string) > enc_timesteps:
            raise ValueError(
                "input sequence of length {} does not fit into enc_timesteps={}".format(len(string), enc_timesteps))
        encoder_rows.append([input_char_enc[ch] for ch in string] + [blank_in] * (enc_timesteps - len(string)))
    encoder_input = np.array(encoder_rows)

    decoder_input, decoder_target = None, None
    if target is not None and dec_timesteps is not None and target_char_enc is not None:
        start_idx = target_char_enc[START_CHAR]
        end_idx = target_char_enc[END_CHAR]
        blank_out = target_char_enc[BLANK_CHAR]

        decoder_rows = []
        for string in target:
            # Two extra slots are needed for the START_CHAR and END_CHAR markers.
            if len(string) + 2 > dec_timesteps:
                raise ValueError(
                    "target sequence of length {} (plus start/end markers) does not fit into dec_timesteps={}".format(
                        len(string), dec_timesteps))
            decoder_rows.append([start_idx] + [target_char_enc[ch] for ch in string] + [end_idx]
                                + [blank_out] * (dec_timesteps - len(string) - 2))
        decoder_input = np.array(decoder_rows)
        decoder_target = np.zeros((decoder_input.shape[0], dec_timesteps, len(target_char_enc)), dtype='float32')

        if decoder_rows:
            # Every row is exactly dec_timesteps long here, so one indexed
            # assignment can write the input shifted left by one step.
            sample_idx = np.arange(decoder_input.shape[0])[:, None]
            step_idx = np.arange(dec_timesteps - 1)[None, :]
            decoder_target[sample_idx, step_idx, decoder_input[:, 1:]] = 1.0
            # Nothing follows the final step, so it is marked as padding.
            decoder_target[:, dec_timesteps - 1, blank_out] = 1.0

    return encoder_input, decoder_input, decoder_target


def _register_char(char, char_enc, char_dec):
    """Give `char` the next free index unless it already has one."""
    if char not in char_enc:
        char_enc[char] = len(char_dec)
        char_dec.append(char)


def _scan_strings(strings, char_enc, char_dec, longest, overhead = 0):
    """Register all characters of `strings`; return the largest `len(string) + overhead` seen, but at least `longest`."""
    for string in strings:
        longest = max(longest, len(string) + overhead)
        for char in string:
            _register_char(char, char_enc, char_dec)
    return longest


def encode_decode_characters(train_input, train_target, val_input, val_target):
    """Build the character vocabularies for the input and target data.

    Characters get consecutive integer indices in order of first appearance,
    scanning the training samples before the validation samples. The input
    vocabulary ends with BLANK_CHAR (unless the data already contains it).
    The target vocabulary starts with START_CHAR and is completed with
    END_CHAR and BLANK_CHAR; each special symbol is added only if it is not
    already present, so every symbol has exactly one index and the decode
    list never holds duplicates.

    Args:
        train_input: Training input sequences (strings or lists of characters).
        train_target: Training target sequences.
        val_input: Validation input sequences.
        val_target: Validation target sequences.

    Returns:
        A tuple ``(input_char_enc, input_char_dec, target_char_enc,
        target_char_dec, max_encoder_seq_length, max_decoder_seq_length)``:
        the ``*_enc`` values are dicts mapping character to index, the
        ``*_dec`` values are lists mapping index to character, and the two
        maxima are the longest input length and the longest target length
        plus two (room for START_CHAR and END_CHAR); both are at least 1.

    Side effects:
        Prints sample counts, vocabulary sizes and the maximum lengths.
    """
    # Encoding and decoding of input vocabulary
    input_char_enc, input_char_dec = {}, []
    max_encoder_seq_length = 1
    for split in (train_input, val_input):
        max_encoder_seq_length = _scan_strings(split, input_char_enc, input_char_dec, max_encoder_seq_length)
    _register_char(BLANK_CHAR, input_char_enc, input_char_dec)

    # Encoding and decoding of target vocabulary
    target_char_enc, target_char_dec = {}, []
    _register_char(START_CHAR, target_char_enc, target_char_dec)
    max_decoder_seq_length = 1
    for split in (train_target, val_target):
        # +2 leaves room for the START_CHAR and END_CHAR markers.
        max_decoder_seq_length = _scan_strings(split, target_char_enc, target_char_dec,
                                               max_decoder_seq_length, overhead=2)
    # Guarded registration: a symbol already seen in the data keeps its index.
    _register_char(END_CHAR, target_char_enc, target_char_dec)
    _register_char(BLANK_CHAR, target_char_enc, target_char_dec)

    print("Number of training samples:", len(train_input))
    print("Number of validation samples:", len(val_input))
    print("Number of unique input tokens:", len(input_char_dec))
    print("Number of unique output tokens:", len(target_char_dec))
    print("Max sequence length for inputs:", max_encoder_seq_length)
    print("Max sequence length for outputs:", max_decoder_seq_length)

    return input_char_enc, input_char_dec, target_char_enc, target_char_dec, max_encoder_seq_length, max_decoder_seq_length
