<a href="https://colab.research.google.com/github/chandragupta0001/NLP/blob/master/NMT/dates_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
!pip install Faker
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from keras.utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt

fake = Faker()
# fake.seed(12345)
random.seed(12345)

FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']


def load_date():
    """
        Loads some fake dates 
        :returns: tuple containing human readable string, machine readable string, and date object
    """
    dt = fake.date_object()

    try:
        human_readable = format_date(dt, format=random.choice(FORMATS),  locale='en_US') # locale=random.choice(LOCALES))
        human_readable = human_readable.lower()
        human_readable = human_readable.replace(',','')
        machine_readable = dt.isoformat()
        
    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

def load_dataset(m):
    """
        Loads a dataset with m examples and vocabularies
        :m: the number of examples to generate
    """
    
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30
    

    for i in tqdm(range(m)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    human = dict(zip(sorted(human_vocab) + ['<unk>', '<pad>'], 
                     list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {v:k for k,v in inv_machine.items()}
    
 
    return dataset, human, machine, inv_machine

Collecting Faker
[?25l  Downloading https://files.pythonhosted.org/packages/51/50/fc971c0d1bbbd6442adf390bd5d354138d187f399ca5d5e0f7cad99314ea/Faker-4.17.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5.9MB/s 
Installing collected packages: Faker
Successfully installed Faker-4.17.1


In [2]:
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 10000/10000 [00:00<00:00, 20878.24it/s]


In [3]:
dataset[:5]

[('13 sep 1990', '1990-09-13'),
 ('30.07.77', '1977-07-30'),
 ('3/23/12', '2012-03-23'),
 ('thursday june 7 2018', '2018-06-07'),
 ('tuesday august 13 1985', '1985-08-13')]

In [5]:
X,Y=zip(*dataset)

In [24]:
X[i]

'tuesday august 13 1985'

In [6]:
import tensorflow as tf
from tensorflow import keras

In [161]:
token_human=tf.keras.preprocessing.text.Tokenizer(char_level=True, oov_token=2
)
token_machine=tf.keras.preprocessing.text.Tokenizer(num_words=11,char_level=True, oov_token=0
)
token_human.fit_on_texts(X)
token_machine.fit_on_texts(Y)

In [44]:
def encode_X(X,human_vocab,token_human,max_len=30):
  human_dates=np.zeros((len(X),max_len,len(human_vocab)))
  for i in range(len(X)):
    a=token_human.texts_to_matrix(X[i])
    human_dates[i]=np.pad(a,[(0,30-a.shape[0]),(0,0)], mode='constant', constant_values=0)

  return  human_dates


def encode_y(y,machine_vocab,token_machine,max_len=10):
  machine_dates=np.empty((len(y),max_len,len(machine_vocab)))
  for i in range(len(y)):
       machine_dates[i]=token_machine.texts_to_matrix(y[i])
  return machine_dates

  


In [45]:
X_train=encode_X(X,human_vocab,token_human)

In [46]:
y_train=encode_y(Y,machine_vocab,token_machine)

In [47]:
print(X_train.shape,y_train.shape)

(10000, 30, 37) (10000, 10, 11)


In [48]:
encoder = keras.models.Sequential([

  keras.layers.Input(shape=(30,37)),
    keras.layers.LSTM(128)
])


In [49]:
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(11, activation="softmax")
])

In [50]:
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(10),
    decoder
])

In [51]:
optimizer = keras.optimizers.Nadam()
model.compile(loss="categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])

In [53]:
history=model.fit(X_train, y_train, epochs=25,validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [141]:
reverse_word_map = dict(map(reversed, token_machine.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

In [157]:
date=encode_X(["30 march 1995"],human_vocab,token_human)
my_texts = list(map(sequence_to_text, model.predict_classes(date).tolist()))
print(my_texts)

[['1', '9', '9', 0, '-', '0', '3', '-', '3', '0']]


In [159]:
model.predict_classes(date)

array([[4, 6, 6, 1, 2, 3, 9, 2, 9, 3]])

In [163]:
print(token_machine.word_index.items())

dict_items([(0, 1), ('-', 2), ('0', 3), ('1', 4), ('2', 5), ('9', 6), ('8', 7), ('7', 8), ('3', 9), ('4', 10), ('5', 11), ('6', 12)])


In [175]:
token_machine.texts_to_matrix(["1995-06-30"])


array([[0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0.]])

In [168]:
token_machine.sequences_to_texts([[4, 6, 6, 1, 2, 3, 9, 2, 9, 3]])

TypeError: ignored

In [173]:
y_train[:10]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.