# Learning date format conversion

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from keras.layers import Dense, Bidirectional, LSTM, RepeatVector
from keras.models import Sequential

from utils import CharacterTable, vectorization, train_val_split, model_inference, fitting_visualize

# Parameters for the model and dataset.
# Number of (question, answer) pairs requested from data_generation.
training_size = 50000

# Maximum length of a question (input) string: data_generation pads questions
# to 20 characters and this value is used as the model's input sequence length.
# (Answers are padded to 10 characters.)
maxlen = 20

## Data Generation

In [None]:
# Lowercase English month names, in calendar order
month = [
  'january', 'february', 'march', 'april', 'may', 'june',
  'july', 'august', 'september', 'october', 'november', 'december',
]

# Lookup table from a month name to its 1-based calendar number (january -> 1)
month_to_ind = {name: number for number, name in enumerate(month, start=1)}

In [None]:
# Generate answer data matching the date in character format
# Create a date in three formats
def data_generation(size, question_len=20, answer_len=10):
  """Generate random (question, answer) date-string pairs.

  Each question is a date written in one of three randomly chosen layouts:

    0: '<month> <day>th, <year>'   e.g. 'october 19th, 1932'
    1: '<year> <month> <day>th'    e.g. '1966 december 4th'
    2: '<day>th <month> <year>'    e.g. '11th september 1946'

  The matching answer is '<year>-<month number>-<day>' without zero padding,
  e.g. '1932-10-19'. Days are drawn from 1..31 regardless of the month and the
  ordinal suffix is always 'th', so strings such as 'february 31th' or '1th'
  can occur.

  Args:
    size: Number of pairs to generate.
    question_len: Width the questions are right-padded to with spaces.
    answer_len: Width the answers are right-padded to with spaces.

  Returns:
    A tuple (questions, answers) of two equally long lists of strings.
  """
  questions = []
  answers = []

  for _ in range(size):
    layout = np.random.randint(0, 3)

    # Inside every branch the random draws are made in the order the fields
    # appear in the question, so a seeded run produces the same data as the
    # earlier implementation that assembled the question in one expression.
    if layout == 0:
      name = str(np.random.choice(month))
      day = np.random.randint(1, 32)
      year = np.random.randint(1900, 2022)
      q = '{} {}th, {}'.format(name, day, year)
    elif layout == 1:
      year = np.random.randint(1900, 2022)
      name = str(np.random.choice(month))
      day = np.random.randint(1, 32)
      q = '{} {} {}th'.format(year, name, day)
    else:
      day = np.random.randint(1, 32)
      name = str(np.random.choice(month))
      year = np.random.randint(1900, 2022)
      q = '{}th {} {}'.format(day, name, year)

    # Build the answer from the drawn fields rather than re-parsing the question
    a = '{}-{}-{}'.format(year, month_to_ind[name], day)

    # ljust never truncates: strings already at or above the width stay as is
    questions.append(q.ljust(question_len))
    answers.append(a.ljust(answer_len))

  return questions, answers

In [None]:
# Build the full dataset of padded question/answer strings
questions, answers = data_generation(training_size)

# Preview the first five pairs to check the three question layouts and the padding
print('Question Samples:\n',questions[:5],'\n')
print('Answer Samples:\n',answers[:5])

Question Samples:
 ['october 19th, 1932  ', 'december 21th, 1915 ', '3th april 2003      ', '11th september 1946 ', '1966 december 4th   '] 

Answer Samples:
 ['1932-10-19', '1915-12-21', '2003-4-3  ', '1946-9-11 ', '1966-12-4 ']


## Vectorization

In [None]:
# Vocabulary for the character encoding: the digits, '-', the lowercase letters,
# ',' and ' ' (questions and answers are padded with spaces, so the space must
# be included) - 39 symbols in total
chars='0123456789-abcdefghijklmnopqrstuvwxyz, '
# CharacterTable comes from utils (not shown here); presumably it maps between characters and indices
ctable=CharacterTable(chars)

In [None]:
# Encode the question/answer strings as arrays (helper from utils, not shown here).
# The literal 10 equals the width answers are padded to in data_generation;
# presumably it is the answer sequence length - confirm against utils.vectorization.
x, y = vectorization(questions, answers, chars, maxlen, 10, ctable)

# Sanity-check the array shapes
print("x shape", x.shape)
print("y shape", y.shape)

x shape (50000, 20, 39)
y shape (50000, 10, 39)


In [None]:
# Split the encoded data into training and validation sets (helper from utils,
# not shown here; judging by the recorded output it also prints the resulting shapes)
x_train, x_val, y_train, y_val = train_val_split(x,y)

Training Data:
(45000, 20, 39)
(45000, 10, 39) 

Validation Data:
(5000, 20, 39)
(5000, 10, 39)


In [None]:
# Show one raw question/answer pair (the second label used to read 'Question:' as well)
print('Question:', questions[53],'\n')
print('Answer:', answers[53],'\n')

# NOTE(review): x_train/y_train come from train_val_split (utils, not shown here);
# whether index 53 there still corresponds to questions[53] depends on how that
# helper orders the samples - confirm before reading these as the same example.
print('Encoded Question:\n\n', x_train[53],'\n')
print('Encoded Answer:\n\n', y_train[53])

Question: 2006 november 7th    

Question: 2006-11-7  

Encoded Question:

 [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

## Modeling

In [None]:
# Build Bidirectional LSTM Sequence Model
def bd_lstm_model(num_layers, units=128, answer_len=10):
  """Build and compile a bidirectional-LSTM encoder / LSTM decoder model.

  The encoder reads an input of shape (maxlen, len(chars)) and returns a single
  vector; RepeatVector copies that vector once per answer position; the decoder
  LSTM layers return full sequences, and the final softmax Dense layer has one
  output unit per symbol in `chars`.

  Args:
    num_layers: Number of stacked decoder LSTM layers.
    units: Hidden size used for every LSTM layer.
    answer_len: Number of output time steps, i.e. the padded answer length.

  Returns:
    The compiled keras Sequential model (categorical cross-entropy loss,
    adam optimizer, accuracy metric).
  """
  model = Sequential()

  # Encoder: without return_sequences the LSTM emits only its final output
  model.add(Bidirectional(LSTM(units), input_shape=(maxlen, len(chars))))

  # Repeat the encoded vector so the decoder gets one input per answer position
  model.add(RepeatVector(answer_len))

  # Decoder: every layer returns the whole sequence
  for _ in range(num_layers):
    model.add(LSTM(units, return_sequences=True))

  # One softmax output over the vocabulary
  model.add(Dense(len(chars), activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

  return model

# Instantiate the model with a single decoder LSTM layer
model=bd_lstm_model(1)

In [None]:
# Training hyperparameters
epochs=5
batch_size=32

# Train the model (helper from utils, not shown here; judging by the recorded
# output it prints sample question / target / prediction lines per iteration)
fitting_visualize(x_train, y_train, x_val, y_val, model, epochs, batch_size, ctable)


Iteration 1
Q may 25th, 1935       T 1935-5-25  x 1955-5-25 
Q march 24th, 1910     T 1910-3-24  x 1900-3-24 
Q 1915 december 18th   T 1915-12-18 x 1911-12-18
Q 26th january 1943    T 1943-1-26  x 1944-1-26 
Q 23th april 1955      T 1955-4-23  v 1955-4-23 
Q december 8th, 1909   T 1909-12-8  x 1900-12-8 
Q november 17th, 1968  T 1968-11-17 x 1966-11-17
Q july 11th, 1907      T 1907-7-11  x 1917-7-11 
Q 2th september 1985   T 1985-9-2   x 1955-9-2  
Q 30th june 1990       T 1990-6-30  v 1990-6-30 

Iteration 2
Q 2014 december 10th   T 2014-12-10 v 2014-12-10
Q 14th february 2006   T 2006-2-14  v 2006-2-14 
Q november 7th, 1938   T 1938-11-7  x 1988-11-7 
Q 1968 november 30th   T 1968-11-30 x 1988-11-30
Q 1905 september 11th  T 1905-9-11  v 1905-9-11 
Q january 7th, 1993    T 1993-1-7   v 1993-1-7  
Q september 6th, 2021  T 2021-9-6   x 2011-9-6  
Q march 31th, 2012     T 2012-3-31  x 2011-3-31 
Q 1976 february 23th   T 1976-2-23  x 1966-2-23 
Q september 24th, 2005 T 2005-9-24  v 2005-

In [None]:
# Build Basic LSTM Sequence Model
def bd_lstm_model(num_layers, units=128, answer_len=10):
  """Build and compile a plain (unidirectional) LSTM encoder / decoder model.

  NOTE(review): despite the `bd_` prefix this builder does not use a
  Bidirectional wrapper, and because it reuses the name of the bidirectional
  builder defined in an earlier cell it replaces that function once this cell
  runs. The name is kept so existing calls keep working; consider renaming it.

  The encoder LSTM reads an input of shape (maxlen, len(chars)) and returns a
  single vector; RepeatVector copies that vector once per answer position; the
  decoder LSTM layers return full sequences, and the final softmax Dense layer
  has one output unit per symbol in `chars`.

  Args:
    num_layers: Number of stacked decoder LSTM layers.
    units: Hidden size used for every LSTM layer.
    answer_len: Number of output time steps, i.e. the padded answer length.

  Returns:
    The compiled keras Sequential model (categorical cross-entropy loss,
    adam optimizer, accuracy metric).
  """
  model = Sequential()

  # Encoder: without return_sequences the LSTM emits only its final output
  model.add(LSTM(units, input_shape=(maxlen, len(chars))))

  # Repeat the encoded vector so the decoder gets one input per answer position
  model.add(RepeatVector(answer_len))

  # Decoder: every layer returns the whole sequence
  for _ in range(num_layers):
    model.add(LSTM(units, return_sequences=True))

  # One softmax output over the vocabulary
  model.add(Dense(len(chars), activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

  return model

# Replace `model` with the plain-LSTM variant, again with a single decoder layer
model=bd_lstm_model(1)

In [None]:
# Same training hyperparameters as for the bidirectional model
epochs=5
batch_size=32

# Train the plain-LSTM model (helper from utils, not shown here; judging by the
# recorded output it prints sample question / target / prediction lines per iteration)
fitting_visualize(x_train, y_train, x_val, y_val, model, epochs, batch_size, ctable)


Iteration 1
Q february 7th, 1952   T 1952-2-7   x 1922-2-5  
Q september 4th, 1930  T 1930-9-4   x 1900-9-4  
Q 1942 may 15th        T 1942-5-15  x 1955-5-25 
Q august 15th, 2000    T 2000-8-15  x 2008-8-18 
Q september 6th, 1992  T 1992-9-6   x 1996-9-9  
Q 1974 april 29th      T 1974-4-29  x 1944-4-24 
Q october 23th, 1944   T 1944-10-23 v 1944-10-23
Q february 26th, 1996  T 1996-2-26  v 1996-2-26 
Q 1932 april 26th      T 1932-4-26  x 1922-4-22 
Q 7th march 1955       T 1955-3-7   x 1955-5-5  

Iteration 2
Q 1989 march 3th       T 1989-3-3   v 1989-3-3  
Q 1960 december 1th    T 1960-12-1  v 1960-12-1 
Q july 1th, 1964       T 1964-7-1   x 1966-7-1  
Q january 21th, 1927   T 1927-1-21  x 1927-1-22 
Q 1928 may 29th        T 1928-5-29  x 1922-5-29 
Q 1987 february 23th   T 1987-2-23  x 1988-2-23 
Q december 5th, 1924   T 1924-12-5  x 1922-12-5 
Q september 12th, 2013 T 2013-9-12  x 2013-9-11 
Q 2012 july 23th       T 2012-7-23  x 2022-7-23 
Q 26th february 1986   T 1986-2-26  x 1988-

In [None]:
# Convert a single hand-written date with the trained model; strip() removes
# surrounding whitespace from the returned string
model_inference('april 5th 2012', model, ctable).strip()

'2012-4-5'

In [None]:
# Probe robustness with misspelled month names and misplaced commas.
# model and ctable are passed explicitly, matching the single-input
# model_inference call earlier in the notebook.
noisy_questions = [
  '2013 sepstemer 25th',
  '2019 julqy 29th',
  '15th octobe, 1944',
  'marh, 22th 1903',
]
for test_no, text in enumerate(noisy_questions, start=1):
  print('Inaccurate Test {}:'.format(test_no), model_inference(text, model, ctable).strip())

Inaccurate Test 1: 2013-9-25
Inaccurate Test 2: 2019-7-29
Inaccurate Test 3: 1944-10-15
Inaccurate Test 4: 1903-3-2
