In [52]:
import pandas as pd
df=pd.read_csv("train.csv")
df.head()

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
0,add 45 and 71,45,71,x+y,116
1,add 53 and 34,53,34,x+y,87
2,add 22 and 35,22,35,x+y,57
3,add 38 and 97,38,97,x+y,135
4,add 87 and 71,87,71,x+y,158


In [53]:
df.tail(3)

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
7997,58 raise thrice,58,3,x^y,195112
7998,81 raise thrice,81,3,x^y,531441
7999,62 raise thrice,62,3,x^y,238328


In [54]:
dt=pd.read_csv("test.csv")
dt.head()

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
0,88 raise twice,,,,
1,add 43 and 11,,,,
2,multiply 33 with 95,,,,
3,96 times 5,,,,
4,increase 43 by 16,,,,


In [55]:
df['statement']=df['statement'].str.replace('thrice','3')
df['statement']=df['statement'].str.replace('twice','2')
dt['statement']=dt['statement'].str.replace('thrice','3')
dt['statement']=dt['statement'].str.replace('twice','2')
df.tail(3)

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
7997,58 raise 3,58,3,x^y,195112
7998,81 raise 3,81,3,x^y,531441
7999,62 raise 3,62,3,x^y,238328


In [0]:
def convert_eqn(eqn):
  '''
  Add a space 
  '''
  elements = list(eqn)
  return ' '.join(elements)

In [0]:
input_exps = list(df['statement'].values)
target_exps = list(df['equation generated'].apply(lambda x: convert_eqn(x)).values)

In [0]:
def preprocess_input(sentence):

  sentence = sentence.lower().strip()
  sentence = re.sub(r"([?.!,’])", r" \1 ", sentence)
  sentence = re.sub(r"([0-9])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  sentence = sentence.rstrip().strip()
  return '<start> ' + sentence + ' <end>'

def preprocess_target(sentence):

  sentence = sentence.lower().strip()
  return '<start> ' + sentence + ' <end>'

In [59]:
import re
preprocessed_input_exps = list(map(preprocess_input, input_exps))
preprocessed_target_exps = list(map(preprocess_target, target_exps))
print(preprocessed_input_exps[:3])
print(preprocessed_target_exps[:3])

['<start> add 4 5 and 7 1 <end>', '<start> add 5 3 and 3 4 <end>', '<start> add 2 2 and 3 5 <end>']
['<start> x + y <end>', '<start> x + y <end>', '<start> x + y <end>']


In [60]:
import tensorflow as tf
def tokenize(lang):
  
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)
  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', truncating='post')

  return tensor, lang_tokenizer

input_tensor, inp_lang_tokenizer = tokenize(preprocessed_input_exps)
len(inp_lang_tokenizer.word_index) 

23

In [61]:
target_tensor, targ_lang_tokenizer = tokenize(preprocessed_target_exps)
len(targ_lang_tokenizer.word_index)

8

In [0]:
import pandas as pd
import re
import os
import time
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import pickle


#train/test
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.01, random_state=42)

In [0]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 512

steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 64
units = 256

vocab_inp_size = len(inp_lang_tokenizer.word_index)+1
vocab_tar_size = len(targ_lang_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [64]:
vocab_inp_size, vocab_tar_size

(24, 9)

In [65]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([512, 8]), TensorShape([512, 5]))

ENCODER - DECODER seq2seq model

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    #get the gru
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [67]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (512, 8, 256)
Encoder Hidden state shape: (batch size, units) (512, 256)


In [0]:
class Attention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(Attention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    
    hidden_with_time_axis = tf.expand_dims(query, 1)

    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    #attention_weights
    attention_weights = tf.nn.softmax(score, axis=1)

    #context vector
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights


attention_layer = Attention(100)

In [69]:
attention_layer = Attention(100)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (512, 256)
Attention weights shape: (batch_size, sequence_length, 1) (512, 8, 1)


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = Attention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [71]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (512, 9)


In [72]:
sample_hidden.shape

TensorShape([512, 256])

In [0]:
optimizer = tf.keras.optimizers.Adam(lr=0.001)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [74]:
#MAKE CHECKPOINT
checkpoint_dir = os.path.join("/content/checkpoint/")
print("Checkpoints directory is", checkpoint_dir)
if os.path.exists(checkpoint_dir):
  print("Checkpoints folder already exists")
else:
  print("Creating a checkpoints directory")
  os.makedirs(checkpoint_dir)


checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)



latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
  epoch_num = int(latest.split('/')[-1].split('-')[-1])
  checkpoint.restore(latest)
else:
  epoch_num = 0

Checkpoints directory is /content/checkpoint/
Checkpoints folder already exists


In [0]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [76]:
EPOCHS = 30

for epoch in range(epoch_num, EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  checkpoint.save(file_prefix = checkpoint_prefix)
  print('Saved epoch: {}'.format(epoch+1))

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 18 Batch 0 Loss 0.0009
Saved epoch: 18
Epoch 18 Loss 0.0009
Time taken for 1 epoch 14.036675930023193 sec

Epoch 19 Batch 0 Loss 0.0008
Saved epoch: 19
Epoch 19 Loss 0.0007
Time taken for 1 epoch 6.931444406509399 sec

Epoch 20 Batch 0 Loss 0.0007
Saved epoch: 20
Epoch 20 Loss 0.0007
Time taken for 1 epoch 6.915988445281982 sec

Epoch 21 Batch 0 Loss 0.0006
Saved epoch: 21
Epoch 21 Loss 0.0006
Time taken for 1 epoch 6.8942553997039795 sec

Epoch 22 Batch 0 Loss 0.0006
Saved epoch: 22
Epoch 22 Loss 0.0005
Time taken for 1 epoch 6.969532489776611 sec

Epoch 23 Batch 0 Loss 0.0005
Saved epoch: 23
Epoch 23 Loss 0.0005
Time taken for 1 epoch 6.939271926879883 sec

Epoch 24 Batch 0 Loss 0.0005
Saved epoch: 24
Epoch 24 Loss 0.0005
Time taken for 1 epoch 6.9743287563323975 sec

Epoch 25 Batch 0 Loss 0.0004
Saved epoch: 25
Epoch 25 Loss 0.0004
Time taken for 1 epoch 6.920475244522095 sec

Epoch 26 Batch 0 Loss 0.0004
Saved epoch: 26
Epoch 26 Loss 0.0004
Time taken for 1 epoch 6.9047210216

In [0]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))



def evaluate_accuracy(inputs):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = ''
    for i in range(len(inputs.numpy()[0])):
      if inputs.numpy()[0][i] != 0:
        sentence += inp_lang_tokenizer.index_word[inputs.numpy()[0][i]] + ' '

    inputs = tf.convert_to_tensor(inputs)
    
    result = ''
    result_seq = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)
    
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result_seq += str(predicted_id) +' '
        
        result += targ_lang_tokenizer.index_word[predicted_id] + ' '

        if targ_lang_tokenizer.index_word[predicted_id] == '<end>':
            return result_seq, result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result_seq, result, sentence, attention_plot

In [0]:
dataset_val = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(BUFFER_SIZE)
dataset_val = dataset_val.batch(1, drop_remainder=True)

def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)



y_true = []
y_pred = []
acc_cnt = 0

a = 0
for (inp_val_batch, target_val_batch) in iter(dataset_val):
  a += 1
  if a % 500 == 0:
    print(a)
    print("Accuracy count: ",acc_cnt)
    print('------------------')
  target_sentence = ''
  for i in target_val_batch.numpy()[0]:
    if i!= 0:
      target_sentence += (targ_lang_tokenizer.index_word[i] + ' ')
  target_sentence = target_sentence.split('<start> ')[1]
  
  y_true.append([target_sentence.split(' ')])

  res_seq, res, sent, att = evaluate_accuracy(inp_val_batch)
  y_pred.append(res.split(' '))
  
  if target_sentence == res:
    acc_cnt += 1

In [79]:
from nltk.translate.bleu_score import corpus_bleu
print('Corpus BLEU score of the model: ', corpus_bleu(y_true, y_pred))
print('Accuracy of the model: ',acc_cnt/len(input_tensor_val))

Corpus BLEU score of the model:  1.0
Accuracy of the model:  1.0


In [0]:
def preprocess_strings(sentence):
  sentence = sentence.lower().strip()
  return '<start> ' + sentence + ' <end>'


def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_strings(sentence)

    inputs = [inp_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang_tokenizer.index_word[predicted_id] + ' '

        if targ_lang_tokenizer.index_word[predicted_id] == '<end>':
          return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot



def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    #predicted_sentence = [targ_lang_tokenizer.index_word[i] for i in list(result()) if (i < len(targ_lang_tokenizer.word_index) and i not in [0,46,47])]    
    return result,sentence

In [81]:
#checking string from VALIDATION SET
check_str = ' '.join([inp_lang_tokenizer.index_word[i] for i in input_tensor_val[10] if i != 0][1:-1])

#check_str = ' '.join([inp_lang_tokenizer.index_word[i] for i in input_tensor_val[10] if i not in [0, len(inp_lang_tokenizer.word_index), len(inp_lang_tokenizer.word_index)]])
check_str

'increase 3 3 by 4 0'

In [82]:
a,b = translate(check_str)
a = a.replace('<end>','')
a

'x + y  '

In [0]:
#test.csv testing  #dt   #PREPROCESSING TEST
def preprocess_ins(sentence):

  sentence = sentence.lower().strip()
  sentence = re.sub(r"([?.!,’])", r" \1 ", sentence)
  sentence = re.sub(r"([0-9])", r" \1 ", sentence)
  sentence = re.sub(r'[" "]+', " ", sentence)
  sentence = sentence.rstrip().strip()
  return sentence

ins = list(dt['statement'].values)
preprocessed_ins = list(map(preprocess_ins, ins))
e=[]
for x in range(80):
  ins_tensor, ins_lang_tokenizer = tokenize(preprocessed_ins)
  test_str = ' '.join([ins_lang_tokenizer.index_word[i] for i in ins_tensor[x] if i!= 0][1:-1])
  t,b = translate(test_str)
  t = t.replace("<end>",'').strip()
  e.append(t)

dt['equation generated'] = e

In [84]:
dt.head(10)

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
0,88 raise 2,,,x ^ y,
1,add 43 and 11,,,x * y,
2,multiply 33 with 95,,,x * y,
3,96 times 5,,,x * y,
4,increase 43 by 16,,,x * y,
5,reduce 95 by 96,,,x * y,
6,58 times 89,,,x * y,
7,subtract 7 from 86,,,x * y,
8,multiply 29 with 41,,,x * y,
9,75 raise 3,,,x ^ y,


In [0]:
dt.statement = dt.statement.str.replace('twice','2').str.replace('thrice','3')
pattern = re.compile(r'([A-Za-z]*\s?)(\d+)\s(\D+)\s(\d+)')
def expp(x,pattern):
  matches = pattern.finditer(x)
  for match in matches:
    return(match.group(2),match.group(4))

l1 = []
l2 = []
for i in dt['statement']:
  # print(i)
  a,b = expp(i,pattern)
  l1.append(int(a))
  l2.append(int(b))

dt['Significant number 1 detected'] = l1
dt['Significant number 2 detected'] = l2

In [92]:
dt.head

<bound method NDFrame.head of               statement  ...  answer evaluated
0            88 raise 2  ...              7744
1         add 43 and 11  ...               473
2   multiply 33 with 95  ...              3135
3            96 times 5  ...               480
4     increase 43 by 16  ...               688
..                  ...  ...               ...
75           37 times 3  ...               111
76   subtract 59 from 8  ...               472
77    increase 67 by 13  ...               871
78  multiply 98 with 81  ...              7938
79      reduce 22 by 31  ...               682

[80 rows x 5 columns]>

In [0]:
def xp(n1, n2, expression):
  x = list(expression)
  x.remove('x')
  x.remove('y')
  if '+' in x:
    return n1 + n2
  elif '-' in x:
    return n1 - n2
  elif '*' in x:
    return n1 * n2
  elif '^' in x:
    return n1**n2

ans = []
for i in range(dt.shape[0]):
  ans.append(xp(dt.loc[i,'Significant number 1 detected'], dt.loc[i,'Significant number 2 detected'],dt.loc[i,'equation generated']))
dt['answer evaluated'] = ans

In [88]:

dt.head()

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
0,88 raise 2,88,2,x ^ y,7744
1,add 43 and 11,43,11,x * y,473
2,multiply 33 with 95,33,95,x * y,3135
3,96 times 5,96,5,x * y,480
4,increase 43 by 16,43,16,x * y,688


In [94]:
df['statement']=df['statement'].str.replace('3','thrice')
df['statement']=df['statement'].str.replace('2','twice')
dt['statement']=dt['statement'].str.replace('3','thrice')
dt['statement']=dt['statement'].str.replace('2','twice')
dt.head()

Unnamed: 0,statement,Significant number 1 detected,Significant number 2 detected,equation generated,answer evaluated
0,88 raise twice,88,2,x ^ y,7744
1,add 4thrice and 11,43,11,x * y,473
2,multiply thricethrice with 95,33,95,x * y,3135
3,96 times 5,96,5,x * y,480
4,increase 4thrice by 16,43,16,x * y,688


In [0]:
dt.to_csv(r'/content/output.csv')