In [6]:
import argparse
from datetime import date
import sys
from copy import deepcopy

from mealy_trie import Trie
from mealy_machine import Mealy
from utils import *
from mealy_machine import Mealy
import os
import numpy as np
import tensorflow as tf
#tf.config.run_functions_eagerly(True)
from utils import *
import matplotlib.pyplot as plt
from model import Tagger, load_weights
import pickle

In [3]:
def build_fsm_from_dict(id, dict, labels, nfa=False):
    """Turn a labelled word list into a Mealy machine via its prefix tree.

    Args:
        id: Identifier handed unchanged to the ``Mealy`` constructor.
        dict: Collection of input words handed to ``Trie`` (the callers in
            this notebook pass the training corpus).
        labels: Output sequences handed to ``Trie`` alongside ``dict``.
        nfa: Accepted for interface compatibility; not used by this function.

    Returns:
        The ``Mealy`` instance built from the trie's root id, states and arcs.
    """
    prefix_tree = Trie(dict, labels)
    # Original author's note: states are represented in a DFS fashion.
    return Mealy(id, prefix_tree.root.id, prefix_tree.states, prefix_tree.arcs)

In [7]:

def parse_args(argv=None):
    """Parse the options of the extraction experiment.

    Args:
        argv: Optional list of argument strings. With the default ``None``
            argparse reads ``sys.argv[1:]`` (the previous behaviour). Pass an
            explicit list such as ``[]`` when calling from a notebook, where
            ``sys.argv`` holds the kernel's own flags and would make argparse
            exit.

    Returns:
        argparse.Namespace with the attributes ``id``, ``train_length``,
        ``n_train_low``, ``n_train_high``, ``sim_threshold``,
        ``find_threshold``, ``seeds``, ``hidden_size`` and ``eval``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--id", type=int, default=0)
    parser.add_argument("--train_length", type=int, default=10)
    parser.add_argument("--n_train_low", type=int, default=2)
    parser.add_argument("--n_train_high", type=int, default=300)
    parser.add_argument("--sim_threshold", type=float, default=.9)
    # BooleanOptionalAction also generates the negated --no-find_threshold flag.
    parser.add_argument("--find_threshold", default=False, action=argparse.BooleanOptionalAction)
    parser.add_argument("--seeds", type=int, default=1)
    # A layer width is a number of units, so it must parse as an integer
    # (it was declared as float, which let values like 10.5 through).
    parser.add_argument("--hidden_size", type=int, default=10)
    parser.add_argument('--eval', type=str, default="labels")
    return parser.parse_args(argv)

In [11]:

# Experiment identifier; it selects the FSM, dataset and weight files below.
# NOTE(review): this name shadows the built-in `id`; later cells rely on it.
id = 0
print(''.join([
    '\n\n\n',
    '*'*20,
    f' ID {id}: ',
    ' EXTRACTION OF MEALY MACHINE FROM RNN ',
    '*'*20,
    '\n\n\n',
]))

# Accuracy bookkeeping; only train_acc gets a pre-seeded "0" entry.
init_train_acc = {}
init_dev_acc = {}
train_acc = {"0": []}
dev_acc = {}
n_train = range(1)

# Reference machine read from disk.
fsm_filepath = f'./FSMs/fsm{id}.txt'
expected_fsm = getFsm(fsm_filepath)

# Words and their label strings; the two sequences must be aligned.
data_filepath = f'./datasets/dataset{id}.txt'
corpus, labels = get_data(data_filepath)
assert len(corpus) == len(labels)

# Length of the longest word, handed to preprocessing() in a later cell.
max_length = max(map(len, corpus))




******************** ID 0:  EXTRACTION OF MEALY MACHINE FROM RNN ********************





In [12]:
print('Some words of our dataset')
print(f'Corpus: {corpus[:5]}')
print(f'Labels: {labels[:5]}')

# The first `split_index` samples are used for training, the rest for dev.
# NOTE(review): `corpus`/`labels` are overwritten with their training slice,
# so running this cell a second time yields an empty dev set.
split_index = 100
corpus, dev_corpus = corpus[:split_index], corpus[split_index:]
labels, dev_labels = labels[:split_index], labels[split_index:]

corpus_, labels_ = preprocessing(corpus, labels, max_length)
dev_corpus_, dev_labels_ = preprocessing(dev_corpus, dev_labels, max_length)

# The dev mask is computed on the label strings with '2' as second argument
# (presumably the padding label — TODO confirm in utils.masking).
dev_mask = [masking(dev_tags, '2') for dev_tags in dev_labels_]

# Per-character label arrays and the training mask as a plain list.
labels__ = np.array([np.array(list(tag_string)) for tag_string in labels_])
mask = list(map(masking, corpus_))

# Model inputs/targets; `train_sents` tokenizes the raw (unprocessed) words.
x_train = np.array(list(map(tokenization, corpus_)))
train_sents = list(map(tokenization, corpus))
y_train = np.array(list(map(class_mapping, labels_)))
mask_ = np.array(list(map(masking, corpus_)))


Some words of our dataset
Corpus: ['bba', 'abbabbbbba', 'aabaaba', 'bb', 'aa']
Labels: ['111', '1011011111', '1001001', '11', '10']


In [59]:
# Restore the trained tagger from pickled weights and run it on the training inputs.
# '\033[F' is the ANSI "cursor to previous line" escape; the captured notebook
# output shows it as a literal "[F".
print("\n\033[FData Preprocessing... Done\n")

# NOTE(review): the four constructor arguments are magic numbers. The summary
# printed at the end of this cell reports Embedding -> SimpleRNN -> Dense with
# 283 parameters; presumably these are vocabulary/embedding/hidden/class sizes
# — confirm against model.Tagger.
trained_model = Tagger(4, 10, 10, 3)
trained_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The weights file carries a .txt extension but is opened in binary mode and
# read with pickle.
filename = f'./weights/weights{id}.txt'
with open(filename, 'rb') as f:
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load weight files that come from a trusted source.
    weights = pickle.load(f)

print('\033[FModel definition... Done\n')

trained_model.set_weights(weights)

print('\033[FModel Update... Done\n')

# Raw model outputs for the training inputs (not arg-maxed).
predictions = trained_model.predict(x_train)
# The string literal below is disabled code kept as a bare expression; it has
# no runtime effect.
"""predictions = predictions.argmax(axis=-1)
pred_labels = nparray_to_string(predictions, mask)
print(pred_labels[:5])"""
# Placeholder: with the decoding above disabled, pred_labels is just the int 0,
# so it is not usable as a label list by later cells.
pred_labels =0
print(labels[:5])

# NOTE(review): no trie is built in this cell; the message only reports progress.
print('\033[FTrie Building... Done\n')
trained_model.summary()


[FData Preprocessing... Done

[FModel definition... Done

[FModel Update... Done

['111', '1011011111', '1001001', '11', '10']
[FTrie Building... Done

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 10)          40        
                                                                 
 simple_rnn_9 (SimpleRNN)    (None, None, 10)          210       
                                                                 
 dense_9 (Dense)             (None, None, 3)           33        
                                                                 
Total params: 283
Trainable params: 283
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Which label source feeds the prefix tree: 'preds' uses the model's predicted
# labels, anything else uses the gold labels.
# NOTE(review): this name shadows the built-in `eval`.
eval = 'labels'

# Only the selected operand of the conditional expression is evaluated, so
# `pred_labels` is touched solely in 'preds' mode.
redundant_fsm = build_fsm_from_dict(
    id,
    corpus,
    pred_labels if eval == 'preds' else labels,
)
# Disabled sanity checks from the original cell: score_all_prefixes(...) of the
# trie against the other label source == 100.0, and redundant_fsm.print().


The number of states of the Trie: 214


********************* Prefix Tree of Mealy Machine corpus **********************


Number of states: 214
Number of transitions: 213
Initial state: 0
Input vocabulary: ['b', 'a']
Output vocabulary: ['1', '0']


First 10 over 213 transitions of the Tree: 
-> 0 --> b/1 --> 1
-> 1 --> b/1 --> 2
-> 2 --> a/1 --> 3
-> 0 --> a/1 --> 4
-> 4 --> b/0 --> 5
-> 5 --> b/1 --> 6
-> 6 --> a/1 --> 7
-> 7 --> b/0 --> 8
-> 8 --> b/1 --> 9
-> 9 --> b/1 --> 10

********************************************************************************




In [60]:
print('\033[FChecking if the trie get the right ouput for each input... Done\n')

# Remove the Dense classification head so that predict() returns the output of
# the recurrent layer instead of class probabilities.
# The pop is guarded: an unconditional pop() removed one more layer every time
# this cell was re-run (the SimpleRNN on the second run), silently changing
# what `representations` contains.
if isinstance(trained_model.layers[-1], tf.keras.layers.Dense):
    trained_model.pop()
    trained_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): `trained_model` is mutated in place; after this cell it no
# longer produces label predictions (use `predictions` computed earlier).
representations = trained_model.predict(x_train)

[FChecking if the trie get the right ouput for each input... Done



In [61]:
# Sanity check on the hidden-state tensor; the recorded run shows (100, 12, 10),
# presumably (samples, padded length, hidden size) — TODO confirm.
representations.shape

(100, 12, 10)

In [66]:
# Per-position model outputs for the first training sample.
print(predictions[0])
# Hidden vector at position 0 of samples 0 and 1. In the recorded output the
# two vectors are identical — presumably every padded input starts with the
# same symbol; TODO confirm in utils.preprocessing.
print(representations[0][0])
representations[1][0]

[[9.9863178e-01 5.1860930e-04 8.4957550e-04]
 [3.8189924e-04 9.9900287e-01 6.1517052e-04]
 [4.4097751e-04 9.9894959e-01 6.0941861e-04]
 [6.0455268e-04 9.9828053e-01 1.1149200e-03]
 [9.9985671e-01 4.3103890e-05 1.0009547e-04]
 [9.9985230e-01 6.4483196e-05 8.3235398e-05]
 [9.9986792e-01 6.5048866e-05 6.7045614e-05]
 [9.9986625e-01 6.7022040e-05 6.6779590e-05]
 [9.9986839e-01 6.6663924e-05 6.4961052e-05]
 [9.9986839e-01 6.6744593e-05 6.4825021e-05]
 [9.9986851e-01 6.6726396e-05 6.4721251e-05]
 [9.9986851e-01 6.6728557e-05 6.4704283e-05]]
[-0.17211303  0.26192328  0.85746074 -0.09794594 -0.67063165  0.48278904
 -0.94284075 -0.9149922  -0.9720395  -0.5291131 ]


array([-0.17211303,  0.26192328,  0.85746074, -0.09794594, -0.67063165,
        0.48278904, -0.94284075, -0.9149922 , -0.9720395 , -0.5291131 ],
      dtype=float32)

In [63]:

print('\033[FGetting states... Done\n')

# For each training word, the list of trie states returned by return_states().
idx = [redundant_fsm.return_states(sent) for sent in corpus] # maps strings to states
n_states = len(redundant_fsm.states)
# One row per trie state. The width follows the representation tensor instead
# of the previously hard-coded 10, so the table stays valid if the model's
# hidden size changes.
states = np.zeros((n_states, representations.shape[-1]))
states_mask = np.zeros(n_states)

print('\033[FStates Mapping preparation... Done\n')
# Quick visual check that words, masks, hidden vectors and state paths line up.
print(corpus[:5])
print(mask_[:5])
print(representations[0][mask_[0]])
idx[:5]

[FGetting states... Done

[FStates Mapping preparation... Done

['bba', 'abbabbbbba', 'aabaaba', 'bb', 'aa']
[[ True  True  True  True False False False False False False False False]
 [ True  True  True  True  True  True  True  True  True  True  True False]
 [ True  True  True  True  True  True  True  True False False False False]
 [ True  True  True False False False False False False False False False]
 [ True  True  True False False False False False False False False False]]
[[-0.17211303  0.26192328  0.85746074 -0.09794594 -0.67063165  0.48278904
  -0.94284075 -0.9149922  -0.9720395  -0.5291131 ]
 [ 0.36985126  0.38259214 -0.5516439  -0.86095196  0.8985502   0.8429272
   0.91613066 -0.8990081   0.685846    0.06565572]
 [ 0.57503617  0.44572073 -0.46135584 -0.90672046  0.85027635  0.82383704
   0.8736894  -0.80343986  0.6629787   0.18829003]
 [ 0.96302897  0.93109095 -0.7505582   0.08382813  0.7523115   0.90976954
   0.8970763   0.69255567  0.9068847  -0.60051644]]


[[0, 1, 2, 3],
 [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
 [0, 4, 14, 15, 16, 17, 18, 19],
 [0, 1, 2],
 [0, 4, 14]]

In [64]:
# Copy, for every training sample, the masked hidden vectors and labels into
# the rows of the trie states that the sample visits. States that occur in
# several paths (state 0 is in every path shown above) are simply rewritten on
# each later iteration.
for sample in range(len(representations)):
    visited_states = idx[sample]
    keep = mask_[sample]
    states[visited_states] = representations[sample][keep]
    states_mask[visited_states] = labels__[sample][keep]
states

array([[-0.17211303,  0.26192328,  0.85746074, ..., -0.91499221,
        -0.97203952, -0.52911311],
       [ 0.36985126,  0.38259214, -0.55164391, ..., -0.8990081 ,
         0.68584597,  0.06565572],
       [ 0.57503617,  0.44572073, -0.46135584, ..., -0.80343986,
         0.66297871,  0.18829003],
       ...,
       [ 0.57375389,  0.44103438, -0.40026191, ..., -0.9043811 ,
         0.6314857 , -0.26362786],
       [ 0.61830693,  0.51953465, -0.27519906, ..., -0.81907737,
         0.48997593, -0.07041915],
       [ 0.64176989,  0.5943312 , -0.30004212, ..., -0.81972021,
         0.54984033,  0.02144895]])