In [None]:
#!/usr/local/bin/python

# This version of the code trains the attractor connections with a separate
# objective function than the objective function used to train all other weights
# in the network (on the prediction task).

from __future__ import print_function
import itertools
import tensorflow as tf
import numpy as np
import sys
import argparse
import datetime


% load_ext autoreload
% autoreload

from tensorflow_helpers import *
from data_generator import generate_examples, pick_task
from information_trackers import MutInfSaver, WeightSaver, compute_entropy_fullvec, get_mut_inf_for_fullvec, \
    flat_mutual_inf, compute_avg_entropy_vec
from helper_functions import get_batches, load_pretrained_embeddings, \
    get_model_type_str, translate_ids_to_words, \
    save_results, print_into_log, print_some_translated_sentences, \
    get_training_progress_comment
from graph_init import GRU_attractor, TANH_attractor


class EarlyStopper():
    """Patience-based early-stopping tracker for a value that should decrease.

    Args:
        patience_max: tolerated amount of non-improvement, compared against
            ``patience * disp_epoch`` (the call site passes a value in epochs).
        disp_epoch: multiplier applied to the patience counter; the call site
            passes the evaluation interval in epochs.
        min_delta: a non-improving value only counts against the patience when
            it differs from the best value by more than this amount.
    """

    def __init__(self, patience_max, disp_epoch, min_delta=0.03):
        self.best = 1e10  # lowest value seen so far
        self.patience = 0  # number of counted non-improving updates
        self.patience_max = patience_max
        self.display_epoch = disp_epoch
        self.min_delta = min_delta

    def update(self, current):
        """Record a new monitored value.

        A strictly lower value becomes the new best and resets the patience.
        Otherwise the patience grows by one, but only when the value is more
        than ``min_delta`` away from the best.
        """
        if self.best > current:
            self.best = current
            self.patience = 0
        elif abs(self.best - current) > self.min_delta:
            self.patience += 1

    def patience_ran_out(self):
        """Return True when ``patience * display_epoch`` exceeds ``patience_max``.

        Always returns a real bool (the negative case used to fall through and
        return None because of a missing ``return``).
        """
        return self.patience * self.display_epoch > self.patience_max
            
def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The global maximum of `x` is subtracted before exponentiating, which keeps
    the exponentials from overflowing without changing the result.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)  # per-column sum for 2-D input
    return exponentials / normaliser


# Experiment configuration shared by the whole script. The dict is handed to
# the project helpers (pick_task, generate_examples, GRU_attractor, ...) and
# several entries are overwritten by the main loop below ('problem_type',
# 'n_attractor_iterations', 'training_mode'); pick_task also returns a
# (possibly updated) ops dict that replaces this one.
ops = {
    'model_type': "GRU",  # OPTIONS: GRU, TANH
    'hid': 5,  # number of hidden units (mirrored into N_HIDDEN)
    'in': None,  # TBD
    'out': 1,
    #         'batch_size':n_examples, #since the sequences are 1-dimensional it's easier to just run them all at once
    'n_attractor_iterations': 0,  # overwritten per training procedure in the main loop
    'attractor_dynamics': "projection2",  # OPTIONS:  "" (for no attractor dynamics),
    #           "direct" (simple attractor weights applied to hidden states directly, trained with noise addition)
    #           "projection" (project the hidden state into a separate space via weights, do attraction, project back)
    #           "helper_hidden" (hidden-hidden neurons) - IMPORTANT: don't forget to add h_hid number
    # NOTE(review): "projection2" is not among the options listed above -- confirm
    # how graph_init interprets it.
    'h_hid': 10,  # helper hidden size for the "helper_hidden" 'attractor_dynamics' mode
    'attractor_noise_level': 0.2,
    # NOTE(review): "bernoilli" is not one of the listed options (and looks like a
    # misspelling of "bernoulli") -- verify which branch the noise code takes.
    'attractor_noise_type': "bernoilli",  # OPTIONS: "gaussian", "dropout", "random_drop"

    'training_mode': "",  # 'attractor_on_task'; overwritten per training procedure in the main loop

    'attractor_regularization': "l2_norm",  # OPTIONS: "l2_regularization", "l2_norm"
    'attractor_regularization_lambda': 0.05,

    'record_mutual_information': True,
    # overwritten by the main loop, which iterates over a fixed list of tasks
    'problem_type': "majority",  # OPTIONS: parity, parity_length, majority, reber, kazakov, pos_brown, ner_german, sentimend_imdb
    'masking': False,#"seq", "final"
    'prediction_type': 'final', #'seq', 'final'
    'seq_len': 5,

    'save_best_model': True,  # checkpoint the session whenever train accuracy improves
    'reshuffle_data_each_replication': False,  # relevant for POS datasets (since they are loaded from files)
    'test_partition': 0.3,
    # NOTE(review): the prediction optimizer in the main loop is built with a
    # hard-coded learning rate of 0.008 and does not read this value.
    'lrate': 0.003,  # was 0.008

    # NLP related (pos_brown task)
    'bidirectional': False,
    'embedding_size': 100,  # also selects the GloVe file that is loaded
    'load_word_embeddings': True,
    'train_word_embeddings': True,
    'input_type': "embed",  # embed&prior, embed, prior
    'dropout': 0.0  # in range(0,1)
}

# !!!!!!!!!!!!!!!!!!!!!!
# SEQ_LEN = 12 # number of bits in input sequence
# Sizes and noise level mirrored from the configuration dict.
N_HIDDEN = ops['hid']  # number of hidden units; also the width of the attractor target placeholder
N_H_HIDDEN = ops['h_hid']  # not referenced in the visible part of this file
NOISE_LEVEL = ops['attractor_noise_level']
# noise in training attractor net
# if >=0, Gaussian with std dev NOISE_LEVEL
# if < 0, Bernoulli dropout proportion -NOISE_LEVEL
# NOTE(review): NOISE_LEVEL is not referenced in the visible part of this file;
# the convention above cannot be confirmed from here.

# Noise level handed to generate_examples when the data sets are built.
INPUT_NOISE_LEVEL = 0.1
# (The number of attractor time steps is configured through
# ops['n_attractor_iterations']; 0 means no attractor net is trained.)
# ATTR_WEIGHT_CONSTRAINTS = True
# True: make attractor weights symmetric and have zero diag
# False: unconstrained
TRAIN_ATTR_WEIGHTS_ON_PREDICTION = False
# True: train attractor weights on attractor net _and_ prediction
# NOTE(review): not referenced in the visible part of this file; the equivalent
# switch used by the main loop is ops['training_mode'] == 'attractor_on_task'.
REPORT_BEST_TRAIN_PERFORMANCE = True
# True: save the train/test perf on the epoch for which train perf was best
LOSS_SWITCH_FREQ = 1
# how often (in epochs) to switch between attractor
# and prediction loss; 0 trains both losses in every epoch

# Training Parameters

TRAINING_EPOCHS = 5000
N_REPLICATIONS = 1
BATCH_SIZE = 5000
DISPLAY_EPOCH = 200  # evaluate and log every DISPLAY_EPOCH epochs
EARLY_STOPPING_THRESH = 1e-3  # not referenced in the visible part of this file
EARLY_STOPPING_PATIENCE = 1000  # in epochs

# NOTEBOOK CODE
# Created once, outside the task / training-procedure loops, so they are
# shared by every run of the main loop.
WS = WeightSaver()
MIS = MutInfSaver()

######### MAIN CODE #############################################################
#0.02, 0.05, 0.1, 0.2, 0.35, 0.5, 

for problem in ['parity', 'parity_length', 'majority', 'reber']:
    # Sweep over the tasks; each task is run once per training procedure.
    TASK = problem
    ops['problem_type'] = TASK
    for training_procedure in ['no_att', 'att', 'att_on_task']:
        print(TASK, training_procedure)
        # training procedure -> (attractor iterations, training mode)
        _procedure_settings = {
            'no_att': (0, ''),
            'att': (10, ''),
            'att_on_task': (10, 'attractor_on_task'),
        }
        if training_procedure in _procedure_settings:
            (ops['n_attractor_iterations'],
             ops['training_mode']) = _procedure_settings[training_procedure]

        # task (parity, majority, reber, kazakov); pick_task also hands back
        # the ops dict, which replaces the current one.
        (ops, SEQ_LEN, N_INPUT,
         N_CLASSES, N_TRAIN, N_TEST) = pick_task(ops['problem_type'], ops)
        ARCH = ops['model_type']  # hidden layer type: 'GRU' or 'tanh'
        ATTRACTOR_TYPE = ops['attractor_dynamics']
        N_ATTRACTOR_STEPS = ops['n_attractor_iterations']

        # Every run builds its own graph. A tf seed, if wanted, has to be set
        # after this reset, i.e. within the context of the new graph.
        tf.reset_default_graph()

        #
        # PLACEHOLDERS
        #
        if 'pos' in ops['problem_type']:
            # X holds ids that are looked up in the embedding table, and Y holds
            # tag ids (not 1-hot vectors), so neither has a feature dimension.
            X = tf.placeholder("int64", [None, SEQ_LEN], name='X')
            Y = tf.placeholder("int64", [None, SEQ_LEN], name='Y')
        else:
            X = tf.placeholder("float", [None, SEQ_LEN, N_INPUT])
            if ops['problem_type'] == 'ner_german':
                Y = tf.placeholder("int64", [None, SEQ_LEN])
            else:  # single output
                Y = tf.placeholder("float", [None, N_CLASSES])
        attractor_tgt_net = tf.placeholder("float", [None, N_HIDDEN], name='attractor_tgt')

        # Embedding matrix initialization.
        # Only the POS tasks feed ids in X; they are turned into dense inputs
        # (`embed`) through an embedding table and/or a fixed table of priors.
        if 'pos' in ops['problem_type']:
            # Only the vocabulary maps are needed here; the data splits themselves
            # are generated again inside the replication loop.
            [_, _, _, _, _, _, maps] = generate_examples(SEQ_LEN, N_TRAIN, N_TEST,
                                                         INPUT_NOISE_LEVEL, TASK, ops)

            if ops['load_word_embeddings']:
                embeddings_loaded = load_pretrained_embeddings('data/glove.6B.{}d.txt'.format(ops['embedding_size']),
                                                               maps, ops)
                embedding = tf.get_variable("embedding",
                                            initializer=embeddings_loaded,
                                            dtype=tf.float32,
                                            trainable=ops['train_word_embeddings'])
            else:  # initialize randomly
                embedding = tf.get_variable("embedding",
                                            initializer=tf.truncated_normal_initializer(stddev=0.05),
                                            shape=[ops['vocab_size'], ops['embedding_size']],
                                            dtype=tf.float32,
                                            trainable=ops['train_word_embeddings'])
            embed_lookup = tf.nn.embedding_lookup(embedding, X)

            # load priors information
            if ops['input_type'] in ('prior', 'embed&prior'):
                id2prior = maps['id2prior']
                word2id = maps['word2id']
                priors = np.zeros([len(id2prior), len(id2prior[0])]).astype("float32")
                # `word_id` rather than `id`: this runs at module scope, where a
                # loop variable called `id` would replace the builtin for the
                # rest of the script.
                for word_id, prior in id2prior.items():
                    priors[word_id] = prior
                priors_op = tf.get_variable("priors",
                                            initializer=priors,
                                            dtype=tf.float32,
                                            trainable=False)
                prior_lookup = tf.nn.embedding_lookup(priors_op, X)

            if ops['input_type'] == 'embed':
                embed = embed_lookup
            elif ops['input_type'] == 'prior':
                embed = prior_lookup
            elif ops['input_type'] == 'embed&prior':
                embed = tf.concat([embed_lookup, prior_lookup], axis=2)
            else:
                # Fail here instead of leaving `embed` undefined (or stale from a
                # previous iteration of the surrounding loops).
                raise ValueError("Unknown input_type: {!r}; expected 'embed', 'prior' or "
                                 "'embed&prior'".format(ops['input_type']))

        # Graph + all the training variables
        # POS tasks feed the looked-up embeddings, every other task feeds X as is.
        if 'pos' in ops['problem_type']:
            net_inputs = {'X': embed, 'mask': Y, 'attractor_tgt_net': attractor_tgt_net}
        else:
            net_inputs = {'X': X, 'mask': Y, 'attractor_tgt_net': attractor_tgt_net}

        if ops['bidirectional']:
            # NOTE(review): this branch always builds GRU cells, regardless of
            # ops['model_type'].
            G_attractors = {'forw': [], 'back': []}
            # Explicit, ordered suffixes. Indexing `G_attractors.keys()` raises a
            # TypeError on Python 3 and has no guaranteed order on Python 2, so the
            # forward net could end up with the 'back' suffix.
            names = ['forw', 'back']
            # Forward:
            G_forw = GRU_attractor(ops, inputs=net_inputs, direction='forward', suffix=names[0])
            attr_loss_op_forw = G_forw.attr_loss_op
            attr_train_op_forw = G_forw.attr_train_op
            h_clean_seq_flat_forw = G_forw.h_clean_seq_flat  # for computing entropy of states
            h_net_seq_flat_forw = G_forw.h_net_seq_flat  # -> attractor_tgt_net placeholder
            G_attractors['forw'] = {'attr_loss_op': attr_loss_op_forw, "attr_train_op": attr_train_op_forw,
                                    'h_clean_seq_flat': h_clean_seq_flat_forw, 'h_net_seq_flat': h_net_seq_flat_forw}
            G_forw_output = G_forw.output

            # Backward:
            G_back = GRU_attractor(ops, inputs=net_inputs, direction='backward', suffix=names[1])
            attr_loss_op_back = G_back.attr_loss_op
            attr_train_op_back = G_back.attr_train_op
            h_clean_seq_flat_back = G_back.h_clean_seq_flat  # for computing entropy of states
            h_net_seq_flat_back = G_back.h_net_seq_flat  # -> attractor_tgt_net placeholder
            G_attractors['back'] = {'attr_loss_op': attr_loss_op_back, "attr_train_op": attr_train_op_back,
                                    'h_clean_seq_flat': h_clean_seq_flat_back, 'h_net_seq_flat': h_net_seq_flat_back}
            G_back_output = G_back.output

            # Merge: [seq_len, batch_size, n_hid*2]
            # Note that we reverse the backward cell's output to align with original direction
            output = tf.concat([G_forw_output, tf.reverse(G_back_output, axis=[0])], axis=2)

            if ops['dropout'] > 0.0:
                # keep_prob = 1.0 - drop_probability; tf.nn.dropout rescales the
                # kept units by 1/keep_prob automatically.
                output_dropped = tf.nn.dropout(output, keep_prob=1.0 - ops['dropout'])
            else:
                output_dropped = output
            # NOTE(review): `output_dropped` is never consumed -- the projection
            # below reads `output`, so ops['dropout'] currently has no effect.
            # Left unchanged because there is no train/eval switch: wiring it in
            # would also apply dropout during evaluation.

            input_size_final_projection = 2 * ops['hid']
            Y_ = project_into_output(Y, output, input_size_final_projection, ops['out'], ops)

            # LOSS, ACC, & TRAIN OPS
            # NOTE(review): the learning rate is hard-coded; ops['lrate'] is not used here.
            pred_loss_op = task_loss(Y, Y_, ops)
            optimizer_pred = tf.train.AdamOptimizer(learning_rate=0.008)
            prediction_parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TASK_WEIGHTS")
            pred_train_op = optimizer_pred.minimize(pred_loss_op, var_list=prediction_parameters)
            accuracy = task_accuracy(Y, Y_, ops)
        else:
            # TODO: change to map with just ont entry as well.
            G_attractors = {'forw': []}
            names = ['forw']
            # Forward:
            if ops['model_type'] == 'GRU':
                G_forw = GRU_attractor(ops, inputs=net_inputs, direction='forward', suffix=names[0])
            elif ops['model_type'] == 'TANH':
                G_forw = TANH_attractor(ops, inputs=net_inputs, direction='forward', suffix=names[0])
            else:
                # Fail here instead of reusing a stale `G_forw` from an earlier run.
                raise ValueError("Unknown model_type: {!r}; expected 'GRU' or 'TANH'".format(ops['model_type']))
            attr_loss_op_forw = G_forw.attr_loss_op
            attr_train_op_forw = G_forw.attr_train_op
            h_clean_seq_flat_forw = G_forw.h_clean_seq_flat  # for computing entropy of states
            h_net_seq_flat_forw = G_forw.h_net_seq_flat  # -> attractor_tgt_net placeholder
            G_attractors['forw'] = {'attr_loss_op': attr_loss_op_forw, "attr_train_op": attr_train_op_forw,
                                    'h_clean_seq_flat': h_clean_seq_flat_forw, 'h_net_seq_flat': h_net_seq_flat_forw}
            output = G_forw.output

            # Tensors fetched later for the mutual-information records. They are
            # only defined in this (unidirectional) branch.
            h_net_seq_flat = G_forw.h_net_seq_flat # pure cell output (before attractor was applied)
            h_attractor_collection_flat = G_forw.h_attractor_collection
            h_clean_seq_flat = G_forw.h_clean_seq_flat
            input_size_final_projection = ops['hid']
            Y_ = project_into_output(Y, output, input_size_final_projection, ops['out'], ops)

            # LOSS, ACC, & TRAIN OPS
            # NOTE(review): the learning rate is hard-coded; ops['lrate'] is not used here.
            pred_loss_op = task_loss(Y, Y_, ops)
            optimizer_pred = tf.train.AdamOptimizer(learning_rate=0.008)
            prediction_parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TASK_WEIGHTS")
            if ops['training_mode'] == 'attractor_on_task':
                # Let the task loss update the attractor weights as well.
                print("adding attractor params to task training op")
                prediction_parameters += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "ATTRACTOR_WEIGHTS")
            pred_train_op = optimizer_pred.minimize(pred_loss_op, var_list=prediction_parameters)
            accuracy = task_accuracy(Y, Y_, ops)


        # Sign of the targets as floats (0.0 wherever Y == 0). The NER metrics
        # code uses it as a boolean mask over positions.
        # NOTE(review): presumably label id 0 marks padding -- confirm against the data generator.
        mask_op = tf.cast(tf.sign(Y), dtype=tf.float32)
        # Initialize the variables (i.e. assign their default value)
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            # Per-replication results, handed to save_results after all replications.
            # TODO: make a class for all "best" quantities (a lot of space)
            saved_train_acc = []
            saved_test_acc = []
            saved_epoch = []
            saved_att_loss = []
            saved_entropy_final = []
            saved_entropy_final_test = []
            saved_val_acc = []  # stays empty: validation tracking is commented out below
            saved_val_loss = []  # stays empty: validation tracking is commented out below
            saved_traini_loss = []
            
            saver = tf.train.Saver()

            # Start training
            for replication in range(N_REPLICATIONS):
                print("********** replication ", replication, " **********")
                # NOTE(review): created but unused -- the code that updates and
                # queries the early stopper is commented out in the epoch loop.
                early_stopper = EarlyStopper(EARLY_STOPPING_PATIENCE, DISPLAY_EPOCH)
                [X_train, Y_train, X_test, Y_test, X_val, Y_val, maps] = generate_examples(SEQ_LEN, N_TRAIN, N_TEST,
                                                                                           INPUT_NOISE_LEVEL, TASK, ops)


                # Log Path init-n:
                # COMMENT is also read by the save_results call after this loop.
                COMMENT = 'training_procedure_test'
                MODEL_NAME_FILE = '{}_{}.txt'.format(ops['problem_type'],
                                                                                   COMMENT)
                LOG_DIRECTORY = 'experiments/logs/{}'.format(MODEL_NAME_FILE)
                # Despite the names, both are file paths. MODEL_DIRECTORY is used as
                # a log target here and as the checkpoint path for saver.save later.
                MODEL_DIRECTORY = 'experiments/logs/{}_{}'.format(datetime.date.today(), MODEL_NAME_FILE)
                print_into_log(LOG_DIRECTORY, get_model_type_str(ops, N_TRAIN, N_TEST, SEQ_LEN) + ops['training_mode'])
                print_into_log(MODEL_DIRECTORY, get_model_type_str(ops, N_TRAIN, N_TEST, SEQ_LEN), supress=True)

                # Re-initialise all variables so each replication starts fresh.
                sess.run(init)  # Run the initializer

                # True: the next epoch trains the prediction loss; toggled by
                # the epoch loop according to LOSS_SWITCH_FREQ.
                train_prediction_loss = True
                # Snapshot of the metrics at the evaluation with the best train
                # accuracy so far; -1000. guarantees the first evaluation is kept.
                best_train_acc = -1000.
                best_test_acc = 0
                best_entropy = 0.0
                best_entropy_test = 0.0
                best_att_loss = 0
                best_train_loss = 0  # never updated: its assignment is commented out in the epoch loop
                best_val_loss = 0.0
                best_val_acc = 0.0
                best_epoch = 0
                # Epochs are 1-based and run one past TRAINING_EPOCHS; evaluation
                # happens on every epoch where (epoch - 1) is a multiple of
                # DISPLAY_EPOCH, starting with the untrained model at epoch 1.
                for epoch in range(1, TRAINING_EPOCHS + 2):
                    if (epoch - 1) % DISPLAY_EPOCH == 0:
                        # TRAIN set:
                        ploss, train_acc = batch_tensor_collect(sess, [pred_loss_op, accuracy],
                                                                X, Y, X_train, Y_train, BATCH_SIZE)
                        # TEST set:
                        test_acc = batch_tensor_collect(sess, [accuracy], X, Y, X_test, Y_test, BATCH_SIZE)[0]

    #                         # Validation set & Early stopping:
    #                         ploss_val, val_acc = batch_tensor_collect(sess, [pred_loss_op, accuracy],
    #                                                                   X, Y, X_val, Y_val, BATCH_SIZE)

                        # Precision/Recall:
                        def _get_metrics(X_data, Y_data):
                            # Print the compute_f1 result over the positions selected by mask_op.
                            y_pred, y_true, mask_val = batch_tensor_collect(sess, [Y_, Y, mask_op],
                                                                X, Y, X_data, Y_data, BATCH_SIZE)
                            # assumes Y_ is [batch, seq, n_classes] -- TODO confirm
                            y_pred = np.argmax(y_pred, axis=2)

                            Y_pred_flat = np.extract(mask_val.astype(bool), y_pred)
                            Y_test_flat = np.extract(mask_val.astype(bool), y_true)
                            # NOTE(review): the label says PRECISION but the value comes from compute_f1.
                            print("PRECISION:",compute_f1(Y_pred_flat, Y_test_flat, maps['id2tag']))

                        if ops['problem_type'] == 'ner_german':
                            _get_metrics(X_test, Y_test)
                            _get_metrics(X_train, Y_train)

    # #                         print(early_stopper.patience, early_stopper.best, ploss_val)
    #                         early_stopper.update(ploss_val)
    #                         if (epoch > 100) and early_stopper.patience_ran_out():
    #                             print_into_log(LOG_DIRECTORY, "STOPPED EARLY AT {}".format(epoch))
    #                             break

                        # ATTRACTOR(s) LOSS
                        aloss = {}
                        entropy = {}
                        entropy_test = {}
                        # One array per attractor, in G_attractors iteration order.
                        # hid_vals_arr is also the attractor training target used
                        # further down.
                        hid_vals_arr = batch_tensor_collect(sess, [A['h_net_seq_flat'] for att_name, A in
                                                                   G_attractors.items()],
                                                            X, Y, X_train, Y_train, BATCH_SIZE)
                        h_clean_val_arr = batch_tensor_collect(sess, [A['h_clean_seq_flat'] for att_name, A in
                                                                      G_attractors.items()],
                                                               X, Y, X_train, Y_train, BATCH_SIZE)
                        h_clean_val_arr_test = batch_tensor_collect(sess, [A['h_clean_seq_flat'] for att_name, A in
                                                                      G_attractors.items()],
                                                               X, Y, X_test, Y_test, BATCH_SIZE)
                        for i, attractor_name in enumerate(G_attractors.keys()):
                            A = G_attractors[attractor_name]
                            a_loss_val = []
                            # Feed the collected hidden states to the attractor loss in
                            # roughly BATCH_SIZE-sized chunks and average the chunk losses.
                            n_splits = np.max([1, int(len(X_train) / BATCH_SIZE)])
                            for batch_hid_vals in np.array_split(hid_vals_arr[i], n_splits):
                                a_loss_val.append(
                                    sess.run(A['attr_loss_op'], feed_dict={attractor_tgt_net: batch_hid_vals}))
                            # Losses and entropies are stored as formatted strings.
                            aloss[attractor_name] = "{:.4f}".format(np.mean(a_loss_val))

                            entropy[attractor_name] = "{:.4f}".format(
                                compute_entropy_fullvec(h_clean_val_arr[i], ops, n_bins=8))
                            
                        for i, attractor_name in enumerate(G_attractors.keys()):
                            entropy_test[attractor_name] = "{:.4f}".format(
                                compute_entropy_fullvec(h_clean_val_arr_test[i], ops, n_bins=8))    

                        # Print training information:
                        # (the two 0.0 arguments stand in for the disabled validation metrics)
                        print_into_log(LOG_DIRECTORY, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + get_training_progress_comment(epoch, ploss, aloss, 0.0, 0.0, train_acc,
                                                                     test_acc, entropy, entropy_test))
                        # Update the logs:
                        WS.update_conservative(epoch_number=epoch, loss_att=aloss,
                                               loss_task=ploss, acc=test_acc, entropy=entropy)
                        if ops['record_mutual_information']:
                            # NOTE(review): h_attractor_collection_flat and h_clean_seq_flat are
                            # only assigned in the non-bidirectional graph branch, so this
                            # fetch cannot work for a bidirectional model.
                            h_attractor_val, h_clean_val = batch_tensor_collect(sess, [h_attractor_collection_flat, h_clean_seq_flat],
                                                                                X, Y, X_train, Y_train, BATCH_SIZE)
                            MIS.update(ploss, aloss, train_acc, test_acc, np.tanh(hid_vals_arr[0]), h_attractor_val, h_clean_val)

                        # Keep a snapshot of the evaluation with the best train
                        # accuracy so far (strict improvement only).
                        if (train_acc > best_train_acc):
                            best_train_acc = train_acc
                            best_test_acc = test_acc
                            best_att_loss = aloss
                            best_epoch = epoch
    #                         best_val_acc = val_acc

    #                         best_val_loss = ploss_val
    #                         best_train_loss = ploss
                            if ops['save_best_model']:
                                save_path = saver.save(sess, MODEL_DIRECTORY)
                            best_entropy = entropy
                            best_entropy_test = entropy_test
                        # Stop this replication once the training set is fit perfectly.
                        if (train_acc == 1.0):
                            print("Reached Peak!")
                            break
                    # Alternate between the prediction and the attractor loss every
                    # LOSS_SWITCH_FREQ epochs; epoch 1 always trains the prediction loss.
                    if epoch > 1 and LOSS_SWITCH_FREQ > 0 \
                            and (epoch - 1) % LOSS_SWITCH_FREQ == 0:
                        train_prediction_loss = not train_prediction_loss

                    # MODEL TRAINING
                    batches = get_batches(BATCH_SIZE, X_train, Y_train)
                    for (batch_x, batch_y) in batches:
                        if (LOSS_SWITCH_FREQ == 0 or train_prediction_loss):
                            # Optimize the prediction loss over `prediction_parameters`
                            # (these include the attractor weights in 'attractor_on_task' mode)
                            _ = sess.run([pred_train_op],
                                         feed_dict={X: batch_x, Y: batch_y})
                        # In 'attractor_on_task' mode the separate attractor objective is
                        # never trained, and epochs in the attractor phase of the
                        # alternation run no training op at all.
                        if ops['training_mode'] != 'attractor_on_task':
                            if (LOSS_SWITCH_FREQ == 0 or not train_prediction_loss):
                                if (N_ATTRACTOR_STEPS > 0):
                                    # ATTRACTOR(s) training
                                    # NOTE(review): the targets are the hidden states collected
                                    # at the last evaluation epoch (up to DISPLAY_EPOCH epochs
                                    # old), not those of the current batch, and the same step
                                    # is repeated for every batch -- confirm this is intended.
                                    for i, attractor_name in enumerate(G_attractors.keys()):
                                        A = G_attractors[attractor_name]
                                        _ = sess.run(A['attr_train_op'], feed_dict={attractor_tgt_net: hid_vals_arr[i]})

                print("Optimization Finished!")

                # Pair each per-replication history list with the value to record:
                # either the snapshot taken at the best train accuracy, or the
                # metrics of the last evaluation.
                if REPORT_BEST_TRAIN_PERFORMANCE:
                    _recorded = [
                        (saved_train_acc, best_train_acc),
                        (saved_test_acc, best_test_acc),
                        (saved_att_loss, best_att_loss),
                        (saved_entropy_final, best_entropy),
                        (saved_entropy_final_test, best_entropy_test),
                        (saved_epoch, best_epoch),
                        (saved_traini_loss, best_train_loss),
                    ]
                else:
                    _recorded = [
                        (saved_train_acc, train_acc),
                        (saved_test_acc, test_acc),
                    ]
                for _history, _value in _recorded:
                    _history.append(_value)

            # Persist the per-replication results of this task / training-procedure
            # run. saved_val_acc and saved_val_loss are passed as empty lists because
            # validation tracking is commented out. COMMENT is assigned inside the
            # replication loop, so this call needs at least one replication to have run.
            save_results(ops, saved_epoch, saved_train_acc, saved_test_acc, saved_att_loss, saved_entropy_final, saved_val_acc,
                 saved_val_loss, saved_traini_loss, N_TRAIN, N_TEST, SEQ_LEN, COMMENT, saved_entropy_final_test)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
parity no_att
L2 norm
********** replication  0  **********
parity
Logged Successfully: 

    model_type: 		GRU bidir(False), task: parity
    hid: 			5,
    h_hid: 			10
    n_attractor_iterations: 	0,
    attractor_dynamics: 	projection2
    attractor_noise_level: 	0.2
    attractor_noise_type: 	bernoilli
    attractor_regu-n: 		l2_norm(lambda:0.05)
    word_embedding: size	(100), train(True)
    dropout: 			0.0
    TRAIN/TEST_SIZE: 	32/32, SEQ_LEN: 5
Logged Successfully: 
Logged Successfully: 
2018-04-16 15:37:38epoch=0; Loss Pred=1.0781; Val Loss=0.0000; Val Acc=0.0000; Loss Att={'forw': '1.0054'}; Train Acc=0.500; Test Acc=0.4531; Entropy={'forw': '2.0409'}; Entropy_Test={'forw': '2.6641'}

Logged Successfully: 
2018-04-16 15:37:40epoch=200; Loss Pred=0.9291; Val Loss=0.0000; Val Acc=0.0000; Loss Att={'forw': '1.0054'}; Train Acc=0.469; Test Acc=0.4531; Entropy={'forw': '2.9966'}; Entropy_Test=