In [1]:
"""
Training script to train a model on MultiNLI and, optionally, on SNLI data as well.
The "alpha" hyperparamaters set in paramaters.py determines if SNLI data is used in training. If alpha = 0, no SNLI data is used in training. If alpha > 0, then down-sampled SNLI data is used in training. 
"""


%tb

import tensorflow as tf
import os
import importlib
import random
from util import logger
import util.parametersipynb as params
from util.data_processing_ipynb import *
from util.evaluate import *


args = params.argparser("cbow petModel-0 --keep_rate 0.9 --seq_length 25 --emb_train")
FIXED_PARAMETERS = params.load_parameters(args)
test_matched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath)

if os.path.isfile(test_matched):
    test_matched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath)
    test_mismatched = "{}/multinli_0.9/multinli_0.9_test_matched_unlabeled.jsonl".format(args.datapath)
    test_path = "{}/multinli_0.9/".format(args.datapath)
else:
    test_path = "{}/multinli_0.9/".format(args.datapath)
    temp_file = os.path.join(test_path, "temp.jsonl")
    io.open(temp_file, "wb")
    test_matched = temp_file
    test_mismatched = temp_file

modname = FIXED_PARAMETERS["model_name"]
logpath = os.path.join(FIXED_PARAMETERS["log_path"], modname) + ".log"
logger = logger.Logger(logpath)

model = FIXED_PARAMETERS["model_type"]

module = importlib.import_module(".".join(['models', model])) 
MyModel = getattr(module, 'MyModel')

# Logging parameter settings at each launch of training script
# This will help ensure nothing goes awry in reloading a model and we consistenyl use the same hyperparameter settings. 
logger.Log("FIXED_PARAMETERS\n %s" % FIXED_PARAMETERS)


######################### LOAD DATA #############################

logger.Log("Loading data")
training_snli = load_nli_data(FIXED_PARAMETERS["training_snli"], snli=True)
dev_snli = load_nli_data(FIXED_PARAMETERS["dev_snli"], snli=True)
test_snli = load_nli_data(FIXED_PARAMETERS["test_snli"], snli=True)

training_mnli = load_nli_data(FIXED_PARAMETERS["training_mnli"])
dev_matched = load_nli_data(FIXED_PARAMETERS["dev_matched"])
dev_mismatched = load_nli_data(FIXED_PARAMETERS["dev_mismatched"])
# test_matched = load_nli_data(FIXED_PARAMETERS["test_matched"])
# test_mismatched = load_nli_data(FIXED_PARAMETERS["test_mismatched"])

No traceback available to show.
usage: ipykernel_launcher.py [-h] [--datapath DATAPATH] [--ckptpath CKPTPATH]
                             [--logpath LOGPATH] [--emb_to_load EMB_TO_LOAD]
                             [--learning_rate LEARNING_RATE]
                             [--keep_rate KEEP_RATE] [--seq_length SEQ_LENGTH]
                             [--emb_train] [--genre GENRE] [--alpha ALPHA]
                             [--test]
                             {esim,cbow,bilstm,lstm,cnn_model1,cnn_model_2}
                             model_name
ipykernel_launcher.py: error: argument model_type: invalid choice: '/run/user/1000/jupyter/kernel-b61b347d-2fb6-4cc3-8e1d-408b1c1efc06.json' (choose from 'esim', 'cbow', 'bilstm', 'lstm', 'cnn_model1', 'cnn_model_2')


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
# if 'temp.jsonl' in FIXED_PARAMETERS["test_matched"]:
#     # Removing temporary empty file that was created in parameters.py
#     os.remove(FIXED_PARAMETERS["test_matched"])
#     logger.Log("Created and removed empty file called temp.jsonl since test set is not available.")

dictpath_uni = os.path.join(FIXED_PARAMETERS["log_path"], modname) + "_uni" + ".p"
dictpath_bi = os.path.join(FIXED_PARAMETERS["log_path"], modname) + "_bi" + ".p"
dictpath_tri = os.path.join(FIXED_PARAMETERS["log_path"], modname) + "_tri" + ".p"

if not os.path.isfile(dictpath_uni): 
    logger.Log("Building dictionary")
    if FIXED_PARAMETERS["alpha"] == 0:
        word_indices_uni, word_indices_bi, word_indices_tri = build_dictionary_ngrams([training_mnli])
    else:
        word_indices_uni, word_indices_bi, word_indices_tri = build_dictionary_ngrams([training_mnli, training_snli])
    
    logger.Log("Padding and indexifying sentences")
    sentences_to_padded_index_sequences_ngrams(word_indices_uni, word_indices_bi, word_indices_tri, [training_mnli, training_snli, dev_matched, dev_mismatched, dev_snli, test_snli])
    
    pickle.dump(word_indices_uni, open(dictpath_uni, "wb"))
    pickle.dump(word_indices_bi, open(dictpath_bi, "wb"))
    pickle.dump(word_indices_tri, open(dictpath_tri, "wb"))

else:
    logger.Log("Loading dictionary from %s" % (dictpath_uni))
    word_indices_uni = pickle.load(open(dictpath_uni, "rb"))
    word_indices_bi = pickle.load(open(dictpath_bi, "rb"))
    word_indices_tri = pickle.load(open(dictpath_tri, "rb"))
    logger.Log("Padding and indexifying sentences")
    sentences_to_padded_index_sequences_ngrams(word_indices_uni, word_indices_bi, word_indices_tri, [training_mnli, training_snli, dev_matched, dev_mismatched, dev_snli, test_snli])

logger.Log("Loading embeddings")
loaded_embeddings = loadEmbedding_rand(FIXED_PARAMETERS["embedding_data_path"], word_indices_uni)

[1] Building dictionary
[1] Padding and indexifying sentences
[1] Loading embeddings


In [6]:
word_indices_bi[PADDING]

0

In [3]:
training_mnli[0]

{u'annotator_labels': [u'contradiction'],
 u'genre': u'telephone',
 u'gold_label': u'contradiction',
 'label': 2,
 u'pairID': u'116375c',
 u'promptID': u'116375c',
 u'sentence1': u"so when we get like i said you know the next one when she decides to to sell the car she has now i'll end up with the Parisian and my wife will have the newer one of course and",
 u'sentence1_binary_parse': u"( so ( when ( we ( get ( like ( i ( said ( you ( know ( ( ( ( ( the ( next one ) ) ( when ( she ( decides ( to ( to ( sell ( ( the car ) ( she ( ( has now ) i ) ) ) ) ) ) ) ) ) ) ( 'll ( ( end up ) ( with ( the Parisian ) ) ) ) ) and ) ( ( my wife ) ( will ( have ( the ( ( ( newer one ) ( of course ) ) and ) ) ) ) ) ) ) ) ) ) ) ) ) ) )",
 'sentence1_binary_parse_index_sequence': array([ 25495,  59731,  28121,   7290,   8984,  67918,  93900,  41842,
         99973,  65649,  18840,  50916,  59731,  18074,  31276,  75768,
         75768,  11197,  65649,  28575,  18074,    631,  53874,  67918,
        10011

In [47]:
sent = training_mnli[0]['sentence1']

In [50]:
def tokenize(string):
    string = re.sub(r'\(|\)', '', string)
    return string.split()

In [54]:
bigrammed_sent = list(nltk.bigrams(tokenize(sent)))

In [55]:
bigrammed_sent

[(u'so', u'when'),
 (u'when', u'we'),
 (u'we', u'get'),
 (u'get', u'like'),
 (u'like', u'i'),
 (u'i', u'said'),
 (u'said', u'you'),
 (u'you', u'know'),
 (u'know', u'the'),
 (u'the', u'next'),
 (u'next', u'one'),
 (u'one', u'when'),
 (u'when', u'she'),
 (u'she', u'decides'),
 (u'decides', u'to'),
 (u'to', u'to'),
 (u'to', u'sell'),
 (u'sell', u'the'),
 (u'the', u'car'),
 (u'car', u'she'),
 (u'she', u'has'),
 (u'has', u'now'),
 (u'now', u"i'll"),
 (u"i'll", u'end'),
 (u'end', u'up'),
 (u'up', u'with'),
 (u'with', u'the'),
 (u'the', u'Parisian'),
 (u'Parisian', u'and'),
 (u'and', u'my'),
 (u'my', u'wife'),
 (u'wife', u'will'),
 (u'will', u'have'),
 (u'have', u'the'),
 (u'the', u'newer'),
 (u'newer', u'one'),
 (u'one', u'of'),
 (u'of', u'course'),
 (u'course', u'and')]

In [60]:
# bigrams

In [62]:
sent2 = [0]*FIXED_PARAMETERS["seq_length"]

token_sequence = list(nltk.bigrams(tokenize(sent)))
padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence)

for i in range(FIXED_PARAMETERS["seq_length"]):
    if i >= len(token_sequence):
        index = bigrams[PADDING]
    else:
        if token_sequence[i] in bigrams:
            index = bigrams[token_sequence[i]]
        else:
            index = bigrams[UNKNOWN]
    sent2[i] = index

In [63]:
sent2

[619424,
 409875,
 828000,
 264019,
 1341244,
 1171988,
 1081219,
 835027,
 515480,
 21050,
 653008,
 1245533,
 71361,
 743673,
 1288829,
 715075,
 1135351,
 75251,
 433962,
 821216,
 348593,
 87153,
 1,
 1,
 1139144]

In [7]:
bigrams = word_indices_bi.values()

In [18]:
len(word_indices_bi)

1371055

In [19]:
len(word_indices_uni)

100158

In [21]:
bigrams = collections.Counter(word_indices_bi)

In [26]:
most_common = bigrams.most_common(500)
least_common = bigrams.most_common()[-500:]

In [33]:
words_to_get_rid_off = dict(most_common+least_common)

In [36]:
words_to_get_rid_off = words_to_get_rid_off.keys()

In [42]:
bigrams_fin = {k:v for k,v in bigrams.iteritems() if k not in words_to_get_rid_off}

In [56]:
bigrams_fin

{(u'northerner', u'walked'): 1319950,
 (u'from', u'Yoyogi'): 793540,
 (u'president', u'saw'): 124106,
 (u'Hillary', u'said'): 124217,
 (u'band', u'at'): 94449,
 (u'ten', u'substantive'): 894026,
 (u'or', u'interested'): 665892,
 (u'accusations', u'saying'): 1365796,
 (u'in', u'exploring'): 737210,
 (u'walking', u'seems'): 1367970,
 (u'with', u'daffodils'): 220222,
 (u'inmates', u'locked'): 802669,
 (u'her', u'America'): 605121,
 (u'artist', u'colony'): 967090,
 (u'shares', u'correctly'): 894028,
 (u'A', u'veteran'): 441779,
 (u'then', u'burrowed'): 894029,
 (u'their', u'lands'): 1179812,
 (u'dependent', u'solely'): 1204436,
 (u'stays', u'there'): 674066,
 (u'broadcasted', u'on'): 1001412,
 (u'continual', u'perhaps'): 1161583,
 (u'which', u'featured'): 31534,
 (u',', u'Feb.'): 773998,
 (u'advocacy', u'between'): 1128528,
 (u'assumed', u'my'): 856714,
 (u'their', u'seafood'): 1048772,
 (u'wild', u'river'): 673180,
 (u'Devising', u'the'): 942568,
 (u'After', u'Kevin'): 509,
 (u'have', u'e

In [1]:
a = [i for i in range(10)]
b = [i for i in range(20)]

In [2]:
a,b

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [5]:
import pickle

In [7]:
pickle.dump(a,b,open('./test1.p',"wb"))

ValueError: pickle protocol must be <= 2

In [2]:
import tensorflow as tf

In [5]:
sess = tf.InteractiveSession()

In [7]:
val = tf.concat([1,2,3,4],[1,2,3,4],1)

TypeError: expected string or buffer