In [1]:
import random
import os
import time
from data.load_fb15k237 import load_fb15k, load_fb15k_type_constraints, split_relations
from sampler import *
from eval import eval_triples
from model import *
from model.comp_models import *
import sys
from kb import subsample_kb
import shutil
import json
from tensorflow.models.rnn.rnn_cell import *

In [2]:
# Command-line configuration of the notebook. Every option is registered on the
# TensorFlow flags module and read back through FLAGS below.
flags = tf.app.flags

# data
flags.DEFINE_string(
    'fb15k_dir', "data_rel",
    'data dir containing extracted files of fb15k dataset.')

# model
flags.DEFINE_integer(
    "size", 50,
    "hidden size of model")

# training: optimisation
flags.DEFINE_float(
    "learning_rate", 1e-2,
    "Learning rate.")
flags.DEFINE_float(
    "l2_lambda", 0,
    "L2-regularization raten (only for batch training).")
flags.DEFINE_float(
    "sample_text_prob", 0.935,
    "Probability of sampling text triple (default is ratio of text (emnlp) to kb triples.")
flags.DEFINE_float(
    "learning_rate_decay", 0.5,
    "Learning rate decay when loss on validation set "
    "does not improve.")

# training: batching, iteration limits and seeding
flags.DEFINE_integer(
    "num_neg", 200,
    "Number of negative examples for training.")
flags.DEFINE_integer(
    "pos_per_batch", 100,
    "Number of examples in each batch for training.")
flags.DEFINE_integer(
    "max_iterations", -1,
    "Maximum number of batches during training. "
    "-1 means until convergence")
flags.DEFINE_integer(
    "ckpt_its", -1,
    "Number of iterations until running checkpoint. "
    "Negative means after every epoch.")
flags.DEFINE_integer(
    "random_seed", 1234,
    "Seed for rng.")
flags.DEFINE_integer(
    "subsample_kb", -1,
    "num of entities in subsampled kb. if <= 0 use whole kb")

# training: switches
flags.DEFINE_boolean(
    "kb_only", False,
    "Only load and train on FB relations, ignoring text.")
flags.DEFINE_boolean(
    "batch_train", False,
    "Use batch training.")
flags.DEFINE_boolean(
    "type_constraint", False,
    "Use type constraint during sampling.")

# The default save directory is time-stamped once, when the flag is defined.
flags.DEFINE_string(
    "save_dir",
    "save/" + time.strftime("%d%m%Y_%H%M%S", time.localtime()),
    "Where to save model and its configuration, always last will be kept.")

# model selection and evaluation
flags.DEFINE_string(
    "model", "DistMult",
    "Model architecture or combination thereof split by comma of: "
    "'DistMult', 'DistMult', 'ModelE', 'ModelO', 'ModelN', 'WeightedModelO'")
flags.DEFINE_string(
    "observed_sets", "train_text",
    "Which sets to observe for observed models.")
flags.DEFINE_string(
    "valid_mode", "a",
    "[a,t,nt] are possible. a- validate on all triples, "
    "t- validate only on triples with text mentions, "
    "nt- validate only on triples without text mentions")
flags.DEFINE_string(
    "composition", "BiRNN",
    "'LSTM', 'GRU', 'RNN', 'BoW', 'BiLSTM', 'BiGRU', 'BiRNN'")

FLAGS = flags.FLAGS

# The observed sets are always given as a comma-separated string; keep them as a list.
FLAGS.observed_sets = FLAGS.observed_sets.split(",")

# Several comma-separated model names are turned into a list of names;
# a single name stays a plain string.
if "," in FLAGS.model:
    FLAGS.model = FLAGS.model.split(",")

if FLAGS.batch_train:
    # Batch training cannot be combined with an explicit checkpoint interval.
    assert FLAGS.ckpt_its <= -1, "Do not define checkpoint iterations when doing batch training."
    print("Batch training!")

# Seed TensorFlow and Python's `random` module with the same configured value.
tf.set_random_seed(FLAGS.random_seed)
random.seed(FLAGS.random_seed)

# Load the knowledge base; text triples are loaded unless --kb_only is set.
kb = load_fb15k(FLAGS.fb15k_dir, with_text=not FLAGS.kb_only)

# Optionally shrink the KB (only when a positive size was requested).
if FLAGS.subsample_kb > 0:
    kb = subsample_kb(kb, FLAGS.subsample_kb)

# Type constraints are read from the "types" entry inside the data directory.
if FLAGS.type_constraint:
    print("Loading type constraints...")
    load_fb15k_type_constraints(kb, os.path.join(FLAGS.fb15k_dir, "types"))

# Count facts by the set name stored at index 2 of each fact.
num_kb = num_text = 0
for f in kb.get_all_facts():
    if f[2] == "train_text":
        num_text += 1
    elif f[2] == "train":
        num_kb += 1

print("Loaded data. %d kb triples. %d text_triples." % (num_kb, num_text))
# (num_neg + 1) examples per positive, doubled because both the subject
# and the object loss are trained.
batch_size = 2 * FLAGS.pos_per_batch * (FLAGS.num_neg + 1)

# Random sampler that generates negative samples for the "train" set.
fact_sampler = BatchNegTypeSampler(
    kb,
    FLAGS.pos_per_batch,
    which_set="train",
    neg_per_pos=FLAGS.num_neg,
    type_constraint=FLAGS.type_constraint)

# The "train_text" set gets its own sampler, always without type constraints.
if not FLAGS.kb_only:
    text_sampler = BatchNegTypeSampler(
        kb,
        FLAGS.pos_per_batch,
        which_set="train_text",
        neg_per_pos=FLAGS.num_neg,
        type_constraint=False)

print("Created Samplers.")
train_dir = os.path.join(FLAGS.save_dir, "train")

i = 0

# Collect the first element of every arity-2 "valid" fact. A list (rather than
# the result of map()) is required because len() and random.sample() are applied
# to it below; map() only returns a list on Python 2.
subsample_validation = [valid_fact[0] for valid_fact in kb.get_all_facts_of_arity(2, "valid")]

# Cap the validation set at 5000 randomly chosen entries.
if len(subsample_validation) > 5000:
    subsample_validation = random.sample(subsample_validation, 5000)


# Without a positive checkpoint interval, checkpoint once per epoch of the KB sampler.
if FLAGS.ckpt_its <= 0:
    print("Setting checkpoint iteration to size of whole epoch.")
    FLAGS.ckpt_its = fact_sampler.epoch_size

sess = tf.InteractiveSession()

print("Creating model ...")
# Build the model object from the configured flags.
model = create_model(kb, FLAGS.size, batch_size, num_neg=FLAGS.num_neg, learning_rate=FLAGS.learning_rate,
                     l2_lambda=FLAGS.l2_lambda, is_batch_training=FLAGS.batch_train, type=FLAGS.model,
                     observed_sets=FLAGS.observed_sets, composition=FLAGS.composition)

# Restore previously trained parameters. The default is a machine-specific
# absolute path; set GENIE_KB_CHECKPOINT to point at a checkpoint elsewhere.
checkpoint_path = os.environ.get(
    "GENIE_KB_CHECKPOINT",
    "/Users/mayk/working/rel_extractor/baseline/genie-kb/save/03052016_172546/train/model.ckpt-4641")
model.saver.restore(sess, checkpoint_path)

Loaded data. 13563 kb triples. 66301 text_triples.
Created Samplers.
Setting checkpoint iteration to size of whole epoch.
Creating model ...


In [3]:
from gensim.matutils import argsort
def query(rel, scores, tuples, topn=10):
    """Return the best-scoring tuples whose first element equals ``rel``.

    Args:
        rel: value compared against the first element of every tuple.
        scores: one score per entry of ``tuples``; a numpy array or any
            sequence of numbers.
        tuples: indexable sequence of candidate tuples, aligned with ``scores``.
        topn: maximum number of results to return (default 10).

    Returns:
        A list of ``(tuple, score)`` pairs ordered by descending score. Equal
        scores keep their original relative order. The list is empty when no
        tuple matches ``rel`` or when ``topn`` is not positive.
    """
    import numpy as np  # local import keeps this helper self-contained

    indices = [idx for idx, candidate in enumerate(tuples) if candidate[0] == rel]
    if not indices or topn <= 0:
        return []

    # np.asarray lets plain lists be indexed with a list of positions.
    selected_scores = np.asarray(scores)[indices]
    selected_tuples = [tuples[idx] for idx in indices]

    # A stable sort on the negated scores yields descending order with
    # deterministic handling of ties.
    order = np.argsort(-selected_scores, kind="mergesort")[:topn]
    return [(selected_tuples[best], selected_scores[best]) for best in order]
def make_tuples(rel,vocab):
    """Pair ``rel`` with every ordered combination of two ``vocab`` entries.

    Returns a list of ``(rel, e1, e2)`` triples, iterating ``e1`` in the outer
    loop and ``e2`` in the inner loop; an entry is also paired with itself.
    """
    triples = []
    for first in vocab:
        for second in vocab:
            triples.append((rel, first, second))
    return triples

success
shit ass


In [22]:
# Build a triple for every ordered pair of entries of vocabulary 1 under one
# relation pattern, score them all with the model, and display the
# highest-scoring ones returned by query().
q= '[XXX]:<conj>:and:[YYY]'
tuples = make_tuples(q,model._kb.get_vocab(1))
scores = model.score_triples(sess,tuples)
query(q,scores,tuples)

[(('[XXX]:<conj>:and:[YYY]', '/EVENT', '/CONTACT_INFO/url'),
  6.7448163032531738),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION', '/LOCATION'),
  6.7448163032531738),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/CORPORATION', '/LOCATION'),
  6.5021052360534668),
 (('[XXX]:<conj>:and:[YYY]', '/EVENT', '/CONTACT_INFO'), 6.5021052360534668),
 (('[XXX]:<conj>:and:[YYY]', '/WORK_OF_ART', '/EVENT'), 6.0701150894165039),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/GOVERNMENT', '/LOCATION'),
  5.8778252601623535),
 (('[XXX]:<conj>:and:[YYY]', '/EVENT', '/LOCATION/REGION'),
  5.8778252601623535),
 (('[XXX]:<conj>:and:[YYY]', '/FACILITY/ATTRACTION', '/CONTACT_INFO/url'),
  5.7633728981018066),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION', '/GPE'), 5.7633728981018066),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/GOVERNMENT', '/GPE'),
  5.4421877861022949)]

In [5]:
# Same relation query as the In [22] cell (the recorded output is identical):
# score every ordered pair of vocabulary-1 entries for the pattern and display
# the highest-scoring ones returned by query().
q= '[XXX]:<conj>:and:[YYY]'
tuples = make_tuples(q,model._kb.get_vocab(1))
scores = model.score_triples(sess,tuples)
query(q,scores,tuples)

[(('[XXX]:<conj>:and:[YYY]', '/EVENT', '/CONTACT_INFO/url'),
  6.7448163032531738),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION', '/LOCATION'),
  6.7448163032531738),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/CORPORATION', '/LOCATION'),
  6.5021052360534668),
 (('[XXX]:<conj>:and:[YYY]', '/EVENT', '/CONTACT_INFO'), 6.5021052360534668),
 (('[XXX]:<conj>:and:[YYY]', '/WORK_OF_ART', '/EVENT'), 6.0701150894165039),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/GOVERNMENT', '/LOCATION'),
  5.8778252601623535),
 (('[XXX]:<conj>:and:[YYY]', '/EVENT', '/LOCATION/REGION'),
  5.8778252601623535),
 (('[XXX]:<conj>:and:[YYY]', '/FACILITY/ATTRACTION', '/CONTACT_INFO/url'),
  5.7633728981018066),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION', '/GPE'), 5.7633728981018066),
 (('[XXX]:<conj>:and:[YYY]', '/ORGANIZATION/GOVERNMENT', '/GPE'),
  5.4421877861022949)]

In [7]:
from gensim.models import Word2Vec

# Fetch model.e_obj for every index of vocabulary 1 (one row per entry).
vocab = list(model._kb.get_vocab(1))
e_obj = sess.run(model.e_obj, feed_dict={model._obj_input: np.arange(len(vocab))})
assert len(e_obj) == len(vocab), "expected one embedding row per vocabulary entry"

# Write the embeddings in word2vec text format so gensim can load them:
# a "<number of entries> <vector width>" header followed by one
# "<label> <v1> ... <vn>" line per entry. The header is derived from the data
# because the loader relies on it matching the file contents.
with open('label.bin', 'wb') as f:
    f.write("%d %d\n" % (len(vocab), e_obj.shape[1]))
    for label, vector in zip(vocab, e_obj):
        f.write("%s %s\n" % (label, " ".join(str(x) for x in vector)))

emb = Word2Vec.load_word2vec_format('label.bin', binary=False)




constructing new boss


In [19]:
# Labels whose exported embeddings are most similar to '/CONTACT_INFO',
# shown as (label, similarity) pairs.
emb.most_similar(positive=['/CONTACT_INFO'])

[(u'/LANGUAGE', 0.6813899874687195),
 (u'/LOCATION/LAKE_SEA_OCEAN', 0.6801102161407471),
 (u'/WORK_OF_ART/SONG', 0.67412269115448),
 (u'/SUBSTANCE/DRUG', 0.6518536806106567),
 (u'/WORK_OF_ART/BOOK', 0.6446441411972046),
 (u'/CONTACT_INFO/url', 0.6431254744529724),
 (u'/FACILITY/BRIDGE', 0.6131840944290161),
 (u'/WORK_OF_ART', 0.6089992523193359),
 (u'/SUBSTANCE/CHEMICAL', 0.5996747612953186),
 (u'/FACILITY/HIGHWAY_STREET', 0.5766296982765198)]

In [16]:
# Display the vocabulary of the loaded embeddings (label -> gensim Vocab object).
emb.vocab

{u'/ANIMAL': <gensim.models.word2vec.Vocab at 0x11943d6d0>,
 u'/CONTACT_INFO': <gensim.models.word2vec.Vocab at 0x11943d910>,
 u'/CONTACT_INFO/url': <gensim.models.word2vec.Vocab at 0x11943d8d0>,
 u'/DISEASE': <gensim.models.word2vec.Vocab at 0x11943d590>,
 u'/EVENT': <gensim.models.word2vec.Vocab at 0x11943d650>,
 u'/EVENT/HURRICANE': <gensim.models.word2vec.Vocab at 0x11943ddd0>,
 u'/EVENT/WAR': <gensim.models.word2vec.Vocab at 0x11943da90>,
 u'/FACILITY': <gensim.models.word2vec.Vocab at 0x11943d810>,
 u'/FACILITY/AIRPORT': <gensim.models.word2vec.Vocab at 0x11943df10>,
 u'/FACILITY/ATTRACTION': <gensim.models.word2vec.Vocab at 0x11943d850>,
 u'/FACILITY/BRIDGE': <gensim.models.word2vec.Vocab at 0x11943dd90>,
 u'/FACILITY/BUILDING': <gensim.models.word2vec.Vocab at 0x11943dc50>,
 u'/FACILITY/HIGHWAY_STREET': <gensim.models.word2vec.Vocab at 0x11943de90>,
 u'/GAME': <gensim.models.word2vec.Vocab at 0x11943db90>,
 u'/GPE': <gensim.models.word2vec.Vocab at 0x11943d610>,
 u'/GPE/CITY': 

In [None]:
from data.load_fb15k237 import split_relations

In [None]:
# Collect fact[0][0] of every fact that belongs to the 'train_text' set.
facts = []
for fact in model._kb.get_all_facts():
    if fact[2] == 'train_text':
        facts.append(fact[0][0])
    

In [None]:
# Show the first 100 collected entries next to the result of split_relations()
# for each. The formatted print() call emits the same "<entry> <result>" text
# as the former print statement and is valid on both Python 2 and Python 3.
for f in facts[:100]:
    print("%s %s" % (f, split_relations(f)))

In [6]:
# Requires `emb`, which is created by the cell that loads 'label.bin'. The
# recorded NameError comes from this cell having been executed (In [6]) before
# that cell (In [7]).
emb.most_similar(positive=['/GPE'])

NameError: name 'emb' is not defined

In [None]:
# Number of rows in the embedding matrix fetched from model.e_obj.
len(e_obj)