# ANALOGY
## Demonstration Jupyter Notebook

In [1]:
import os
import sys

# Put the directory three levels above the current working directory on
# sys.path (presumably so the `analogy` import that follows can be resolved).
# The path components are passed to os.path.join separately so the platform's
# own separator is used: no per-OS branch is needed, and the source no longer
# contains the invalid escape sequence '\.' that the literal '..\..\..' relied on.
module_path = os.path.abspath(os.path.join('..', '..', '..'))

# Guarded so that re-running this cell does not append duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)
    
from analogy import ANALOGY

# Directory holding the data files. When the ANALOGY_DATA_DIR environment
# variable is set it overrides the machine-specific default below, so the
# notebook can be run on another machine without editing this cell.
# Later cells build file paths by plain string concatenation
# (INPUT_FILE_PATH + <file name>), so the value must end with a path separator;
# os.path.join(..., "") appends one to an override only when it is missing.
_DEFAULT_INPUT_FILE_PATH = "D:\\USC\\CS548\\groupdat\\FB15k\\"
_data_dir_override = os.environ.get("ANALOGY_DATA_DIR")
INPUT_FILE_PATH = (os.path.join(_data_dir_override, "")
                   if _data_dir_override else _DEFAULT_INPUT_FILE_PATH)
# The model file is written to / read from the same directory as the data.
MODEL_FILE_PATH = INPUT_FILE_PATH

# Triple files.
TRAIN_FILE_NAME = "train.txt"
VALIDATION_FILE_NAME = "valid.txt"
WHOLE_FILE_NAME = "whole.txt"
TEST_FILE_NAME = "test.txt"

# Vocabulary (id mapping) files.
RELATIONS_FILE_NAME = "relation2id.txt"
ENTITIES_FILE_NAME = "entity2id.txt"

# File name used when saving and loading the model.
MODEL_FILE_NAME = "analogy.mod"

### Read Data Set - This must be run before executing learn_embeddings
The validation and whole-text files are optional.

In [2]:
# Fresh ANALOGY instance that will read the data set and be trained on it.
algorithm = ANALOGY()

# Map each input role to its full path (data directory + file name).
# The "valid" and "whole" entries are optional and are left disabled here.
train_file_names = {
    role: INPUT_FILE_PATH + file_name
    for role, file_name in (
        ("train", TRAIN_FILE_NAME),
        # Optional: ("valid", VALIDATION_FILE_NAME),
        # Optional: ("whole", WHOLE_FILE_NAME),
        ("relations", RELATIONS_FILE_NAME),
        ("entities", ENTITIES_FILE_NAME),
    )
}

algorithm.read_dataset(train_file_names)


INFO:root:Input Files ...
INFO:root:  entities -----> D:\USC\CS548\groupdat\FB15k\entity2id.txt
INFO:root: relations -----> D:\USC\CS548\groupdat\FB15k\relation2id.txt
INFO:root:     train -----> D:\USC\CS548\groupdat\FB15k\train.txt
INFO:root:Preparing data...
INFO:root:   Done loading data...


### Learn Embeddings - Data Set Must be Read First
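This should take several minutes per epoch (or less, depending on hardware: the run logged below reports about 59 per epoch, presumably in seconds) when using the time-minimizing hyper-parameters below with no filtered evaluation or validation.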

In [3]:
# Hyper-parameters for a quick demonstration run; several values are lowered
# on purpose to keep the training time short.
parameters = dict(
    mode='single',
    epoch=5,
    batch=128,
    lr=0.05,
    dim=40,            # reduced from 200 to save processing time
    negative=1,        # reduced from 5 to save processing time
    opt='adagrad',
    l2_reg=0.001,
    gradclip=5,
    filtered=False,    # filtered output turned off to save processing time
)

algorithm.learn_embeddings(parameters)


INFO:root:Learning Embeddings...
INFO:root:Arguments...
INFO:root:     batch -----> 128
INFO:root:  cp_ratio -----> 0.5
INFO:root:       dim -----> 40
INFO:root:     epoch -----> 5
INFO:root:  filtered -----> False
INFO:root:  gradclip -----> 5
INFO:root:    l2_reg -----> 0.001
INFO:root:        lr -----> 0.05
INFO:root:    margin -----> 1
INFO:root:      mode -----> single
INFO:root:     nbest -----> 10
INFO:root:  negative -----> 1
INFO:root:       opt -----> adagrad
INFO:root: save_step -----> 30
INFO:root:setup trainer...
INFO:root:start 1 epoch
INFO:root:training loss in 1 epoch: 2562.1351944025305
INFO:root:training time in 1 epoch: 58.981436014175415
INFO:root:start 2 epoch
INFO:root:training loss in 2 epoch: 2134.851779754586
INFO:root:training time in 2 epoch: 58.9695520401001
INFO:root:start 3 epoch
INFO:root:training loss in 3 epoch: 1819.3407248218366
INFO:root:training time in 3 epoch: 58.697208881378174
INFO:root:start 4 epoch
INFO:root:training loss in 4 epoch: 1681.0978

### Save Model - Optional

In [4]:
# Write the trained model to MODEL_FILE_NAME inside MODEL_FILE_PATH
# (MODEL_FILE_PATH is the same directory as the input data).
saved_model_location = MODEL_FILE_PATH + MODEL_FILE_NAME
algorithm.save_model(saved_model_location)


Saving model: D:\USC\CS548\groupdat\FB15k\analogy.mod
  Model saved:


### Load Model - Optional
This loads a previously saved model for evaluation or other purposes. It is not required if the current instance of ANALOGY has already loaded a dataset and learned the embeddings for that dataset.

In [5]:
# Second, independent ANALOGY instance populated from the model file on disk
# rather than by reading a data set and training.
model_to_load = MODEL_FILE_PATH + MODEL_FILE_NAME
new_alg = ANALOGY()
new_alg.load_model(model_to_load)


Loading model: D:\USC\CS548\groupdat\FB15k\analogy.mod
   Model loaded


### Add Some Test Data

In [6]:
# Three hand-picked (subject, relation, object) examples, written row-wise so
# each triple can be read as a unit, then split into the three parallel lists
# used by the cells that follow.
test_triples = [
    ('/m/08mbj32',
     '/location/statistical_region/religions./location/religion_percentage/religion',
     '/m/0631_'),
    ('/m/08mbj5d',
     '/military/military_conflict/combatants./military/military_combatant_group/combatants',
     '/m/0d060g'),
    ('/m/08mg_b',
     '/award/award_category/winners./award/award_honor/ceremony',
     '/m/01bx35'),
]
test_subs, test_rels, test_obs = (list(column) for column in zip(*test_triples))


### Print Embeddings for sub, rel, object
Each entity & relation has 3 embeddings. Two embeddings are used for ComplEx part of Hybrid score measure (one a vector of real numbers and the other a vector of imaginary numbers). The other embedding is used for DistMult part of Hybrid score measure which only has a real number.  

The output of each entity/relation retrieval function is a tuple of arrays, each holding one embedding per item of the input list, in this order: the real part of the complex embedding (ComplEx), the imaginary part of the complex embedding (ComplEx), and the real-valued embedding (DistMult).  

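The length of each embedding vector will be equal to the number of dimensions used as a hyper-parameter to the training model divided by 2 (since complex-valued embeddings are stored as separate real and imaginary vectors). For example, with `dim` = 40 as set above, each embedding vector has length 20.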

In [7]:
# Look up the embeddings of the test subjects, relations and objects in the
# loaded model and print each result under its own heading.
subs = new_alg.retrieve_entity_embeddings(test_subs)
first_subject_vector = subs[0][0]  # first array of the tuple, first input item
print("Length of embedding vector: {}, should equal number of dimensions of model assumption/2".format(len(first_subject_vector)))
print("Subjects:", subs, sep="\n")

rels = new_alg.retrieve_relations_embeddings(test_rels)
print("Relations:", rels, sep="\n")

objs = new_alg.retrieve_entity_embeddings(test_obs)
print("Objects:", objs, sep="\n")

Length of embedding vector: 20, should equal number of dimensions of model assumption/2
Subjects:
(array([[-0.07120496, -0.19308252, -0.82269281, -0.04814764,  0.27663849,
        -0.10169495, -0.01835223,  0.24143365, -0.9520605 ,  0.25659713,
         0.73273715,  0.38150959,  0.24872684,  0.08031953, -0.09623907,
         0.35290379, -0.10382456,  1.15660642, -0.55745059, -0.62715406],
       [-0.36286615,  0.4374186 , -0.26852091,  0.14522091,  0.01864142,
         0.28335949, -0.8334753 , -0.10990934, -0.36574102,  0.81517703,
        -0.12858646,  0.04630231,  0.32930097, -0.34123059, -0.24332062,
        -0.51826207,  0.06260657,  0.91959568, -0.4766628 , -0.41319964],
       [-0.26112736, -0.07409245,  0.33418486,  0.0610582 ,  0.37208818,
        -0.28557152,  0.6751504 ,  0.13817221,  0.09367402,  0.31974399,
        -0.2500653 ,  0.23273915,  0.39676781, -0.04312702, -0.16663712,
         0.04960696, -0.32102902,  0.34021685,  0.32136055,  0.19286348]]), array([[ 0.01399415,

### Scoring Matrix for Test Data
The length of each scoring vector (there are 3 for the above test data, one for each subject/relation pair passed to `retrieve_scoring_matrix`) is equal to the number of entities in the vocabulary. For example, for FB15k this is 14,951. 


In [8]:
# Score the test (subject, relation) pairs.
# NOTE(review): this cell queries `algorithm` (the instance trained in this
# session) while the embedding and evaluation cells query `new_alg` (the
# instance loaded from disk) - confirm that mixing the two is intended.
sm = algorithm.retrieve_scoring_matrix(test_subs, test_rels)
print(sm)

# One row per input pair.
row_count = len(sm)
print("Number of test triplets in data: {}".format(row_count))

# Length of the first row, i.e. how many scores each pair receives.
first_row = sm[0]
print("Length of a scoring vector: {}".format(len(first_row)))

[[ 0.53771241 -0.91870999 -0.71549855 ...  0.6032623   0.88743655
  -0.03763333]
 [ 1.61292955 -0.28554807 -1.56477576 ... -0.31596579  0.39535818
   0.21878598]
 [ 0.58576572 -0.25167187  0.17536631 ...  0.28545742  0.26014282
   0.04020717]]
Number of test triplets in data: 3
Length of a scoring vector: 14951


### Evaluation of Trained Model
Produces evaluation metrics for the model; the whole-text file is optional.

In [9]:
# Files used for evaluation: the test triples plus the optional "whole" file.
evaluate_file_names = dict(
    test=INPUT_FILE_PATH + TEST_FILE_NAME,
    whole=INPUT_FILE_PATH + WHOLE_FILE_NAME,  # Optional
)
all_res = new_alg.evaluate(evaluate_file_names)

# Print the metrics in alphabetical order of their names, one per line,
# with the name left-aligned in a 20-character column.
for metric, value in sorted(all_res.items()):
    print('{:20s}: {}'.format(metric, value))

Running model evaluation
loading whole graph...
Hits@1              : 0.06493033806774898
Hits@1(filter)      : 0.11229706624231857
Hits@10             : 0.23489529549186572
Hits@10(filter)     : 0.3098474716866144
Hits@3              : 0.12752450440994736
Hits@3(filter)      : 0.20241742987252628
MRR                 : 0.12134882599117944
MRR(filter)         : 0.1809578329476909
