# ANALOGY
## Demonstration Jupyter Notebook

In [3]:
import os
import sys

# Make the directory three levels above the current working directory
# importable, so that the `analogy` module imported below can be found.
# os.pardir joined with os.path.join yields the right separator on every
# platform, so no Windows-specific literal (with its invalid "\." escape
# sequences) is needed.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))

# Only append once so that re-running this cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)
    
from analogy import ANALOGY

# Directory holding the FB15k input files. Set the ANALOGY_DATA_DIR environment
# variable to point the notebook at another location without editing this cell;
# when it is unset or empty, the original hard-coded path is used.
INPUT_FILE_PATH = os.environ.get("ANALOGY_DATA_DIR") or "D:\\USC\\CS548\\groupdat\\FB15k\\"
# File names are appended to this directory by plain string concatenation in the
# cells below, so it has to end with a path separator.
if not INPUT_FILE_PATH.endswith(("/", "\\")):
    INPUT_FILE_PATH += os.sep
# The model file is written to (and read from) the same directory as the data.
MODEL_FILE_PATH = INPUT_FILE_PATH

# Data set splits.
TRAIN_FILE_NAME = "train.txt"
VALIDATION_FILE_NAME = "valid.txt"
WHOLE_FILE_NAME = "whole.txt"
TEST_FILE_NAME = "test.txt"

# Relation and entity id mapping files.
RELATIONS_FILE_NAME = "relation2id.txt"
ENTITIES_FILE_NAME = "entity2id.txt"

# File name used when saving / loading the trained model.
MODEL_FILE_NAME = "analogy.mod"

### Read Data Set - This must be run before executing learn_embeddings
The validation (`valid.txt`) and whole-graph (`whole.txt`) files are optional.

In [4]:
# Instance used by the dataset-reading and training cells of this notebook.
algorithm = ANALOGY()

# (role, file name) pairs describing the input files. The "valid" and "whole"
# entries are optional; "valid" is left disabled here.
dataset_files = [
    ("train", TRAIN_FILE_NAME),
    # ("valid", VALIDATION_FILE_NAME),
    ("whole", WHOLE_FILE_NAME),
    ("relations", RELATIONS_FILE_NAME),
    ("entities", ENTITIES_FILE_NAME),
]
# Every file lives in INPUT_FILE_PATH, which already ends with a separator.
train_file_names = {role: INPUT_FILE_PATH + file_name
                    for role, file_name in dataset_files}

# Must be called before learn_embeddings().
algorithm.read_dataset(train_file_names)


INFO:root:Input Files ...
INFO:root:  entities -----> D:\USC\CS548\groupdat\FB15k\entity2id.txt
INFO:root: relations -----> D:\USC\CS548\groupdat\FB15k\relation2id.txt
INFO:root:     train -----> D:\USC\CS548\groupdat\FB15k\train.txt
INFO:root:     whole -----> D:\USC\CS548\groupdat\FB15k\whole.txt
INFO:root:Preparing data...
INFO:root:   Loading whole graph...
INFO:root:   Done loading data...


### Learn Embeddings - Data Set Must be Read First
This should take several minutes per epoch when using the time-minimizing hyper-parameters below with no filtered evaluation or validation.

In [5]:
# Hyper-parameters handed to learn_embeddings(). Several values are reduced
# on purpose so that the demonstration trains quickly.
parameters = {
    "mode": "single",
    "epoch": 3,
    "batch": 128,
    "lr": 0.05,
    "dim": 40,         # reduced from 200 to save processing time
    "negative": 1,     # reduced from 5 to save processing time
    "opt": "adagrad",
    "l2_reg": 0.001,
    "gradclip": 5,
    # NOTE(review): the accompanying text says filtered output was turned off
    # to save processing time, but the value passed here is True. Confirm
    # which of the two is intended.
    "filtered": True,
}

algorithm.learn_embeddings(parameters)


INFO:root:Learning Embeddings...
INFO:root:Arguments...
INFO:root:     batch -----> 128
INFO:root:       dim -----> 40
INFO:root:     epoch -----> 3
INFO:root:  filtered -----> True
INFO:root:  gradclip -----> 5
INFO:root:    l2_reg -----> 0.001
INFO:root:        lr -----> 0.05
INFO:root:      mode -----> single
INFO:root:  negative -----> 1
INFO:root:       opt -----> adagrad
INFO:root:setup trainer...
INFO:root:start 1 epoch
INFO:root:training loss in 1 epoch: 2562.1351944025305
INFO:root:training time in 1 epoch: 58.310484409332275
INFO:root:start 2 epoch
INFO:root:training loss in 2 epoch: 2134.851779754586
INFO:root:training time in 2 epoch: 58.2355432510376
INFO:root:start 3 epoch
INFO:root:training loss in 3 epoch: 1819.3407248218366
INFO:root:training time in 3 epoch: 58.1825897693634
INFO:root:done all


### Save Model - Optional

In [6]:
# Write the trained model to disk so that it can be loaded again later.
saved_model_path = MODEL_FILE_PATH + MODEL_FILE_NAME
algorithm.save_model(saved_model_path)


Saving model: D:\USC\CS548\groupdat\FB15k\analogy.mod
  Model saved:


### Load Model - Optional
This loads a previously saved model for evaluation or other purposes. It is not required if the current instance of ANALOGY has already loaded a dataset and learned the embeddings for that dataset.

In [7]:
# Load the saved model file into a fresh ANALOGY instance that is separate
# from `algorithm`.
model_to_load = MODEL_FILE_PATH + MODEL_FILE_NAME
new_alg = ANALOGY()
new_alg.load_model(model_to_load)


Loading model: D:\USC\CS548\groupdat\FB15k\analogy.mod
   Model loaded


### Add Some Test Data

In [8]:
# Test data written as one (subject, relation, object) triple per row, so that
# the alignment between the three parallel lists is explicit.
test_triples = [
    ('/m/08mbj32',
     '/location/statistical_region/religions./location/religion_percentage/religion',
     '/m/0631_'),
    ('/m/08mbj5d',
     '/military/military_conflict/combatants./military/military_combatant_group/combatants',
     '/m/0d060g'),
    ('/m/08mg_b',
     '/award/award_category/winners./award/award_honor/ceremony',
     '/m/01bx35'),
]

# Split the triples column-wise into the lists used by the cells below.
test_subs, test_rels, test_obs = (list(column) for column in zip(*test_triples))


### Print Embeddings for sub, rel, object
Each entity & relation has 3 embeddings. Two embeddings are used for ComplEx part of Hybrid score measure (one a vector of real numbers and the other a vector of imaginary numbers). The other embedding is used for DistMult part of Hybrid score measure which only has a real number.  

The output of each entity/relation retrieval function is a tuple of three arrays, each holding one embedding per item of the input list, in this order: real part of the complex number (ComplEx), imaginary part of the complex number (ComplEx), real number (DistMult).  

The length of each embedding vector will be equal to the number of dimensions used as a hyper-parameter to the training model divided by 2 (for example, 20 for the `dim` of 40 used above).

In [9]:
# Embeddings of the test subjects, taken from the reloaded model.
subs = new_alg.retrieve_entity_embeddings(test_subs)
# subs[0][0] is the first embedding vector of the first returned array.
embedding_length = len(subs[0][0])
print("Length of embedding vector: {}, should equal number of dimensions of model assumption/2".format(embedding_length))
print("Subjects:", subs, sep="\n")

# Embeddings of the test relations.
rels = new_alg.retrieve_relations_embeddings(test_rels)
print("Relations:", rels, sep="\n")

# Embeddings of the test objects (entities as well, hence the same call as for subjects).
objs = new_alg.retrieve_entity_embeddings(test_obs)
print("Objects:", objs, sep="\n")

Length of embedding vector: 20, should equal number of dimensions of model assumption/2
Subjects:
(array([[-0.11599567, -0.09913024, -0.56126962, -0.03843246,  0.2027402 ,
        -0.21039491, -0.04347815,  0.18943705, -0.81214096,  0.23792471,
         0.53002726,  0.24625408,  0.16786235,  0.02220208, -0.1450165 ,
         0.26209121, -0.11544381,  0.95422802, -0.51634763, -0.43180553],
       [-0.34467764,  0.38421007, -0.18595466,  0.15601741,  0.02051645,
         0.11464661, -0.7570818 , -0.06532293, -0.43547332,  0.7957481 ,
        -0.09950476,  0.10988647,  0.33239722, -0.32173038, -0.19239841,
        -0.46784771,  0.05252034,  0.74668253, -0.48052335, -0.3940016 ],
       [-0.23951378, -0.00100591,  0.30055812, -0.02605304,  0.31145547,
        -0.32767302,  0.57782853,  0.05607674,  0.01100984,  0.28774833,
        -0.20066594,  0.08106874,  0.31468235, -0.09054843, -0.06231675,
         0.01841071, -0.41544719,  0.37955573,  0.30039227,  0.19988216]]), array([[ 0.02243235,

### Scoring Matrix for Test Data
Each scoring vector (there are 3 for the above test data, one for each subject–relation pair given) has a length equal to the number of entities in the vocabulary. For example, for FB15k this is 14,951.


In [10]:
# One scoring vector per (subject, relation) pair of the test data.
sm = algorithm.retrieve_scoring_matrix(test_subs, test_rels)
print(sm)

# Report the dimensions of the scoring matrix.
n_triplets = len(sm)
scoring_vector_length = len(sm[0])
print("Number of test triplets in data: {}".format(n_triplets))
print("Length of a scoring vector: {}".format(scoring_vector_length))

[[ 0.26593932 -0.53132736 -0.61399813 ...  0.46215713  0.7114059
   0.02068561]
 [ 1.17390789 -0.10383274 -1.00958987 ... -0.15033812  0.41185776
   0.24936485]
 [ 0.48320306 -0.36659813  0.25440443 ...  0.16139368  0.35351311
   0.04187984]]
Number of test triplets in data: 3
Length of a scoring vector: 14951


### Evaluation of Trained Model
Produces evaluation metrics for the model. The whole-graph text file is optional.

In [11]:
# Files used for evaluation; the "whole" entry is optional.
evaluate_file_names = {
    "test": INPUT_FILE_PATH + TEST_FILE_NAME,
    "whole": INPUT_FILE_PATH + WHOLE_FILE_NAME,
}
all_res = new_alg.evaluate(evaluate_file_names)

# Print one line per metric, ordered by metric name, with the name padded to
# 20 characters so that the values line up.
metric_line = '{:20s}: {}'
for metric in sorted(all_res):
    print(metric_line.format(metric, all_res[metric]))

Running model evaluation
loading whole graph...
Hits@1              : 0.06313588732203619
Hits@1(filter)      : 0.11469248870003894
Hits@10             : 0.21804269438472346
Hits@10(filter)     : 0.29192835740041645
Hits@3              : 0.11916168678370097
Hits@3(filter)      : 0.19607760153036177
MRR                 : 0.1144619222644371
MRR(filter)         : 0.17654824253325588
Hits@1              : 0.06313588732203619
Hits@1(filter)      : 0.11469248870003894
Hits@10             : 0.21804269438472346
Hits@10(filter)     : 0.29192835740041645
Hits@3              : 0.11916168678370097
Hits@3(filter)      : 0.19607760153036177
MRR                 : 0.1144619222644371
MRR(filter)         : 0.17654824253325588
