# ANALOGY
## Demonstration Jupyter Notebook

In [None]:
import os
import sys

# Make the directory three levels above the working directory importable so
# that the project-local `analogy` module can be found.
# os.pardir components joined with os.path.join use the platform's own
# separator, so no per-OS branch is needed and no backslash string literal
# (whose "\." is an invalid escape sequence) has to be written.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))

# Avoid adding a duplicate entry when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)
    
from analogy import ANALOGY

# Data directory. Keep the trailing separator: file names are appended to this
# path by plain string concatenation. The model file lives in the same
# directory, so both names are bound to the same string.
INPUT_FILE_PATH = MODEL_FILE_PATH = "D:\\USC\\CS548\\groupdat\\FB15k\\"

# File name used when saving and loading the model.
MODEL_FILE_NAME = "analogy.mod"

# Entity and relation vocabulary files.
ENTITIES_FILE_NAME = "entity2id.txt"
RELATIONS_FILE_NAME = "relation2id.txt"

# Data-set files.
TRAIN_FILE_NAME = "train.txt"
VALIDATION_FILE_NAME = "valid.txt"
TEST_FILE_NAME = "test.txt"
WHOLE_FILE_NAME = "whole.txt"

### Read Data Set - This must be run before executing learn_embeddings
The validation and whole-text files are optional.

In [None]:
algorithm = ANALOGY()

# (role, file name) pairs handed to read_dataset. The "valid" and "whole"
# entries are optional; "valid" is left out here. To include it, add
# ("valid", VALIDATION_FILE_NAME) after the "train" pair.
dataset_files = (
    ("train", TRAIN_FILE_NAME),
    ("whole", WHOLE_FILE_NAME),
    ("relations", RELATIONS_FILE_NAME),
    ("entities", ENTITIES_FILE_NAME),
)

# Every file sits in INPUT_FILE_PATH, so prefix each name with the directory.
train_file_names = {role: INPUT_FILE_PATH + file_name
                    for role, file_name in dataset_files}

algorithm.read_dataset(train_file_names)


### Learn Embeddings - Data Set Must be Read First
This should take several minutes per epoch when using the time-minimizing hyper-parameters below, with no filtered evaluation and no validation.

In [None]:
# Hyper-parameters for the demonstration run. Several values are deliberately
# small so that training finishes quickly.
parameters = {
    "mode": 'single',
    "epoch": 3,
    "batch": 128,
    "lr": 0.05,
    "dim": 40,           # reduced from 200 to save processing time
    "negative": 1,       # reduced from 5 to save processing time
    "opt": 'adagrad',
    "l2_reg": 0.001,
    "gradclip": 5,
    # Filtered output is turned off to save processing time. The value was
    # previously True, which contradicted this stated intent.
    "filtered": False,
}

algorithm.learn_embeddings(parameters)


### Save Model - Optional

In [None]:
# Full path of the model file: directory (with trailing separator) + file name.
model_file = MODEL_FILE_PATH + MODEL_FILE_NAME
algorithm.save_model(model_file)


### Load Model - Optional
This loads a previously saved model for evaluation or other purposes. It is not required if the current instance of ANALOGY has already loaded a dataset and learned the embeddings for that dataset.

In [None]:
# Load the saved model into a separate ANALOGY instance, leaving `algorithm`
# untouched.
saved_model_file = MODEL_FILE_PATH + MODEL_FILE_NAME
new_alg = ANALOGY()
new_alg.load_model(saved_model_file)


### Add Some Test Data

In [None]:
# Each row is one (subject, relation, object) test triplet.
test_triples = [
    ('/m/08mbj32',
     '/location/statistical_region/religions./location/religion_percentage/religion',
     '/m/0631_'),
    ('/m/08mbj5d',
     '/military/military_conflict/combatants./military/military_combatant_group/combatants',
     '/m/0d060g'),
    ('/m/08mg_b',
     '/award/award_category/winners./award/award_honor/ceremony',
     '/m/01bx35'),
]

# Split the triplets column-wise into the three parallel lists used below.
test_subs, test_rels, test_obs = (list(column) for column in zip(*test_triples))


### Print Embeddings for sub, rel, object
Each entity and relation has 3 embeddings. Two of them are used for the ComplEx part of the hybrid score measure (one is a vector of real numbers and the other a vector of imaginary numbers). The third is used for the DistMult part of the hybrid score measure, which only has a real number.

The output of each entity/relation retrieval function is a tuple in which each element holds an array of embeddings for the input list. The elements are, in order: the real part of the complex number (ComplEx), the imaginary part of the complex number (ComplEx), and the real number (DistMult).

The length of each embedding vector will be equal to the number of dimensions used as a hyper-parameter to the training model divided by 2 (since the dimensions are split between the complex-valued ComplEx part and the real-valued DistMult part).

In [None]:
# Subjects: fetch the embeddings, report the length of the first embedding
# vector, then print everything that was returned.
subs = new_alg.retrieve_entity_embeddings(test_subs)
embedding_length = len(subs[0][0])
print("Length of embedding vector: {}, should equal number of dimensions of model assumption/2".format(embedding_length))
# sep="\n" puts the label and the values on separate lines, exactly as two
# consecutive print calls would.
print("Subjects:", subs, sep="\n")

# Relations are looked up through their own retrieval function.
rels = new_alg.retrieve_relations_embeddings(test_rels)
print("Relations:", rels, sep="\n")

# Objects are entities, so they use the same retrieval function as subjects.
objs = new_alg.retrieve_entity_embeddings(test_obs)
print("Objects:", objs, sep="\n")

### Scoring Matrix for Test Data
Length of each scoring vector (there are 3 for the above test data, one for each s,r,o triplet given) is equal to the number of entities in the vocabulary. For example, for FB15k this is 14,951. 


In [None]:
# Query the loaded model (`new_alg`), as the embedding-retrieval and
# evaluation cells do, so that this cell also works when a saved model was
# loaded without re-training in the current session.
sm = new_alg.retrieve_scoring_matrix(test_subs, test_rels)
print(sm)
# One scoring vector is returned per test (subject, relation) pair.
print("Number of test triplets in data: {}".format(len(sm)))
print("Length of a scoring vector: {}".format(len(sm[0])))

### Evaluation of Trained Model
Produces evaluation metrics for the model; the whole-text file is optional.

In [None]:
# Files used for evaluation; the "whole" entry is optional.
evaluate_file_names = {
    role: INPUT_FILE_PATH + file_name
    for role, file_name in (("test", TEST_FILE_NAME),
                            ("whole", WHOLE_FILE_NAME))
}
all_res = new_alg.evaluate(evaluate_file_names)

# Print the metrics in alphabetical order of their names.
metric_names = list(all_res.keys())
metric_names.sort()
for name in metric_names:
    print('{:20s}: {}'.format(name, all_res[name]))