In [12]:
import numpy as np
import pickle
import json
import os
import sys
import copy
import sklearn.preprocessing
import models
from sklearn.neighbors import NearestNeighbors
import utils

In [13]:
# Directory the training file is read from ("train.txt" is joined onto it below).
DATA_DIR = "../../data/fb15k/"
# Path handed to both utils.load_pickle and models.TypedDM in the cells below.
DUMP_FILE = "../dumps/fb15k_distmult_dump_norm.pkl"
# NOTE(review): MODEL_TYPE is not referenced by any cell visible in this notebook.
MODEL_TYPE = "distmult"

In [14]:
# Read the training split; the structure of the result is defined by utils.read_data.
train_data = utils.read_data(os.path.join(DATA_DIR,"train.txt"))
# The validation/test splits are not used by any cell visible in this notebook, so
# loading them stays disabled.
# dev_data = utils.read_data(os.path.join(DATA_DIR,"valid.txt"))
# test_data = utils.read_data(os.path.join(DATA_DIR,"test.txt"))

In [15]:
# Load the dump; it is passed to utils.map_data, and its 'relation_to_id' entry is
# used later to count relations and to recover relation names.
dump=utils.load_pickle(DUMP_FILE)
# Display the available keys as a quick sanity check.
dump.keys()

dict_keys(['tail_rel_type', 'entity_type', 'entity_to_id', 'head_rel_type', 'entity_real', 'rel_real', 'relation_to_id'])

In [16]:
# Build the model object from the same dump file. Later cells use its relation_matrix,
# similarity_relembedding and dot_relation members.
model=models.TypedDM(DUMP_FILE)

In [17]:
# Map the training data with the help of the dump.
# NOTE(review): presumably this replaces entity/relation names by their ids using the
# dump's *_to_id maps — confirm in utils.map_data.
mapped_train_data = utils.map_data(train_data,dump)
# Disabled together with the dev/test loading above.
# mapped_dev_data = utils.map_data(dev_data,dump)
# mapped_test_data = utils.map_data(test_data,dump)

In [60]:
# Lookup built from the mapped training data. The mining loops below test an entity
# pair for membership in it and iterate over the relation ids stored under that pair.
entity_to_rel=utils.get_ent_to_rel(mapped_train_data)

# Length 1 Rules

In [95]:
# Neighbour index over the rows of model.relation_matrix, 500 neighbours per query,
# using the model's own callable as the metric.
# NOTE(review): scikit-learn interprets a callable metric as a distance (smaller means
# closer). Confirm that similarity_relembedding returns a distance and not a similarity
# score, otherwise kneighbors yields the least similar relations.
nbrs = NearestNeighbors(n_neighbors=500,metric=model.similarity_relembedding).fit(model.relation_matrix)
relation_count=len(dump['relation_to_id'])
print(relation_count)
# Both results are indexed by relation id in the loop below: count_r[r] is compared
# with `support`, and set_r[r] is iterated as a collection of entity pairs.
count_r,set_r=utils.get_relation_dict(mapped_train_data)
print(len(count_r),len(set_r))
# Relations whose count_r value is below this threshold are skipped by the mining loop.
support=5

1345
1345 1345


In [96]:
# Length-1 rule candidates: for every relation r1 with enough support, count how often
# each neighbouring relation r2 (as returned by nbrs) also holds on an entity pair of r1.
rules_dict_1 = {}
count = 0   # every (entity pair, relation) membership examined
count2 = 0  # memberships whose relation is one of r1's neighbours
for r1 in range(relation_count):
    if r1 % 100 == 0:
        print(r1)  # progress marker
    if count_r[r1] < support:
        continue

    query = model.relation_matrix[r1].reshape((1, -1))
    # kneighbors returns (distances, indices); only the index row of the single query is needed.
    neighbour_ids = nbrs.kneighbors(query)[1][0]

    # Lookup table flagging the neighbours of r1; r1 itself is never flagged.
    is_neighbour = [False] * relation_count
    for idx in neighbour_ids:
        if idx != r1:
            is_neighbour[idx] = True

    hits = {}
    for ent_pair in set_r[r1]:
        if ent_pair in entity_to_rel:
            for r2 in entity_to_rel[ent_pair]:
                count += 1
                if is_neighbour[r2]:
                    count2 += 1
                    hits[r2] = hits.get(r2, 0) + 1
    rules_dict_1[r1] = hits

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


In [98]:
# Number of relations that passed the support filter and therefore received an entry.
print(len(rules_dict_1))
# count: relation memberships examined; count2: those that fell inside the neighbour set.
print(count,count2)

992
732202 204


# Length 2 Rules

In [33]:
# index_head is only passed on to utils.get_r1r2_count here.
index_head=utils.get_head_index(mapped_train_data)
# Both results are keyed by (r1, r2) relation-id pairs in the loop below:
# count_r1_r2[pair] is compared with `support`, set_r1_r2[pair] is iterated as entity pairs.
count_r1_r2,set_r1_r2=utils.get_r1r2_count(mapped_train_data,index_head,get_set=True)

In [34]:
# Settings for the length-2 search: 100 neighbours per query and a support threshold of 10.
# Caution: this rebinds `nbrs` and `support`, which the Length 1 cells also use (there with
# n_neighbors=500 and support=5). Re-running the length-1 loop after this cell without
# re-running its own setup cell would silently pick up these values.
nbrs = NearestNeighbors(n_neighbors=100,metric=model.similarity_relembedding).fit(model.relation_matrix)
support=10

In [35]:
# Length-2 rule candidates: for every ordered pair of distinct relations (r1, r2) with
# enough support, count how often each relation close to model.dot_relation(r1, r2)
# (as returned by nbrs) holds on an entity pair recorded for that relation pair.
count = 0  # relation pairs that passed the support filter
rules_dict = {}
for r1 in range(relation_count):
    if r1 % 100 == 0:
        print(r1)  # progress marker
    for r2 in range(relation_count):
        pair = (r1, r2)
        if r1 == r2 or pair not in count_r1_r2 or count_r1_r2[pair] < support:
            continue

        composed = model.dot_relation(r1, r2).reshape((1, -1))
        # kneighbors returns (distances, indices); only the index row of the single query is needed.
        neighbour_ids = nbrs.kneighbors(composed)[1][0]
        count += 1

        # Lookup table flagging every relation id returned as a neighbour.
        is_neighbour = [False] * relation_count
        for idx in neighbour_ids:
            is_neighbour[idx] = True

        hits = {}
        for ent_pair in set_r1_r2[pair]:
            if ent_pair in entity_to_rel:
                for r in entity_to_rel[ent_pair]:
                    if is_neighbour[r]:
                        hits[r] = hits.get(r, 0) + 1
        rules_dict[pair] = hits
        

0
100
200


KeyboardInterrupt: 

In [None]:
# Summary of the length-2 mining loop. Note that the recorded run of that loop ended in
# a KeyboardInterrupt, so these numbers describe a partial result unless it is re-run.
print(len(rules_dict))
print(relation_count)
# count is incremented once per (r1, r2) pair that passed the support filter.
print(count)

# Saving rules

In [81]:
# Persist the length-1 rule counts (relation id -> {neighbour relation id: count}).
path1='rules_1.pkl'
utils.dump_pickle(rules_dict_1,path1)

In [138]:
path2='rules_2.pkl'
# Persist the length-2 rule counts ((r1, r2) -> {relation id: count}) to their own file.
# The target must be path2: the name `path` is bound to 'set_r1_r2.pkl' by the
# auxiliary-data cell, so dumping to it would overwrite that file with the rules
# (or raise NameError on a fresh kernel, where `path` does not exist yet).
utils.dump_pickle(rules_dict,path2)

## Saving auxiliary data for rule2

In [44]:
# Store only the size of each (r1, r2) entity-pair set, not the sets themselves.
path='set_r1_r2.pkl'
set_len_r1_r2 = {r1r2: len(set_r1_r2[r1r2]) for r1r2 in set_r1_r2}
utils.dump_pickle(set_len_r1_r2, path)

## Reading and Analysing rules

In [None]:
def get_inverse_map(mymap):
    """Return a dict mapping each value of ``mymap`` back to its key.

    If two keys share the same value the mapping cannot be inverted: a message
    naming the repeated value is printed and ``None`` is returned.
    """
    inverse = {}
    for key, value in mymap.items():
        if value not in inverse:
            inverse[value] = key
            continue
        # Duplicate value: report it and give up.
        print("Error, " + str(value) + " is repeated ")
        return None
    return inverse

In [None]:
# Load the per-rule scores and frequencies used by the report below.
# NOTE(review): neither file is written by a cell visible in this notebook — confirm
# which step produces them.
# pickle.load can execute arbitrary code, so only open files from a trusted source.
with open("rules_scores_train.pkl","rb") as f:
    rules_scores = pickle.load(f)
with open("rules_freq_train.pkl","rb") as f:
    rules_freq = pickle.load(f)

In [None]:
# List of (rule, score) items ordered by score, highest first.
sorted_rules_map = sorted(rules_scores.items(), key=lambda kv: kv[1], reverse=True) 

In [None]:
# id -> relation name, used to print readable rules. get_inverse_map returns None
# (after printing a message) if two names share an id.
rules_inv_map = get_inverse_map(dump['relation_to_id'])

In [None]:
# NOTE(review): mine_rules is neither defined nor imported in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel unless it is provided elsewhere.
# write_rules only takes len(relation_keyed_data[relation_id]); set_r from
# utils.get_relation_dict may be a suitable source — confirm before substituting.
relation_keyed_data = mine_rules(mapped_train_data)

In [None]:
def write_rules(filename,freq_thr):
    """Write the scored rules to ``filename`` as ';'-separated text.

    Rules are taken from the global ``sorted_rules_map`` in its existing order.
    A rule ``(rprime, r)`` is written only if ``rules_freq[(rprime, r)]`` is at
    least ``freq_thr``. Each row holds the score, the rule frequency, the union
    frequency (head size + tail size - rule frequency), the sizes of
    ``relation_keyed_data[rprime]`` and ``relation_keyed_data[r]``, and the two
    relation names looked up in ``rules_inv_map``.

    Args:
        filename: path of the text file to create or overwrite.
        freq_thr: minimum rule frequency required for a rule to be written.
    """
    header = "Score ; Freq_Intersection ; Freq_Union; Freq_Head; Freq_Tail ; Rule Head ; Rule Tail\n"
    row_format = "%f ; %d ; %d ; %d; %d ; %s ; %s\n"
    with open(filename,"w") as out:
        out.write(header)
        for (rprime, r), score in sorted_rules_map:
            freq = rules_freq[(rprime, r)]
            if freq < freq_thr:
                continue
            head_freq = len(relation_keyed_data[rprime])
            tail_freq = len(relation_keyed_data[r])
            union_freq = head_freq + tail_freq - freq
            out.write(row_format % (
                score, freq, union_freq, head_freq, tail_freq,
                rules_inv_map[rprime], rules_inv_map[r]))

In [None]:
# Export every rule whose frequency in rules_freq is at least 100.
write_rules("rules_mined.txt",100)