In [1]:
import os
import json
import time

import numpy as np
import pandas as pd

In [2]:
from abc import ABC, abstractmethod

class KnowledgeGraph(ABC):
    def __init__(self, name=None, verbose = 0):
        self._name = name
        self._verbose = verbose
        
        self._entities = [] # list(string)
        self._relations = [] # list(string)
        # np.array([(head_entity, relation, tail_entity)])
        self._triples = np.zeros(shape=(0,3))
        
        self._built = False
        
    ####### PUBLIC #######
    @property
    def name(self):
        return self._name
    
    @property
    def entities(self):
        return self._entities
    
    @property
    def relations(self):
        return self._relations
    
    @property
    def triples(self):
        return self._triples
    
    def sample(self, k=1, negative=False):
        if negative:
            return self._sample_negative_loose(k)
        else:
            return self._sample_positive(k)
        
    ####### PRIVATE #######
    
    @abstractmethod
    def _build_graph(self):
        """
        A function that builds the graph by reading in the data in
        its current format and populating self._entities, self._relations,
        self._triples, and at the end should set self._built to True.
        """
        pass
    
    @property
    def _is_built(self):
        return self._built
    
    @property
    def _num_entities(self):
        return len(self._entities)
    
    @property
    def _num_relations(self):
        return len(self._relations)
    
    @property
    def _num_triples(self):
        return self._triples.shape[0]
    
    def _validate_graph(self):
        # Make sure properties are filled out
        assert self._built, "The graph is not built. Please build " \
        "or check that your build_graph method sets self._build " \
        "to True after completion"
        
        # Make sure shape of self._triples is [N, 3]
        assert self._triples.shape[1] == 3, "The _triples property" \
        "must have a shape of 3 in the second dimension. " \
        
        # Make sure all head, tail entities and relations are valid
        head_entities = self._triples[:,0]
        assert head_entities.max() <= len(self._entities), "There" \
        "exists an entity in the head entities of the _triples " \
        "property that exceeds the number of available entities." \
        
        tail_entities = self._triples[:,2]
        assert tail_entities.max() <= len(self._entities), "There" \
        "exists an entity in the tail entities of the _triples " \
        "property that exceeds the number of available entities." \
        
        relations = self._triples[:,1]
        assert relations.max() <= len(self._relations), "There" \
        "exists an relations in the _triples " \
        "property that exceeds the number of available relations." \
        
        if self._verbose >= 1:
            print("Graph was successfully validated!")
        
    def _sample_positive(self, k):
        triple_indices = np.random.choice(self._num_triples(), k)
        positive_samples = self._triples[triple_indices]
        
        return positive_samples
    
    def _sample_negative_loose(self, k):
        # TODO(frg100): Make a strict version that makes sure not to
        # add existing triples
        head_entities = np.random.choice(self._num_entities(), k)
        relations = np.random.choice(self._num_relations(), k)
        tail_entities = np.random.choice(self._num_entities(), k)
        
        negative_samples = np.concatenate([head_entities, relations, tail_entities])
        
        return negative_samples
    
    

In [8]:
class FB15k237(KnowledgeGraph):
    def __init__(self, base_path=None, splits=['train', 'test', 'valid'], verbose = 0):
        super().__init__(name='FB15k-237', verbose = verbose)
        
        self._base_path = base_path
        self._splits = splits
        
        self._entity_mapping = None
        
        start = time.time()
        self._build_graph(verbose)
        end = time.time()
        if verbose >= 1:
            print(f"Building the graph took {round(end-start)} seconds")    
        
            
    def _mid2entity(self, mid):
        if self._entity_mapping is None:
            # Load the map
            json_path = os.path.join(self._base_path, "entity2wikidata.json")
            json_file = open(json_path)
            self._entity_mapping = json.load(json_file)
            json_file.close()
            
        if mid not in self._entity_mapping:
            return None
            
        return self._entity_mapping[mid]['label']

    def _build_graph(self, verbose):
        # Initialize data structures for bookkeeping
        entities = set()
        relations = set()
        triples = set()

        num_data_points = sum(sum(1 for line in open(os.path.join(self._base_path, f"{split}.txt"))) for split in self._splits)
        
        # Load data
        for split in self._splits:
            path = os.path.join(self._base_path, f"{split}.txt")
            if verbose >= 1:
                print(f"Loading file {split}.txt")
                
            # Process into entities, relations, and triples
            with open(path, 'r') as f:
                for line in f:
                    # Check progress
                    last_percent_done = round((100*(self._num_triples-1))/num_data_points)
                    percent_done = round((100*self._num_triples)/num_data_points)
                    if verbose >= 2 and percent_done % 5 == 0 and last_percent_done % 5 != 0:
                        print(f"Data loading progress: [{percent_done}%]")
                    
                    # Initialize data
                    head, relation, tail = line.split()
                    head_id, relation_id, tail_id = None, None, None
                    
                    # If either of the entities has no natural language translation,
                    if not self._mid2entity(head) or not self._mid2entity(tail):
                        # Don't process it
                        continue
                    
                    if verbose >= 3 and percent_done % 5 == 0 and last_percent_done % 5 != 0:
                        print(f"{self._mid2entity(head)} {relation} {self._mid2entity(tail)}")
                    
                    # Process head
                    if head not in entities:
                        entities.add(head)
                        head_id = len(self._entities)
                        self._entities.append(head)
                    else:
                        head_id = self._entities.index(head)
                     
                    # Process tail
                    if tail not in entities:
                        entities.add(tail)
                        tail_id = len(self._entities)
                        self._entities.append(tail)
                    else:
                        tail_id = self._entities.index(tail)
                        
                    # Process relation
                    if relation not in relations:
                        relations.add(relation)
                        relation_id = len(self._relations)
                        self._relations.append(relation)
                    else:
                        relation_id = self._relations.index(relation)

                    # Create and add triple
                    triple = np.array([[head_id, relation_id, tail_id]], dtype=np.int32)  
                    if self._num_triples == 0:
                        self._triples = triple
                    else:
                        self._triples = np.append(self._triples, triple, axis=0)
        
        # Build and validate
        self._built = True
        self._validate_graph()

In [9]:
graph = FB15k237(base_path='./data/FB15k-237', splits=['train', 'valid','test'], verbose=2)

Loading file train.txt
Data loading progress: [5%]
Data loading progress: [10%]
Data loading progress: [15%]
Data loading progress: [20%]
Data loading progress: [25%]
Data loading progress: [30%]
Data loading progress: [35%]
Data loading progress: [40%]
Data loading progress: [45%]
Data loading progress: [50%]
Data loading progress: [55%]
Data loading progress: [60%]
Data loading progress: [65%]
Data loading progress: [70%]
Data loading progress: [75%]
Data loading progress: [80%]
Data loading progress: [85%]
Loading file valid.txt
Data loading progress: [90%]
Loading file test.txt
Data loading progress: [95%]
Graph was successfully validated!
Building the graph took 56 seconds


In [21]:
base_path='./data/FB15k-237'

mapping = {}

# Load file if it exists
json_path = os.path.join(base_path, "relation_mapping.json")

if os.path.exists(json_path):
    json_file_read = open(json_path, 'r')
    mapping = json.load(json_file_read)
    json_file_read.close()


# for rel in graph.relations:
#     if rel not in mapping:
#         relations_done = len(mapping.keys())
#         relations_total = len(graph.relations)
#         relations_left = relations_total - relations_done
#         print(f"[{round(100*(relations_done/relations_total), 2)}%] done describing relations ({relations_left} left)")
        
#         instance_idx = np.where(graph.triples[:, 1] == graph.relations.index(rel))[0][0]
#         head, relation, tail = graph.triples[instance_idx]
#         head, tail = graph._mid2entity(graph.entities[head]), graph._mid2entity(graph.entities[tail])
        
#         format_string = input(f"{head} {rel} {tail}: ")
#         mapping[rel] = format_string
#         json_file_write = open(json_path, 'w')
#         json.dump(mapping, json_file_write)
#         json_file_write.close()
        
    

In [None]:
# Generate the dataset using the functions


In [144]:
def see_relation_examples(relation, k = 1):
    instance_indices = np.where(graph.triples[:, 1] == graph.relations.index(rel))[0][:k]
    samples = graph.triples[instance_indices]
    for sample in samples:
        h, r, t = sample
        print(graph._mid2entity(graph.entities[h]), graph.relations[r], graph._mid2entity(graph.entities[t]))
    
see_relation_examples('/award/award_winner/awards_won./award/award_honor/award_winner', k=5)

Michelle Rodriguez /award/award_winner/awards_won./award/award_honor/award_winner Naveen Andrews
Scott Rudin /award/award_winner/awards_won./award/award_honor/award_winner Alan Bennett
Don Cheadle /award/award_winner/awards_won./award/award_honor/award_winner Larenz Tate
Freddy Rodriguez /award/award_winner/awards_won./award/award_honor/award_winner Justina Machado
Vincent Pastore /award/award_winner/awards_won./award/award_honor/award_winner Michael Imperioli


In [16]:
len(graph.relations)

235