In [1]:
import random
import torch
from torch import nn
from torch.autograd import Variable
#graph = [
#    ('seth rogan', 'acted_in', 'the interview'),
#    ('seth rogan', 'acted_in', 'this is the end'),
#    ('seth rogan', 'acted_in', 'pineapple express'),
#    ('james franco', 'acted_in', 'the interview'),
#    ('james franco', 'acted_in', 'this is the end'),
#    ('will smith', 'acted_in', 'irobot'),
#    ('will smith', 'acted_in', 'independence day'),
#    ('tom cruise', 'acted_in', 'mission impossible'),
#]
# Toy knowledge graph used for training and evaluation below.
# Each fact is a (source, relation, target) triple of strings; the two
# relations present are 'acted_in' (actor -> movie) and 'is_genre'
# (movie -> genre).
graph = [
    ('tom cruise', 'acted_in', 'oblivion'),
    ('tom cruise', 'acted_in', 'tropic thunder'),
    ('tom cruise', 'acted_in', 'mission impossible'),
    ('jack black', 'acted_in', 'tropic thunder'),
    ('ben stiller', 'acted_in', 'dodgeball'),
    ('dodgeball', 'is_genre', 'comedy'),
    ('tropic thunder', 'is_genre', 'comedy'),
    ('mission impossible', 'is_genre', 'action'),
    ('oblivion', 'is_genre', 'action'),
]


In [2]:
class KnowledgeGraphEmbeddings(nn.Module):
    """Learned vector embeddings for the entities and relations of a graph.

    A fact ``(source, relation, target)`` is scored as the Euclidean distance
    between ``embedding(source) + embedding(relation)`` and
    ``embedding(target)``; a lower score means the fact fits the embeddings
    better. ``forward`` returns a margin hinge loss that compares one known
    fact against one randomly sampled triple that is not in the graph.

    Attributes:
        entities: unique entity names, in first-seen order.
        relations: unique relation names, in first-seen order.
        entities2id / relations2id: name -> row index in the embedding table.
        entity_embeddings / relation_embeddings: the ``nn.Embedding`` tables.
        graph: the facts passed to the constructor (stored as given).
    """

    def __init__(self, graph, embedding_size = 3):
        """Build the vocabularies and embedding tables from ``graph``.

        Args:
            graph: iterable of facts; each fact is indexable with
                ``fact[0]`` = source, ``fact[1]`` = relation,
                ``fact[2]`` = target.
            embedding_size: dimensionality of every embedding vector.
        """
        super(KnowledgeGraphEmbeddings, self).__init__()

        entities = []
        relations = []
        known_facts = set()
        for fact in graph:
            source, relation, target = fact[0], fact[1], fact[2]
            entities.append(source)
            entities.append(target)
            relations.append(relation)
            known_facts.add((source, relation, target))

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # name -> id mapping is the same on every run (iteration order of a
        # set of strings varies between interpreter sessions).
        self.entities = list(dict.fromkeys(entities))
        self.relations = list(dict.fromkeys(relations))
        self.entities2id = { ent:i for i, ent in enumerate(self.entities) }
        self.relations2id = { rel:i for i, rel in enumerate(self.relations) }

        self.entity_embeddings = nn.Embedding(len(self.entities2id), embedding_size)
        self.relation_embeddings = nn.Embedding(len(self.relations2id), embedding_size)
        self.graph = graph
        # Snapshot of the facts as tuples for O(1) membership tests while
        # sampling negatives. It reflects the graph at construction time.
        self._known_facts = known_facts

    def _distance(self, source_id, relation_id, target_id):
        """Return ``||e[source] + r[relation] - e[target]||_2`` as a 0-dim tensor."""
        device = self.entity_embeddings.weight.device
        source_idx = torch.tensor([source_id], dtype=torch.long, device=device)
        relation_idx = torch.tensor([relation_id], dtype=torch.long, device=device)
        target_idx = torch.tensor([target_id], dtype=torch.long, device=device)

        source_embedding = self.entity_embeddings(source_idx)
        # Relations have their own table; looking them up in the entity table
        # would silently reuse entity vectors and leave this table untrained.
        relation_embedding = self.relation_embeddings(relation_idx)
        target_embedding = self.entity_embeddings(target_idx)
        return torch.dist(source_embedding + relation_embedding, target_embedding)

    def positive(self, fact):
        """Score a given fact.

        Args:
            fact: ``(source, relation, target)``; all three names must be in
                the vocabularies built by the constructor.

        Returns:
            0-dim tensor holding the distance score (lower = better fit).

        Raises:
            KeyError: if an entity or relation name is unknown.
        """
        source, relation, target = fact[0], fact[1], fact[2]
        return self._distance(
            self.entities2id[source],
            self.relations2id[relation],
            self.entities2id[target],
        )

    def negative(self):
        """Score one uniformly sampled triple that is not a known fact.

        Returns:
            0-dim tensor holding the distance score of the sampled triple.

        Raises:
            ValueError: if every possible (entity, relation, entity) triple is
                already a fact (including the empty-graph case), since
                rejection sampling could then never terminate.
        """
        n_entities = len(self.entities)
        n_relations = len(self.relations)
        if len(self._known_facts) >= n_entities * n_entities * n_relations:
            raise ValueError('no negative fact exists: every possible triple is in the graph')

        # Sample until we find an invalid fact
        while True:
            random_source_id = random.randint(0, n_entities - 1)
            random_relation_id = random.randint(0, n_relations - 1)
            random_target_id = random.randint(0, n_entities - 1)

            candidate = (
                self.entities[random_source_id],
                self.relations[random_relation_id],
                self.entities[random_target_id],
            )
            # Check against this model's own facts, not a notebook global.
            if candidate not in self._known_facts:
                break

        return self._distance(random_source_id, random_relation_id, random_target_id)

    def forward(self, fact, margin=1.0):
        """Margin hinge loss for one known fact versus one sampled negative.

        Args:
            fact: a known ``(source, relation, target)`` triple.
            margin: required gap between the negative and the positive score.

        Returns:
            Tensor of shape ``(1,)``:
            ``max(positive_score - negative_score + margin, 0)``.
        """
        positive_score = self.positive(fact)
        negative_score = self.negative()
        hinge = (positive_score - negative_score + margin).sum()
        # new_zeros(1) keeps dtype/device in sync with the scores and keeps
        # the (1,)-shaped result callers index with [0].
        loss = torch.max(hinge, hinge.new_zeros(1))

        return loss

    def score(self, fact):
        """Alias of :meth:`positive`: distance score of ``fact`` (lower = better)."""
        return self.positive(fact)
        

In [3]:
def train(fact, embedding_model, optimizer):
    """Run repeated optimisation steps on a single fact and return the mean loss.

    The number of steps equals the number of possible
    (entity, relation, entity) triples, i.e.
    ``len(entities) ** 2 * len(relations)``; each step calls the model on
    ``fact`` again (the model draws a fresh negative sample per call).

    Args:
        fact: ``(source, relation, target)`` triple passed to the model.
        embedding_model: callable module exposing ``entities``, ``relations``
            and ``zero_grad()``; calling it returns a single-element loss.
        optimizer: optimizer over the model's parameters.

    Returns:
        Mean loss over the steps as a Python float (``0.`` if no step ran).
    """
    n_entities = len(embedding_model.entities)
    n_steps = n_entities * n_entities * len(embedding_model.relations)
    if n_steps == 0:
        return 0.

    total_loss = 0.
    for _ in range(n_steps):
        embedding_model.zero_grad()
        loss = embedding_model(fact)
        loss.backward()
        optimizer.step()
        # .item() works for both 0-dim and (1,)-shaped losses and detaches
        # the value from the autograd graph.
        total_loss += loss.item()
    # Average over the steps actually taken, not a hard-coded count.
    return total_loss / n_steps

In [4]:
# Fit the embeddings: every epoch visits each fact once and records the
# summed per-fact loss returned by train().
embedding_model = KnowledgeGraphEmbeddings(graph)
optimizer = torch.optim.Adam(embedding_model.parameters(), lr=0.001)

epochs = 50
losses = []
for _epoch in range(epochs):
    epoch_loss = sum(
        (train(fact, embedding_model, optimizer) for fact in graph),
        0.,
    )
    losses.append(epoch_loss)

KeyboardInterrupt: 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Loss curve: one point per epoch, taken from the `losses` list.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(losses)
ax.set_xlabel('epoch')
ax.set_ylabel('loss');

In [None]:
#print(embedding_model.score(('james franco', 'acted_in', 'the interview')))
#print(embedding_model.score(('james franco', 'acted_in', 'pineapple express')))
#print(embedding_model.score(('james franco', 'acted_in', 'irobot')))
#print(embedding_model.score(('james franco', 'acted_in', 'mission impossible')))

# Score 'ben stiller acted_in <movie>' for each movie (lower = better fit).
for movie in ('dodgeball', 'tropic thunder', 'mission impossible', 'oblivion'):
    print(embedding_model.score(('ben stiller', 'acted_in', movie)))