# 基于wordnet构建英语知识图谱

In [1]:
import nltk
from nltk.corpus import wordnet

## 1. 实体和关系抽取

### 1.1 Helpers

In [2]:
def index_relationship(start, end, rel_type):
    '''
    索引关系
    '''
    relationship_index.setdefault(start, {})
    relationship_index[start].setdefault(end, {})
    relationship_index[start][end] = rel_type

In [3]:
def add_relationship(start, end, rel_type):
    '''
    添加关系，移除重复关系
    '''
    if (start in relationship_index and end in relationship_index[start] and rel_type in relationship_index[start][end]) or \
       (end in relationship_index and start in relationship_index[end] and rel_type in relationship_index[end][start]):
        pass
    else:
        index_relationship(start, end, rel_type)
        index_relationship(end, start, rel_type)
        relationships.append([start, end, rel_type])

In [4]:
def extract_relationships(synset):
    '''
    同义词集实体之间的关系抽取，
    '''
    # 上位概念（名词，动词）
    for related_node in synset.hypernyms():
        add_relationship(synset.name(), related_node.name(), 'IsA')
    # 下位概念（名词，动词）
    for related_node in synset.hyponyms():
        add_relationship(related_node.name(), synset.name(), 'IsA')
    # 上位整体概念（名词）
    for related_node in synset.member_holonyms():
        add_relationship(synset.name(), related_node.name(), 'PartOf')
    for related_node in synset.substance_holonyms():
        add_relationship(synset.name(), related_node.name(), 'PartOf')
    for related_node in synset.part_holonyms():
        add_relationship(synset.name(), related_node.name(), 'PartOf')
    # 下位部件概念（名词）
    for related_node in synset.member_meronyms():
        add_relationship(related_node.name(), synset.name(), 'PartOf')
    for related_node in synset.substance_meronyms():
        add_relationship(related_node.name(), synset.name(), 'PartOf')
    for related_node in synset.part_meronyms():
        add_relationship(related_node.name(), synset.name(), 'PartOf')
    # 主题域（名词，动词）
    # topic_domains
    for related_node in synset.topic_domains():
        add_relationship(synset.name(), related_node.name(), 'Domain')
    # region_domains
    for related_node in synset.region_domains():
        add_relationship(synset.name(), related_node.name(), 'Domain')
    # usage_domains
    for related_node in synset.usage_domains():
        add_relationship(synset.name(), related_node.name(), 'Domain')
    # 属性
    for related_node in synset.attributes():
        add_relationship(synset.name(), related_node.name(), 'Attribute')
    # 因果
    for related_node in synset.causes():
        add_relationship(synset.name(), related_node.name(), 'Cause')
    # similar_tos
    for related_node in synset.similar_tos():
        add_relationship(synset.name(), related_node.name(), 'SimilarTo')
    # 反义（形容词）
    for lemma in synset.lemmas():
        for related_node in lemma.antonyms():
            add_relationship(synset.name(), related_node.synset().name(), 'Antonym')
    # entailment（动词）
    for entailment in synset.entailments():
        add_relationship(synset.name(), entailment.name(), 'Entailment')

### 1.2 工具类

In [5]:
class SynsetNode:
    '''
    同义词集概念节点
    '''
    def __init__(self, id, pos, definition):
        self._label = 'Synset'
        self._id = id
        self._pos = pos
        self._definition = definition

    def __repr__(self):
        return "SynsetNode({})".format(self._id)

    @property
    def get_id(self):
        return self._id

    @property
    def get_pos(self):
        return self._pos

    @property
    def get_definition(self):
        return self._definition

    @property
    def get_label(self):
        return self._label

    def get_row(self):
        return [
            self.get_id, self.get_pos, self.get_definition,
            self.get_label
        ]

In [6]:
class WordNode:
    '''
    词汇节点
    '''
    def __init__(self, id, name, pos):

        self._label = "Lemma"
        self._id = id
        self._name = name
        self._pos = pos

    def __repr__(self):
        return "WordNode({})".format(self._id)

    @staticmethod
    def get_header():
        return ['id:ID', 'name', 'pos', ':LABEL']

    def get_row(self):
        return [self._id, self._name, self._pos, self._label]

    @property
    def get_id(self):
        return self._id

### 1.3 抽取实体和关系

In [7]:
def extract_lemmas(synset):
    '''
    抽取词汇实体Lemma(WordNode)
    建立Lemma(WordNode)实体和Synset(SynsetNode)实体之间InSynset关系
    '''
    for lemma in synset.lemmas():
        id = ('%s.%s' % (lemma.name().lower(), synset.pos())).lower()
        if id not in visited_ids:
            visited_ids.add(id)
        lemmas.append(WordNode(id, lemma.name().lower(), synset.pos()))
        add_relationship(id, synset.name(), 'InSynset')

In [8]:
relationship_index = {}
relationships = []  # 关系
synsets = []  # 同义词集概念节点构造的实体
lemmas = []  # 词汇构造的实体
visited_ids = set()

In [9]:
all_synsets = list(nltk.corpus.wordnet.all_synsets())

In [10]:
for i, synset in enumerate(all_synsets):
    # 实体抽取-同义词集概念节点Concept Node
    synsets.append(SynsetNode(synset.name(), synset.pos(), synset.definition()))
    # 概念关系抽取
    extract_relationships(synset)
    if i % 10000 == 0:
        print(f'{i} Synsets extracted')
    
for i, synset in enumerate(all_synsets):
    extract_lemmas(synset)
    if i % 10000 == 0:
        print("Extracted lemmas for {} Synsets".format(i))

0 Synsets extracted
10000 Synsets extracted
20000 Synsets extracted
30000 Synsets extracted
40000 Synsets extracted
50000 Synsets extracted
60000 Synsets extracted
70000 Synsets extracted
80000 Synsets extracted
90000 Synsets extracted
100000 Synsets extracted
110000 Synsets extracted
Extracted lemmas for 0 Synsets
Extracted lemmas for 10000 Synsets
Extracted lemmas for 20000 Synsets
Extracted lemmas for 30000 Synsets
Extracted lemmas for 40000 Synsets
Extracted lemmas for 50000 Synsets
Extracted lemmas for 60000 Synsets
Extracted lemmas for 70000 Synsets
Extracted lemmas for 80000 Synsets
Extracted lemmas for 90000 Synsets
Extracted lemmas for 100000 Synsets
Extracted lemmas for 110000 Synsets


In [11]:
# 关系数，词汇实体数，同义词集实体数
len(relationships), len(lemmas), len(synsets)

(342950, 206978, 117659)

In [12]:
# 随机输出10个关系
import random
for i in [random.randint(0, len(relationships)-1) for _ in range(10)]:
    print(relationships[i])

['text_editor.n.02', 'editor_program.n.01', 'IsA']
['dwarf_sperm_whale.n.01', 'toothed_whale.n.01', 'IsA']
['thomson.n', 'thomson.n.04', 'InSynset']
['chaplainship.n', 'chaplaincy.n.01', 'InSynset']
['climacteric.n.01', 'biological_time.n.01', 'IsA']
['hilliness.n', 'hilliness.n.01', 'InSynset']
['ebony.n', 'ebony.n.02', 'InSynset']
['oversimplification.n.02', 'simplification.n.01', 'IsA']
['political_correctness.n.01', 'political_incorrectness.n.01', 'Antonym']
['forgettable.a.01', 'unmemorable.s.01', 'SimilarTo']


In [13]:
# 随机输出10个同义词集概念实体
for i in [random.randint(0, len(synsets)-1) for _ in range(10)]:
    print(synsets[i])

SynsetNode(hum.v.04)
SynsetNode(lox.n.02)
SynsetNode(liter.n.01)
SynsetNode(fruit_bat.n.01)
SynsetNode(noblesse.n.02)
SynsetNode(ablaut.n.01)
SynsetNode(eastern_chimpanzee.n.01)
SynsetNode(cheat.v.03)
SynsetNode(labyrinthine_vein.n.01)
SynsetNode(bourgogne.n.01)


In [14]:
# 随机输出10个词汇实体
for i in [random.randint(0, len(lemmas)-1) for _ in range(10)]:
    print(lemmas[i])

WordNode(lozal.n)
WordNode(theorem.n)
WordNode(standing.n)
WordNode(rima_glottidis.n)
WordNode(neck-deep.s)
WordNode(lentibulariaceae.n)
WordNode(repent.v)
WordNode(militarized.s)
WordNode(comatulid.n)
WordNode(habitation.n)


## 2. 关系存储为CSV文件

In [15]:
import csv

In [16]:
print('Writing synsets...', end='')
with open('synsets.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id:ID', 'pos:string', 'definition:string', ':LABEL'])
    for synset in synsets:
        writer.writerow(synset.get_row())
print('Done')

Writing synsets...Done


In [17]:
print('Writing words...', end='')
with open('words.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(WordNode.get_header())
    for word in lemmas:
        writer.writerow(word.get_row())
print('Done')

Writing words...Done


In [18]:
print('Writing relationships...', end='')
with open('relationships.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([':START_ID', ':END_ID', ':TYPE'])
    for relationship in relationships:
        writer.writerow(relationship)
print('Done')

Writing relationships...Done


In [19]:
import pandas as pd

In [20]:
pd.read_csv('synsets.csv')

Unnamed: 0,id:ID,pos:string,definition:string,:LABEL
0,able.a.01,a,(usually followed by `to') having the necessar...,Synset
1,unable.a.01,a,(usually followed by `to') not having the nece...,Synset
2,abaxial.a.01,a,facing away from the axis of an organ or organism,Synset
3,adaxial.a.01,a,nearest to or facing toward the axis of an org...,Synset
4,acroscopic.a.01,a,facing or on the side toward the apex,Synset
...,...,...,...,...
117654,run_dry.v.01,v,become empty of water,Synset
117655,fog_up.v.01,v,get foggy,Synset
117656,char.v.01,v,burn to charcoal,Synset
117657,haze.v.01,v,"become hazy, dull, or cloudy",Synset


In [21]:
pd.read_csv('words.csv')

Unnamed: 0,id:ID,name,pos,:LABEL
0,able.a,able,a,Lemma
1,unable.a,unable,a,Lemma
2,abaxial.a,abaxial,a,Lemma
3,dorsal.a,dorsal,a,Lemma
4,adaxial.a,adaxial,a,Lemma
...,...,...,...,...
206973,fog_up.v,fog_up,v,Lemma
206974,char.v,char,v,Lemma
206975,coal.v,coal,v,Lemma
206976,haze.v,haze,v,Lemma


In [22]:
pd.read_csv('relationships.csv')

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,able.a.01,ability.n.01,Attribute
1,able.a.01,ability.n.02,Attribute
2,able.a.01,unable.a.01,Antonym
3,unable.a.01,ability.n.01,Attribute
4,abaxial.a.01,biology.n.01,Domain
...,...,...,...
342945,fog_up.v,fog_up.v.01,InSynset
342946,char.v,char.v.01,InSynset
342947,coal.v,char.v.01,InSynset
342948,haze.v,haze.v.01,InSynset


## 3. CSV导入Neo4j(ONgDB)及可视化效果

In [23]:
# 导入的时候需要把neo4j停掉, 而且删掉原来的数据库
# bin/ongdb-admin import --database graph.db --nodes=import/synsets.csv --nodes=import/words.csv --relationships=import/relationships.csv --ignore-duplicate-nodes

### 可视化效果
![](./graph.png)

## 4. 简单的问答应用

In [29]:
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher

In [30]:
def knowledge_inference(wordnet_graph, question):
    n_matcher = NodeMatcher(wordnet_graph)
    r_matcher = RelationshipMatcher(wordnet_graph)

    word = question.split(" ")[-1]
    print("Find knowledge for {}......\n\n".format(word))
    entity_node = n_matcher.match("Lemma", name=word).first()

    for r in r_matcher.match([entity_node], r_type="InSynset"):
        concept_node = r.end_node
        print("Definition: \n")
        print("-- {} is {}".format(word, concept_node["definition"]))
        print("\n" + "*" * 50 + "\n")

        print("Part of relations")
        for isa_r in r_matcher.match([concept_node], r_type="PartOf"):
            end = isa_r.end_node
            for isa_r in r_matcher.match([None, end], r_type="InSynset"):
                start = isa_r.start_node
                print("-- {} is part of {}".format(word, start["name"]))
            break

        for isa_r in r_matcher.match([None, concept_node], r_type="PartOf"):
            end = isa_r.end_node
            for isa_r in r_matcher.match([None, end], r_type="InSynset"):
                start = isa_r.start_node
                print("-- {} is part of {}".format(start["name"], word))
            break

        print("\n" + "*" * 50 + "\n")

        print("For examples: \n")
        for i, isa_r in enumerate(
                r_matcher.match([None, concept_node], r_type="IsA")):
            start = isa_r.start_node
            print("{}. {} is a {} which means {}\n".format(
                i + 1, start['id'].split(".")[0], word, start["definition"]))
        print("\n" + "*" * 50 + "\n")

        print("Similar Things: \n")
        for i, isa_r in enumerate(
                r_matcher.match([concept_node], r_type="SimilarTo")):
            end = isa_r.end_node
            print("{}".format(end["id"].split(".")[0]))

        for i, isa_r in enumerate(
                r_matcher.match([None, concept_node], r_type="SimilarTo")):
            start = isa_r.start_node
            print("{}".format(start["id"].split(".")[0]))

        print("\n" + "*" * 50 + "\n")

        print("Same domain words: \n")
        for i, isa_r in enumerate(
                r_matcher.match([concept_node], r_type="Domain")):
            end = isa_r.end_node
            print("{}".format(end["id"].split(".")[0]))

        for i, isa_r in enumerate(
                r_matcher.match([None, concept_node], r_type="Domain")):
            start = isa_r.start_node
            print("{}".format(start["id"].split(".")[0]))

        print("\n" + "*" * 50 + "\n")
        break

In [31]:
uri = "http://localhost:7474"
user = "ongdb"
password = "123456"
wordnet_graph = Graph(uri=uri, user=user, password=password)

In [32]:
question = "what is computer"
knowledge_inference(wordnet_graph, question)

Find knowledge for computer......


Definition: 

-- computer is a machine for performing calculations automatically

**************************************************

Part of relations
-- computer is part of platform
-- information_processing_system is part of computer
-- electronic_computer is part of computer
-- data_processor is part of computer
-- computing_device is part of computer
-- computing_machine is part of computer
-- computer is part of computer

**************************************************

For examples: 

1. web_site is a computer which means a computer connected to the internet that maintains a series of web pages on the World Wide Web

2. turing_machine is a computer which means a hypothetical computer with an infinitely long memory tape

3. server is a computer which means (computer science) a computer that provides client stations with access to files and printers as shared resources to a computer network

4. predictor is a computer which means a computer f

In [35]:
question = "what is us"
knowledge_inference(wordnet_graph, question)

Find knowledge for us......


Definition: 

-- us is North American republic containing 50 states - 48 conterminous states in North America plus Alaska in northwest North America and the Hawaiian Islands in the Pacific Ocean; achieved independence in 1776

**************************************************

Part of relations
-- us is part of north_america
-- u.s.a. is part of us
-- usa is part of us
-- u.s. is part of us
-- us is part of us
-- the_states is part of us
-- america is part of us
-- united_states_of_america is part of us
-- united_states is part of us

**************************************************

For examples: 


**************************************************

Similar Things: 


**************************************************

Same domain words: 

billionth
octillion
septillion
sextillion
quintillion
quadrillion
trillion
billion
inch
dollar
discount_rate
golden_fern
hedeoma
ringworm_bush
pineapple_weed
genus_epiphyllum
gibson
yankee
staff_member
old_man
mesti