In [1]:
import os
from py2neo import Graph
import ahocorasick

### 关键词导入

In [2]:
def _load_wordlist(path, encoding=None):
    """Read one keyword per line from ``path`` and drop blank lines.

    Every entry is stripped of surrounding whitespace. ``encoding=None``
    keeps the platform-default encoding, i.e. the same behaviour as a plain
    ``open(path)``. The file is closed as soon as it has been read.
    """
    # NOTE(review): the word lists presumably hold Chinese text; pass an
    # explicit encoding here once the files' encoding has been confirmed.
    with open(path, encoding=encoding) as handle:
        return [line.strip() for line in handle if line.strip()]


# One word list per entity type; the variable names double as the type labels
# used by build_wdtype_dict().
produce = _load_wordlist("../data/produce.txt")
classify = _load_wordlist("../data/classify.txt")
person = _load_wordlist('../data/person.txt')
belong = _load_wordlist("../data/belong.txt")
outline = _load_wordlist("../data/outline.txt")
location = _load_wordlist("../data/location.txt")
catalogue = _load_wordlist("../data/catalogue.txt")
family = _load_wordlist("../data/family.txt")

# Union of all lists; a set removes words that occur in more than one list.
keywords = set().union(
    produce, classify, person, belong, outline, location, catalogue, family
)

### 构建关键词库

In [3]:
def build_wdtype_dict(keywords):
    """Map every keyword to the list of entity types it belongs to.

    Args:
        keywords: iterable of keyword strings.

    Returns:
        dict ``{keyword: [type, ...]}``. The types are the names of the
        module-level word lists that contain the keyword, always in the order
        produce, classify, person, belong, outline, location, catalogue,
        family. A keyword found in none of the lists maps to ``[]``.
    """
    typed_lists = (
        ('produce', produce),
        ('classify', classify),
        ('person', person),
        ('belong', belong),
        ('outline', outline),
        ('location', location),
        ('catalogue', catalogue),
        ('family', family),
    )
    # Convert each list to a set once so the membership test per keyword is a
    # hash lookup instead of a linear scan over the whole word list.
    typed_sets = [(label, set(words)) for label, words in typed_lists]
    return {
        wd: [label for label, words in typed_sets if wd in words]
        for wd in keywords
    }

In [4]:
# Sanity check: number of distinct keywords that received a type mapping.
len(build_wdtype_dict(keywords))

6714

In [5]:
# Scratch experiment on merging two collections.
# Earlier attempt with dicts, kept for reference:
# a = {"a":1,"b":2,"c":3}
# b = {"a":2, "e":4,"d":"5"}
# a.update(b)
# a
a={1,2,34}
b={1,2,10}
# NOTE: sets do not implement `+`, so the next line raises the TypeError shown
# in the recorded output; the set union operator is `a | b`.
a+b

TypeError: unsupported operand type(s) for +: 'set' and 'set'

### 加速匹配

In [6]:
def build_actree(wordlist):
    """Create an ``ahocorasick.Automaton`` holding every word of ``wordlist``.

    Each word is stored with the payload ``(position, word)``, where
    ``position`` is the word's index in iteration order. The automaton is
    finalised with ``make_automaton()`` before it is returned.
    """
    automaton = ahocorasick.Automaton()
    position = 0
    for term in wordlist:
        automaton.add_word(term, (position, term))
        position += 1
    automaton.make_automaton()
    return automaton

In [7]:
# Scratch experiment: pair up the items of two dicts and rebuild a mapping.
# (Leftover sample question: "Which crops are suitable for growing in Fujian province?")
a={"A":"a","B":"b","C":"c"}
b = {"A":"b","B":"e","D":"d"}
# a.items()
d = dict(zip(a.items(),b.items()))
# Iterating `d` yields its keys, which are the (key, value) tuples of `a`.
# A `key: value` comprehension must be written with braces, not brackets.
{n[0]: n[1] for n in d}

{'A': 'a', 'B': 'b', 'C': 'c'}

### 问句过滤

In [8]:
def check_medical(question,tree,wdtype_dict):
    """Find the keywords occurring in ``question`` and look up their types.

    ``tree.iter(question)`` is expected to yield items whose ``[1][1]``
    element is the matched word. A matched word that is a proper substring of
    another matched word is discarded, so only the longest matches remain.

    Returns:
        dict ``{word: wdtype_dict.get(word)}`` for the remaining words, in
        order of first occurrence among the matches.
    """
    matched = [hit[1][1] for hit in tree.iter(question)]

    def is_shadowed(word):
        # True when a different matched word contains this one.
        return any(word != other and word in other for other in matched)

    result = {}
    for word in matched:
        if is_shadowed(word):
            continue
        result[word] = wdtype_dict.get(word)
    return result

### 疑问词库

In [9]:
# Trigger phrases per query intention. A question matches an intention when it
# contains at least one of the phrases as a substring (see check_intention).

# "which variety" / "belongs to which variety"
# NOTE(review): the first two entries contain an embedded space; kept as-is,
# confirm whether that variant is intentional.
classify_qwds = [
    '什么品 种',
    '属于什么品 种',
    '什么品种',
    '属于什么品种',
    '品种是什么',
]

# "named by whom"
person_qwds = [
    "什么人命名",
    "被命名",
    "谁命名",
]

# "which phylum"
belong_qwds = [
    '什么门',
    '属于什么门',
    '门是什么',
]

# "which class"
outline_qwds = [
    '什么纲',
    '属于什么纲',
    '纲是什么',
]

# "where is it distributed / where does it grow"
location_qwds = [
    "分布哪里",
    "分布区域",
    "分布范围",
    "产地是",
    "分布地区",
    "主产地",
    "场地有",
    "在哪里",
    "生在哪",
    "生长在哪",
    "在哪生",
]

# "which order"
catalogue_qwds = [
    '什么目',
    '属于什么目',
    '目是什么',
]

# "which family"
family_qwds = [
    '什么科',
    '属于什么科',
    '科是什么',
]

### 特征词分类

In [10]:
def check_intention(wds, sent):
    """Return True when at least one phrase of ``wds`` is a substring of ``sent``.

    An empty ``wds`` yields False.
    """
    return any(phrase in sent for phrase in wds)


### 问句处理主程序

In [11]:
def question_parse_run(question):
    """Extract the entities and the query intentions from a question.

    The keyword/type dictionary and the matching automaton are rebuilt from
    the module-level ``keywords`` on every call.

    Args:
        question: the question text.

    Returns:
        ``{}`` when no keyword is found in the question. Otherwise a dict with
        ``"args"`` (matched word -> list of entity types) and
        ``"question_types"`` (non-empty list of intention names).
    """
    wd_dict = build_wdtype_dict(keywords)
    tree = build_actree(keywords)
    final_dict = check_medical(question, tree, wd_dict)
    print("final_dict",final_dict)
    if not final_dict:
        return {}

    # Flatten the type labels of all matched words; `or []` tolerates a word
    # without a type mapping (None) instead of raising TypeError.
    entity_types = []
    for types in final_dict.values():
        entity_types += types or []
    has_produce = "produce" in entity_types

    # (intention name, trigger phrases), in the order results are reported.
    intention_rules = (
        ("produce_classify", classify_qwds),    # variety
        ("produce_person", person_qwds),        # who named it
        ("produce_belong", belong_qwds),        # phylum
        ("produce_outline", outline_qwds),      # class
        ("produce_location", location_qwds),    # geographic distribution
        ("produce_catalogue", catalogue_qwds),  # order
        ("produce_family", family_qwds),        # family
    )
    question_type_list = []
    if has_produce:
        # Every intention requires a "produce" entity to query about.
        question_type_list = [
            name for name, qwds in intention_rules
            if check_intention(qwds, question)
        ]

    # No specific intention but a produce entity: fall back to a description.
    # The name must be "produce_desc" - that is what gen_sql_list(),
    # sql_teansfor() and answer_prettify() dispatch on.
    if not question_type_list and has_produce:
        question_type_list = ["produce_desc"]

    # Entities were found but none is a produce: ask the user to elaborate.
    if not question_type_list and entity_types:
        question_type_list = ["add_want_do"]

    # Spelled "other" to match the name gen_sql_list()/sql_teansfor() handle.
    if not question_type_list:
        question_type_list = ["other"]

    return {"args": final_dict, "question_types": question_type_list}


In [31]:
# Example question: "What variety are apple and banana, which phylum,
# where do they usually grow, and who named them?"
question = "苹果香蕉是什么品种,是什么门，一般在哪生长,被谁命名的"
# `parse_data` is read as a module-level global by gen_sql_list() further down.
parse_data = question_parse_run(question) 
parse_data

final_dict {'苹果': ['produce', 'classify'], '香蕉': ['produce', 'classify']}


{'args': {'苹果': ['produce', 'classify'], '香蕉': ['produce', 'classify']},
 'question_types': ['produce_classify',
  'produce_person',
  'produce_belong',
  'produce_location']}

In [32]:
# Show the two parts of the parse result: matched entities and detected intentions.
parse_data["args"],parse_data["question_types"]

({'苹果': ['produce', 'classify'], '香蕉': ['produce', 'classify']},
 ['produce_classify', 'produce_person', 'produce_belong', 'produce_location'])

In [33]:
# Scratch value for the membership test in the next cell.
a={"a":[1,2],"b":[3]}

In [34]:
"b" in a

True

In [35]:
def build_entity_dict(entitys):
    """Invert an ``entity -> [types]`` mapping into ``type -> [entities]``.

    Entities are appended in the iteration order of ``entitys``; an entity
    listed under several types appears under each of them.
    """
    grouped = {}
    for name, labels in entitys.items():
        for label in labels:
            grouped.setdefault(label, []).append(name)
    return grouped

### 问句到数据查询语句转换

In [36]:
def gen_sql_list(parsed=None):
    """Turn a parse result into a list of query descriptions.

    Args:
        parsed: result of question_parse_run(). When omitted, the module-level
            ``parse_data`` is used, as before.

    Returns:
        A list of dicts ``{'question_type': intention, 'sql': [query, ...]}``,
        one per handled intention for which sql_teansfor() produced at least
        one query. An empty parse result (no entity recognised) yields ``[]``
        instead of raising KeyError.
    """
    if parsed is None:
        parsed = parse_data
    if not parsed:
        return []

    entity_dict = build_entity_dict(parsed["args"])
    print(entity_dict)

    # Every handled intention is answered from the "produce" entities;
    # any other intention name is skipped.
    handled = {
        "produce_classify",
        "produce_person",
        "produce_belong",
        "produce_outline",
        "produce_location",
        "produce_catalogue",
        "produce_family",
        "produce_desc",
        "add_want_do",
        "other",
    }
    produce_entities = entity_dict.get("produce")

    sql_list = []
    for intention in parsed["question_types"]:
        if intention not in handled:
            continue
        sql = sql_teansfor(intention, produce_entities)
        if sql:
            sql_list.append({'question_type': intention, 'sql': sql})
    return sql_list
            

In [37]:
def _escape_cypher_string(value):
    """Escape backslashes and single quotes for a single-quoted Cypher literal."""
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


def sql_teansfor(intention, entites):
    """Build the queries for one intention and a list of produce names.

    Args:
        intention: intention name as produced by question_parse_run().
        entites: produce names; ``None`` or empty yields ``[]``.

    Returns:
        For the relation intentions, one Cypher query string per name. For
        "produce_desc", "add_want_do" and "other", a copy of the names
        themselves. For any other intention, ``[]``.
    """
    if not entites:
        return []

    # Intention -> label of the node on the other end of the part_of relation.
    # NOTE(review): "Catalogue" was missing, so order questions never produced
    # a query. The label is assumed by analogy with the other six - confirm it
    # against the graph schema.
    target_labels = {
        "produce_classify": "Classify",
        "produce_person": "Person",
        "produce_belong": "Belong",
        "produce_outline": "Outline",
        "produce_location": "Location",
        "produce_catalogue": "Catalogue",
        "produce_family": "Family",
    }
    if intention in target_labels:
        query = (
            "match(p:Produce),(b:{label}) where p.name='{name}' "
            "match n =((p)-[:part_of]-(b)) return p.name,{gap}b.name"
        )
        # The family query has always had a space after the comma in its
        # return clause; keep the generated text identical.
        gap = " " if intention == "produce_family" else ""
        label = target_labels[intention]
        return [
            query.format(label=label, name=_escape_cypher_string(i), gap=gap)
            for i in entites
        ]

    if intention in ("produce_desc", "add_want_do", "other"):
        return list(entites)
    return []

In [38]:
# Generate the queries for the question parsed above (reads the global `parse_data`).
gen_sql_list()

{'produce': ['苹果', '香蕉'], 'classify': ['苹果', '香蕉']}


[{'question_type': 'produce_classify',
  'sql': ["match(p:Produce),(b:Classify) where p.name='苹果' match n =((p)-[:part_of]-(b)) return p.name,b.name",
   "match(p:Produce),(b:Classify) where p.name='香蕉' match n =((p)-[:part_of]-(b)) return p.name,b.name"]},
 {'question_type': 'produce_person',
  'sql': ["match(p:Produce),(b:Person) where p.name='苹果' match n =((p)-[:part_of]-(b)) return p.name,b.name",
   "match(p:Produce),(b:Person) where p.name='香蕉' match n =((p)-[:part_of]-(b)) return p.name,b.name"]},
 {'question_type': 'produce_belong',
  'sql': ["match(p:Produce),(b:Belong) where p.name='苹果' match n =((p)-[:part_of]-(b)) return p.name,b.name",
   "match(p:Produce),(b:Belong) where p.name='香蕉' match n =((p)-[:part_of]-(b)) return p.name,b.name"]},
 {'question_type': 'produce_location',
  'sql': ["match(p:Produce),(b:Location) where p.name='苹果' match n =((p)-[:part_of]-(b)) return p.name,b.name",
   "match(p:Produce),(b:Location) where p.name='香蕉' match n =((p)-[:part_of]-(b)) retur

### 生成对话

In [284]:

# Connection to the Neo4j instance. Every setting can be overridden through an
# environment variable so that credentials need not live in the notebook; the
# defaults are the values that were previously hard-coded.
graph = Graph(
        host=os.environ.get("NEO4J_HOST", "127.0.0.1"),
        http_port=int(os.environ.get("NEO4J_HTTP_PORT", "7474")),
        user=os.environ.get("NEO4J_USER", "neo4j"),
        password=os.environ.get("NEO4J_PASSWORD", ""))
# Maximum number of distinct values listed in one generated answer.
num_limit = 5


In [293]:
def answer_prettify(question_type, answers):
    """Format query results into one answer sentence.

    Args:
        question_type: intention name selecting the sentence template.
        answers: list of result rows, each a mapping with the keys
            ``'p.name'`` (subject) and ``'b.name'`` (related value).

    Returns:
        ``''`` when ``answers`` is empty, ``[]`` when ``question_type`` has no
        template, otherwise the formatted sentence. At most ``num_limit``
        distinct values are listed, in order of first appearance.
    """
    if not answers:
        return ''

    templates = {
        'produce_classify': '{0}是属于：{1}品种',
        'produce_person': '{0}命名者是：{1}',
        'produce_belong': '{0}是属于：{1}门',
        'produce_outline': '{0}是属于：{1}纲',
        'produce_location': '{0}常分布于：{1}',
        'produce_catalogue': '{0}是属于：{1}目',
        'produce_family': '{0}是属于：{1}科',
        'produce_desc': '{0}是属于：{1}品种',
        'produce_other': '{0}是属于：{1}品种',
    }
    template = templates.get(question_type)
    if template is None:
        # Unknown question type: keep the historical empty-list result.
        return []

    # NOTE(review): only the first row's subject is used, even when the rows
    # come from queries about several different subjects.
    subject = answers[0]['p.name']
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # values kept by the num_limit cut-off are deterministic (a set is not).
    distinct = list(dict.fromkeys(row['b.name'] for row in answers))
    return template.format(subject, '；'.join(distinct[:num_limit]))


In [302]:
def search_main():
    """Run the generated queries against the graph and build the answers.

    Every query description from gen_sql_list() is executed with
    ``graph.run(query).data()``; the rows of all its queries are collected and
    formatted by answer_prettify(). Intermediate rows and the final answers
    are printed as before.

    Returns:
        The list of non-empty answer strings (previously this was only
        printed and the function returned None).
    """
    final_answers = []
    for sql in gen_sql_list():
        question_type = sql['question_type']
        answers = []
        for query in sql['sql']:
            answers += graph.run(query).data()
            print(question_type,answers)
        _answer = answer_prettify(question_type, answers)
        if _answer:
            final_answers.append(_answer)
    print(final_answers)
    return final_answers


In [305]:
# Run the full pipeline for the current `parse_data` against the graph.
search_main()

produce_classify [{'p.name': '红富士苹果', 'b.name': '富士苹果'}]
produce_person []
produce_belong [{'p.name': '红富士苹果', 'b.name': '被子植物门'}]
produce_location []
['红富士苹果是属于：富士苹果品种']


### 对话模板

In [52]:
# Scratch check: an empty set is falsy, so the else branch runs and prints 2.
if set():
    print(1)
else:
    print(2)

2


In [49]:
# Scratch check of list.insert() with index 0 and with a negative index.
a=["1","2","3","4"]
# insert(-1, x) places x before the last element, not at the very end.
a.insert(-1,"以及")
b=["7","5"]
b.insert(0,"不仅")
b.insert(-1,"而且")
b,a

(['不仅', '7', '而且', '5'], ['1', '2', '3', '以及', '4'])

In [309]:
import re

In [39]:
# Scratch value shaped like the output of build_entity_dict().
a = {'produce': ['苹果'], 'classify': ['苹果']}

In [40]:
"produce" in a

True