In [18]:
import json
import os
import re

import fool
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the three header-less CSV files, then name the columns of each frame.
df_base = pd.read_csv("../data/baseinfo.csv", header=None)
df_attr = pd.read_csv("../data/atterinfo.csv", header=None)
df_qa = pd.read_csv("../data/qadata.csv", header=None)

df_base.columns = ["id", "name", "intro"]
df_attr.columns = ["id", "name", "key", "values"]
df_qa.columns = ["id", "name", "question", "answer"]

In [3]:
"问题个数：%s"%len(df_qa.question.drop_duplicates()), "属性个数：%s"%len(df_attr.key.drop_duplicates()),"所有农作物的个数：%s"%(df_base.shape[0])

('问题个数：4434', '属性个数：1230', '所有农作物的个数：5114')

### 关键词库

In [4]:
# Attribute category -> regular expression used to select matching attribute keys
# (get_attr_value applies these with `str.contains`).
key_words = {
    # Anchored patterns: the attribute key must equal the rank name exactly.
    "门": "^门$",
    "纲": "^纲$",
    "品种": "^品种$|^种$",
    "命名者及年代": "^命名者及年代$",
    # Unanchored alternatives: any key containing one of these region-related phrases matches.
    "区域": "分布区域|分布范围|原产地|产地名称|主产地|产.*?地|起源地|分布地区|地.*?点|特产地|发源地",
    "目": "^目$",
    "科": "^科$",
}

### 停用词加载

In [5]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open("../data/stop_word.txt", encoding="utf8") as stop_word_file:
    stop_words = [line.strip() for line in stop_word_file]

In [6]:
def cut_stop_word(stop_words, x):
    """Remove stop words from ``x`` and return the remaining pieces joined together.

    ``x`` is iterated element by element (character by character for a string).
    Elements found in ``stop_words`` are dropped; every kept element is stripped
    of surrounding whitespace, so whitespace-only characters disappear as well.
    """
    kept = []
    for piece in x:
        if piece in stop_words:
            continue
        kept.append(piece.strip())
    return "".join(kept)

### 实体统一

In [7]:
def entity_unify(x):
    """Placeholder for entity normalisation; currently returns ``x`` unchanged."""
    return x

### 添加关系

In [8]:
def add_rels(x, y):
    """Build ``"x###target"`` relation strings for every target in ``y``.

    Returns an empty list when ``y`` is empty or otherwise falsy.
    """
    if not y:
        return []
    relations = []
    for target in y:
        relations.append("###".join((x, target)))
    return relations

### 获取属性值

In [9]:
def get_attr_value(df, need_key):
    """Return the first attribute value in ``df`` whose key matches a category.

    Args:
        df: attribute rows with at least a ``key`` and a ``values`` column.
        need_key: category name looked up in the module-level ``key_words``
            mapping, which supplies the regular expression applied to ``df.key``.

    Returns:
        The ``values`` cell of the first matching row, or ``""`` when the
        category is unknown, nothing matches, or that cell is missing (NaN).
    """
    pattern = key_words.get(need_key)
    if not pattern:
        return ""
    # na=False: a missing key counts as "no match". Without it the mask contains
    # NaN and boolean indexing raises instead of selecting rows.
    matching_rows = df[df.key.str.contains(pattern, regex=True, na=False)]
    values = matching_rows.to_dict(orient="list").get("values", [])
    if not values:
        return ""
    first_value = values[0]
    # Callers pass the result straight to re / fool, which reject NaN; treat a
    # missing cell the same as "no value".
    return "" if pd.isna(first_value) else first_value

In [10]:
def _regex_entities(pattern, text):
    """Return all non-empty capture-group hits of ``pattern`` in ``text``, in match order.

    The patterns used by ``read_nodes`` consist of several alternative groups,
    so ``re.findall`` yields one tuple per match with the unused groups empty.
    """
    return [hit for groups in re.findall(pattern, text) for hit in groups if hit]


def read_nodes():
    """Extract entities and crop-to-entity relations from ``df_attr``.

    Rows are grouped by ``id``. A group is processed only when its keys matching
    ``中.*?名`` carry exactly one distinct value; that value, with stop words
    removed, is the crop name. Groups with conflicting names, or whose name is
    empty after stop-word removal, are skipped.

    Returns:
        A 27-tuple, in this order:
        * 14 entity lists: Produce, Person, Belong, Outline, Classify, Location,
          Catalogue, Family, Manure, Climate, Diseases, Nutrients, Illness,
          Ability. The last six are not extracted yet and are returned empty.
        * 13 relation lists: belong_part_of, classify_part_of, outline_part_of,
          catalogue_part_of, family_part_of, have_nutrition, distribute,
          give_name, fertilization, beneficial_envi, fall_ill, prevent_disease,
          health_help. Each populated one holds, per processed crop, a list of
          ``"crop###entity"`` strings (possibly empty).
    """
    # Entity lists (14 node types in total).
    Produce = []    # agricultural product names
    Person = []     # people (namers)
    Belong = []     # phylum / division (门)
    Outline = []    # class (纲)
    Classify = []   # variety (品种)
    Location = []   # regions
    Catalogue = []  # order (目)
    Family = []     # family (科)
    Manure = []     # fertilizers
    Climate = []    # climate
    Diseases = []   # pests and plant diseases
    Nutrients = []  # nutrients
    Illness = []    # illnesses
    Ability = []    # effects / benefits

    # Relation lists.
    # product -> taxonomy category
    belong_part_of, classify_part_of, outline_part_of, catalogue_part_of, family_part_of = [], [], [], [], []
    have_nutrition = []   # product -> nutrient
    distribute = []       # product -> region
    give_name = []        # product -> namer
    fertilization = []    # product -> fertilizer
    beneficial_envi = []  # product -> environment
    fall_ill = []         # product -> pest / plant disease
    prevent_disease = []  # product -> illness
    health_help = []      # product -> effect

    grouped = df_attr.groupby("id")
    for _, group in tqdm(grouped, total=grouped.ngroups, ncols=50):
        # na=False: rows with a missing key are simply not name rows.
        produce = group[group.key.str.contains("中.*?名.*?", regex=True, na=False)]
        # Groups whose Chinese name and Chinese scientific name disagree are
        # ambiguous and are not handled for now; groups without a name row are skipped too.
        if len(produce["values"].unique()) != 1:
            continue
        produce_name = cut_stop_word(stop_words, produce.iloc[0]["values"])
        if not produce_name:
            # An empty name would become a nameless node and unusable "###x" edges.
            continue
        Produce.append(produce_name)

        belong = _regex_entities(r"(\w{1,5}门)|.*?称.*?(\w{1,5}门)",
                                 get_attr_value(group, need_key="门"))
        Belong += belong
        belong_part_of.append(add_rels(produce_name, belong))

        # Strip brackets, Latin letters, whitespace and punctuation from the variety text.
        classify_text = re.sub(r"\(|\)|[A-Za-z]|\s+|\t+|，|\.|（|）|；|：|。", "",
                               get_attr_value(group, need_key="品种"))
        # Keep only short, non-empty variety names (fewer than 6 characters).
        classify = [classify_text] if 0 < len(classify_text) < 6 else []
        Classify += classify
        classify_part_of.append(add_rels(produce_name, classify))

        person_text = re.sub("，|；|：|。|'|\"", "",
                             get_attr_value(group, need_key="命名者及年代"))
        # An empty namer must not become a Person entity.
        person = [person_text] if person_text else []
        Person += person
        give_name.append(add_rels(produce_name, person))

        outline = _regex_entities(r"(\w{1,5}纲)|.*?称.*?(\w{1,5}纲)",
                                  get_attr_value(group, need_key="纲"))
        Outline += outline
        outline_part_of.append(add_rels(produce_name, outline))

        # fool.analysis returns (words, entities); element 3 of every entity tuple
        # is collected. NOTE(review): presumably that element is the entity text and
        # entity types other than locations are included as well - confirm.
        _, recognised = fool.analysis(get_attr_value(group, need_key="区域"))
        location = [entity[3] for entities in recognised for entity in entities if entity[3]]
        Location += location
        distribute.append(add_rels(produce_name, location))

        catalogue = _regex_entities(
            r".*?称.(\w{1,5}目)|.*?属.(\w{1,5}目)|(\w{1,5}目)|.*?归.(\w{1,5}目)",
            get_attr_value(group, need_key="目"))
        Catalogue += catalogue
        catalogue_part_of.append(add_rels(produce_name, catalogue))

        family = _regex_entities(
            r".*?即.(\w{1,5}科)|.*?或.(\w{1,5}科)|(\w{1,5}科)|.*?属.(\w{1,5}科)|.*?于.(\w{1,5}科)",
            get_attr_value(group, need_key="科"))
        Family += family
        family_part_of.append(add_rels(produce_name, family))
        # TODO: Manure, Climate, Diseases, Nutrients, Illness and Ability are not
        # extracted yet.

    return Produce, Person, Belong, Outline, Classify, Location, Catalogue, Family,\
        Manure, Climate, Diseases, Nutrients, Illness, Ability,\
        belong_part_of, classify_part_of, outline_part_of, catalogue_part_of, family_part_of, have_nutrition, distribute, give_name, fertilization, beneficial_envi, fall_ill, prevent_disease, health_help

### 连接数据库

In [11]:
from py2neo import Graph, Node, Relationship, NodeMatcher

# Connection settings are read from the environment so that credentials do not
# have to be written into the notebook; the fallbacks are the values that were
# previously hard-coded here.
graph = Graph(
    host=os.environ.get("NEO4J_HOST", "127.0.0.1"),  # address of the server running neo4j
    http_port=int(os.environ.get("NEO4J_HTTP_PORT", "7474")),  # port the neo4j server listens on
    user=os.environ.get("NEO4J_USER", "neo4j"),  # database user name; "neo4j" unless it was changed
    password=os.environ.get("NEO4J_PASSWORD", ""))

### 创建节点

In [12]:
def create_node(label, nodes):
    """Create one ``label`` node per entry of ``nodes`` in the module-level ``graph``.

    Each node gets the entry as its ``name`` property. Progress is shown with a
    tqdm bar sized to ``len(nodes)``. Returns ``None``.
    """
    progress = tqdm(total=len(nodes), ncols=100)
    with progress:
        for node_name in nodes:
            graph.create(Node(label, name=node_name))
            progress.set_description("Save %s to neo4j bat" % label)
            progress.update(1)

### 创建关系

In [13]:
def create_relationship(start_node, end_node, edges, rel_type, rel_name):
    """Create ``(start)-[rel_type {name: rel_name}]->(end)`` relationships in ``graph``.

    Args:
        start_node: label of the source nodes.
        end_node: label of the target nodes.
        edges: list of lists of ``"source###target"`` strings; empty inner
            lists are ignored and duplicate edges are created only once.
        rel_type: relationship type used in the Cypher pattern.
        rel_name: value stored in the relationship's ``name`` property.

    Labels and the relationship type are interpolated into the query text
    (Cypher cannot parameterise them), so they must be trusted identifiers.
    Node names and ``rel_name`` are sent as query parameters, so quotes or
    backslashes in the data cannot break or alter the statement.

    Edges with an empty source or target are skipped. A failing query is
    printed and does not stop the remaining edges.
    """
    # Flatten and de-duplicate in a single pass; sum(lists, []) copies the
    # accumulated list on every step and is quadratic.
    unique_edges = {edge for edge_group in edges if edge_group for edge in edge_group}
    query = (
        "match(p:%s),(q:%s) where p.name=$p_name and q.name=$q_name "
        "create (p)-[rel:%s{name:$rel_name}]->(q)" % (start_node, end_node, rel_type)
    )
    with tqdm(total=len(unique_edges), ncols=100) as pbar:
        for edge in unique_edges:
            # partition never raises on a malformed edge and keeps any further
            # "###" inside the target name.
            p, _, q = edge.partition('###')
            if q and p:
                try:
                    graph.run(query, {"p_name": p, "q_name": q, "rel_name": rel_name})
                    pbar.set_description("Save %s of %s relationship to neo4j bat" %(p,q))
                except Exception as e:
                    print(e)
            # Advance for every edge, including skipped or failed ones, so the
            # bar reaches its total.
            pbar.update(1)

### 主函数

In [19]:
# Extract every entity list and relation list from the attribute table.
(
    Produce, Person, Belong, Outline, Classify, Location, Catalogue, Family,
    Manure, Climate, Diseases, Nutrients, Illness, Ability,
    belong_part_of, classify_part_of, outline_part_of, catalogue_part_of, family_part_of,
    have_nutrition, distribute, give_name, fertilization, beneficial_envi,
    fall_ill, prevent_disease, health_help,
) = read_nodes()

# Create the nodes, one label at a time, from the de-duplicated entity lists.
for node_label, node_names in (
    ("Produce", Produce),
    ("Person", Person),
    ("Belong", Belong),
    ("Outline", Outline),
    ("Classify", Classify),
    ("Location", Location),
    ("Catalogue", Catalogue),
    ("Family", Family),
):
    create_node(node_label, set(node_names))

# Create the relationships; every one starts at a Produce node.
for end_label, edge_lists, relation_type, relation_name in (
    ("Belong", belong_part_of, "part_of", "属于"),
    ("Classify", classify_part_of, "part_of", "属于"),
    ("Outline", outline_part_of, "part_of", "属于"),
    ("Catalogue", catalogue_part_of, "part_of", "属于"),
    ("Family", family_part_of, "part_of", "属于"),
    ("Location", distribute, "distribute", "分布于"),
    ("Person", give_name, "give_name", "取名字是"),
):
    create_relationship("Produce", end_label, edge_lists, relation_type, relation_name)



100%|█████████| 4408/4408 [01:00<00:00, 72.65it/s]


### 导出文件

In [21]:
# Write each de-duplicated entity list to ../data/<name>.txt, one entity per line.
export_targets = {
    "produce": Produce,
    "person": Person,
    "belong": Belong,
    "outline": Outline,
    "classify": Classify,
    "location": Location,
    "catalogue": Catalogue,
    "family": Family,
}
for file_stem, entities in export_targets.items():
    # `with` closes the file even if a write fails. The explicit encoding matches
    # how the stop-word file is read and keeps the output independent of the
    # platform's default codec.
    with open("../data/%s.txt" % file_stem, "w", encoding="utf8") as export_file:
        # sorted(): set iteration order is arbitrary, so sort for reproducible files.
        export_file.write("\n".join(sorted(set(entities))))

['望都塔桧',
 '红岗山桃子',
 '宝贵满堂',
 '三色虎耳草',
 '指示生物',
 '宽根荷',
 '台湾大青枣',
 '狲猴桃',
 '菜豆锈病',
 '欧洲云杉',
 '油菜假黑斑病',
 '垂花赫蕉',
 '南川青荚叶',
 '禾叶点地梅',
 '琥头',
 '三羽新月蕨',
 '墨兰',
 '保丰2号',
 '商路科',
 '裸蕨类',
 '中甘9号',
 '东农610',
 '黑梅',
 '破布籽',
 '一串红',
 '白梗芋',
 '朝优9号',
 '防己',
 '六出花',
 '内含子',
 '下田菊',
 '异果黄堇',
 '黄金瓜',
 '新型兰属',
 '巨型魔芋',
 '铁椒3号',
 '彩叶凤梨',
 '木瓜面膜',
 '国欣棉3号',
 '发芽糙米',
 '准两优2号',
 '大田作物',
 '五岭红',
 '唐山秋瓜',
 '黄金蜜柚',
 '九轮塔',
 '常夏石竹',
 '台湾杜鹃',
 '阿太一代',
 '福禄桐',
 '辽茄五号茄子',
 '东久橐吾',
 '甘蓝菜',
 '七爪龙',
 '爱伦达尔桔',
 '三角树状大戟',
 '红皮南瓜',
 '中科10号',
 '京玉桃',
 '具芒碎米莎草',
 '特小凤',
 '两广蛇根草',
 '华光油桃',
 '皇家秋天',
 '脐血橙',
 '甘蔗桂糖',
 '长豇豆',
 '中亚荩草',
 '红地球葡萄',
 '笋干',
 '新陆中42号',
 '三叶梣',
 '密花石斛',
 '金桃猕猴桃',
 '圆叶茑萝',
 '东亚蛾眉蕨',
 '来檬',
 '慈姑',
 '大青皮冬瓜',
 '沙田柚',
 '中优901',
 '悦阳45天',
 '界牌慈菇',
 '龙骨菜',
 '三叶薯蓣',
 '中棉所70',
 '北京小杂50号',
 '山核桃',
 '婆罗门菊',
 '授粉',
 '万金优2008',
 '中华青荚叶',
 '中农稻1号',
 '黄小橙子',
 '鲍鱼菇',
 '假多叶黄堇',
 '南宁肉丝瓜',
 '农业气候',
 '丝颖针茅',
 '矮生延胡索',
 '白粉桃',
 '开瓣豹子花',
 '云山大樱桃',
 '巴',
 '双蝴蝶',
 '佳粉17号',
 '丝茎蓼',
 '吉农大578',
 '丰绿猕猴桃',
 '九台晚李',
 '中优223',
 

In [None]:
# df_attr[(df_attr.id=="05bdd42efc5a483d58dc70cf95497b04")&(df_attr.key=="中文名")]
# a = re.findall("(\w{1,5}门)|.*?称.*?(\w{1,5}门)","被子植物门")
# a
# a = re.sub("\(|\)|[A-Za-z]|\s+|\t+|，|\.|（|）|；|：|。","",ddddd)

In [67]:
# import ahocorasick

# def make_AC(AC, word_set):
#     for word in word_set:
#         AC.add_word(word,word)
#     return AC

# def test_ahocorasick():
#     '''
#     ahocosick：自动机的意思
#     可实现自动批量匹配字符串的作用，即可一次返回该条字符串中命中的所有关键词
#     '''
#     key_list = ["苹果", "香蕉", "梨", "橙子", "柚子", "火龙果", "柿子", "猕猴挑"]
#     AC_KEY = ahocorasick.Automaton()
#     AC_KEY = make_AC(AC_KEY, set(key_list))
#     AC_KEY.make_automaton()
#     test_str_list = ["我最喜欢吃的水果有：苹果、梨、香蕉、还有烤肉香蕉水", "我也喜欢吃香蕉，但是我不喜欢吃梨"]
#     for content in test_str_list:
#         name_list = set()
#         for item in AC_KEY.iter(content):#将AC_KEY中的每一项与content内容作对比，若匹配则返回
#             name_list.add(item[1])
#         name_list = list(name_list)
#         if len(name_list) > 0:
#             print(content, "--->命中的关键词有：", "\t".join(name_list))
# if __name__ == "__main__":
#     test_ahocorasick()

我最喜欢吃的水果有：苹果、梨、香蕉、还有烤肉香蕉水 --->命中的关键词有： 苹果	梨	香蕉
我也喜欢吃香蕉，但是我不喜欢吃梨 --->命中的关键词有： 梨	香蕉


In [None]:
from question_classifier import *
from question_parser import *
from answer_search import *

class ChatBotGraph:
    """Question-answering bot that chains a classifier, a parser and a searcher."""

    def __init__(self):
        self.classifier = QuestionClassifier()
        self.parser = QuestionPaser()
        self.searcher = AnswerSearcher()

    def chat_main(self, sent):
        """Answer the question ``sent``.

        The question is classified, the classification is turned into queries
        by the parser, and the searcher's answers are joined with newlines.
        The fallback reply is returned when classification yields nothing or
        no answers are found.
        """
        fallback = '！'
        classification = self.classifier.classify(sent)
        if classification:
            queries = self.parser.parser_main(classification)
            found_answers = self.searcher.search_main(queries)
            if found_answers:
                return '\n'.join(found_answers)
        return fallback

if __name__ == '__main__':
    # Interactive loop: read a question, print the bot's reply, repeat until interrupted.
    handler = ChatBotGraph()
    while True:
        question = input('用户:')
        answer = handler.chat_main(question)
        print('小勇:', answer)
