In [62]:
# -*- coding: UTF-8 -*-
import pandas as pd
import ast
import os
import jieba
import jieba.posseg as pseg
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import docx
# 可视化
from graphviz import Digraph
from pyltp import SentenceSplitter,Segmentor,Postagger,NamedEntityRecognizer,Parser,SementicRoleLabeller

In [80]:
# Input / output locations used throughout the notebook.
csvPath = './data/operation.csv'
filePath = './data/solution/'
relationPath = './data/relation.csv'

# Raw string: the Windows path contains backslashes that are not valid escape
# sequences ("\L", "\l"); a plain literal triggers an invalid-escape warning.
LTP_DATA_DIR = r'D:\LTPmodel\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model file `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model file `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model file `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model file `parser.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # semantic role labelling model; note this is a directory, not a file


In [64]:
# 文件操作
def readDocx(filePath):
    """Read a Word document and return its paragraphs.

    Args:
        filePath: path handed to ``docx.Document``.

    Returns:
        A list with the ``text`` of each paragraph, in document order.
    """
    document = docx.Document(filePath)
    return [paragraph.text for paragraph in document.paragraphs]


# 读取所有文件
def findAllFile(base):
    """Yield the bare name of every file found under ``base``.

    The tree is walked recursively with ``os.walk``; only file names are
    produced, without the directory they were found in.
    """
    for _root, _dirs, names in os.walk(base):
        yield from names

# 写入csv中
def writeCSV(data_ls):
    """Append ``data_ls`` to the CSV file at the module-level ``csvPath``.

    The data is wrapped in a ``pandas.DataFrame`` and written in append mode
    without the index column.
    """
    pd.DataFrame(data_ls).to_csv(csvPath, index=False, mode='a')

# filePath_ls是solution文件夹下所有文件的文件名
def getAllFilePath(filePath_ls):
    """Append every file name found under the module-level ``filePath``.

    Args:
        filePath_ls: list that is extended in place with the names produced
            by ``findAllFile(filePath)``.
    """
    filePath_ls.extend(findAllFile(filePath))

分词

In [65]:
# Word segmentation: register the custom jieba user dictionary.
# Raw string so the backslashes in the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences.
jieba.load_userdict(r'D:\zoe\实验\meProject\Algorithm_KnowledgeGraph\dic.txt')

In [66]:
# jieba词性标注
def posttagger(sent):
    """Segment and POS-tag ``sent`` with ``jieba.posseg``.

    Returns:
        ``(words_list, tags_list)``: two parallel lists holding each token
        and its part-of-speech tag, in sentence order.
    """
    tagged = [(word, tag) for word, tag in pseg.cut(sent)]
    words_list = [word for word, _ in tagged]
    tags_list = [tag for _, tag in tagged]
    return words_list, tags_list


将源文件与字典匹配，保留包含字典中的值的句子

In [67]:
# 字典匹配
# data_ls:包含标签的句子
# tags_ls:通过模糊匹配得到的标签
def match(sent,dic,data_ls,tags_ls):
    """Keep ``sent`` if it fuzzily matches an entry of ``dic``.

    The best candidate is chosen with ``fuzz.partial_ratio``. When its score
    is at least 60, the sentence is appended to ``data_ls`` and the matched
    dictionary entry to ``tags_ls``; both lists are modified in place.

    Args:
        sent: sentence to test.
        dic: collection of lexicon entries to match against.
        data_ls: output list of sentences that matched.
        tags_ls: output list of the lexicon entries they matched.
    """
    # Use the `dic` argument; the previous code ignored it and read the
    # global `dic_list`, so the function only worked inside the notebook.
    best = process.extractOne(sent, dic, scorer=fuzz.partial_ratio)
    # extractOne yields None when there is no candidate (e.g. empty `dic`).
    if best is None:
        return
    if best[1] >= 60:
        data_ls.append(sent)
        tags_ls.append(best[0])
    

In [68]:
#去除重复词
def remove_duplicates(text_ls):
    """Return the items of ``text_ls`` with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(text_ls))

依存句法分析

In [69]:
def e_parser(words,tags):
    """Run LTP dependency parsing on one tokenised sentence.

    A ``Parser`` is created from ``par_model_path`` on every call and
    released before returning.

    Args:
        words: list of tokens.
        tags: list of POS tags, parallel to ``words``.

    Returns:
        The arcs produced by ``parser.parse(words, tags)``.
    """
    parser = Parser(par_model_path)
    try:
        return parser.parse(words, tags)
    finally:
        # Release the model even when parsing raises.
        parser.release()

In [108]:
# 
def extractOne(words,relation,heads,keywords=None):
    """Collect ``[word, head]`` pairs for keyword tokens in one sentence.

    A token is kept when it is one of ``keywords`` and its dependency
    relation is ``'VOB'`` or ``'SBV'``.

    Args:
        words: tokens of the sentence.
        relation: dependency relation label of each token.
        heads: head word of each token.
        keywords: tokens of interest. When omitted, the notebook-level
            ``tags_ls`` is used, which is what this function always read
            before the parameter existed.

    Returns:
        A list of ``[word, head]`` lists, in sentence order.
    """
    if keywords is None:
        keywords = tags_ls  # backward-compatible fallback to the global
    relation_dic = ('VOB', 'SBV')
    return [
        [word, head]
        for word, label, head in zip(words, relation, heads)
        if word in keywords and label in relation_dic
    ]

In [71]:
def extractRelation(arcs,words):
    """Split parser arcs into relation labels and head words.

    Each arc is indexed as ``arc[0]`` (1-based index of the head token, with
    0 meaning the root) and ``arc[1]`` (relation label).

    Returns:
        ``(relation, heads)``: the label of every arc, and for every arc the
        head token taken from ``words`` or the string ``'Root'``.
    """
    head_ids = list(map(lambda arc: arc[0], arcs))
    labels = list(map(lambda arc: arc[1], arcs))
    head_words = []
    for head_id in head_ids:
        if head_id == 0:
            head_words.append('Root')
        else:
            # Head ids are 1-based; shift to index into `words`.
            head_words.append(words[head_id - 1])
    return labels, head_words

In [72]:
# 可视化
def visual(relation,words,heads):
    """Render the dependency arcs of one sentence with graphviz.

    One node is created for ``'Root'`` and one per token. Every token gets an
    edge labelled with its relation: to its head for ordinary relations, and
    to ``'Root'`` for ``'HED'`` (drawn from the head word instead of the token
    when that head is not ``'Root'``). The graph is then opened with
    ``view()``.
    """
    graph = Digraph('测试图片')
    graph.node(name='Root')
    for token in words:
        graph.node(name=token, fontname='FangSong')

    for idx, token in enumerate(words):
        label = relation[idx]
        head = heads[idx]
        is_head_arc = label in ['HED']
        # HED arcs always point at Root; they start at the head word unless
        # that head is already Root, in which case they start at the token.
        source = head if (is_head_arc and head != 'Root') else token
        target = 'Root' if is_head_arc else head
        graph.edge(source, target, label=label)

    graph.view()

In [251]:
def extract(data_ls, keywords=None):
    """Extract ``[keyword, head]`` pairs from a list of sentences.

    Each sentence is segmented and POS-tagged with ``posttagger``, parsed
    with the LTP dependency parser, and every token that is one of
    ``keywords`` and carries a ``'VOB'`` or ``'SBV'`` relation is recorded
    together with its head word.

    Args:
        data_ls: sentences to analyse.
        keywords: tokens of interest. When omitted, the notebook-level
            ``tags_ls`` is used, which is what this function always read
            before the parameter existed.

    Returns:
        A list of ``[word, head]`` lists, in the order encountered.
    """
    if keywords is None:
        keywords = tags_ls  # backward-compatible fallback to the global
    relation_dic = ('VOB', 'SBV')
    relation_ls = []
    if not data_ls:
        # Nothing to parse: avoid loading the parser model at all.
        return relation_ls

    # Load the parser model once for the whole batch instead of once per
    # sentence, and make sure it is released even if a sentence fails.
    parser = Parser(par_model_path)
    try:
        for sentence in data_ls:
            # NOTE(review): POS tags come from jieba while the parser model is
            # LTP's - confirm the two tag sets are compatible.
            words, tags = posttagger(sentence)
            if not words:
                continue
            arcs = parser.parse(words, tags)
            relation, heads = extractRelation(arcs, words)
            for word, label, head in zip(words, relation, heads):
                if word in keywords and label in relation_dic:
                    relation_ls.append([word, head])
    finally:
        parser.release()
    return relation_ls


读取字典

In [87]:
# Load the lexicon: one entry per line, surrounding whitespace stripped.
# Blank lines are skipped rather than treated as end-of-file, so an empty
# line in the middle (or at the top) of the file no longer truncates the
# lexicon or adds an empty entry. `with` closes the file even on error.
with open('./lexicon.txt', 'r', encoding='utf-8') as fp:
    dic_list = [entry for entry in (line.strip() for line in fp) if entry]

In [255]:
# data_ls = []
# tags_ls = []
# for text in text_list:
#     match(text,dic_list,data_ls,tags_ls)
# # for data in data_ls:
# #     print(data)
# tags_ls = remove_duplicates(tags_ls)
# for t in tags_ls:
#     print(t)

    
# Build one (name, operations) record per Word document in the solution
# folder and append the result to the CSV at `csvPath`.
filePath_ls = []
getAllFilePath(filePath_ls)

# Only real .docx documents can be opened by python-docx; skip anything else
# (including the "~$..." lock files Word leaves next to open documents).
docx_names = [
    file_name for file_name in filePath_ls
    if file_name.lower().endswith('.docx') and not file_name.startswith('~$')
]

records = []
for idx, file_name in enumerate(docx_names):
    path = os.path.join(filePath, file_name)
    text_list = readDocx(path)
    data_ls = []
    tags_ls = []
    name = file_name.rsplit('.',1)[0]
    for text in text_list:
        match(text,dic_list,data_ls,tags_ls)
    # `tags_ls` stays a notebook-level name because extract() reads it.
    tags_ls = remove_duplicates(tags_ls)
    relation_ls = extract(data_ls)

    records.append({'name': name, 'operations': relation_ls})
    print(idx,'successfully!')

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(records, columns=['name','operations'])
# Still appending, but only write the header when the file is new or empty so
# repeated runs do not inject header rows into the middle of the data.
write_header = not os.path.exists(csvPath) or os.path.getsize(csvPath) == 0
df.to_csv(csvPath,mode='a',index=False,header=write_header)
    
        

