In [None]:
conda create -n dingliang python=3.6

conda deactivate

# LTP

## 安装

```
$ git clone https://github.com/HIT-SCIR/pyltp
$ cd pyltp
$ git submodule init
$ git submodule update


// For Windows, edit these two files before building:
// 1. D:\works\NLP\kaikeba\tools\pyltp\ltp\src\srl\include\extractor\Converter.h: remove the Chinese text
// 2. patch/libs/python/src/converter/builtin_converters.cpp: apply the diff below
diff --git a/patch/libs/python/src/converter/builtin_converters.cpp b/patch/libs/python/src/converter/builtin_converters.cpp
index 78e55fd..9873fe2 100644
--- a/patch/libs/python/src/converter/builtin_converters.cpp
+++ b/patch/libs/python/src/converter/builtin_converters.cpp
@@ -48,7 +48,7 @@ namespace
 #else
   void* convert_to_cstring(PyObject* obj)
   {
-      return PyUnicode_Check(obj) ? _PyUnicode_AsString(obj) : 0;
+      return PyUnicode_Check(obj) ? (void *)_PyUnicode_AsString(obj) : 0;
   }
 #endif

$ python setup.py install

```

## Step1 cut sentence

In [1]:
# Split the raw document into sentences with pyltp's SentenceSplitter.
from pyltp import SentenceSplitter
document=u'昨日，雷先生说，交警部门罚了他16次，他只认了一次，交了一次罚款，拿到法院的判决书后，会前往交警队，要求撤销此前的处罚。'
sentences = SentenceSplitter.split(document)
# NOTE: printing the result shows only the VectorOfString object's repr, not
# the sentences themselves; index or iterate `sentences` to read them.
print(sentences)

<pyltp.VectorOfString object at 0x000001303DB8EAB0>


## Step2 cut word

In [2]:
import os
LTP_DATA_DIR = r'../../tools/ltp_data_v3.4.0/'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model file, named `cws.model`

from pyltp import Segmentor
segmentor = Segmentor()  # create the segmenter instance
segmentor.load(cws_model_path)  # load the model
words = segmentor.segment(sentences[0])  # segment the first sentence into words
print('\t'.join(words))
segmentor.release()  # release the model

昨日	，	雷	先生	说	，	交警	部门	罚	了	他	16	次	，	他	只	认	了	一	次	，	交	了	一	次	罚款	，	拿	到	法院	的	判决书	后	，	会	前往	交警队	，	要求	撤销	此前	的	处罚	。


## Step3 Part-of-Speech Tagging

In [3]:
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # part-of-speech model file, named `pos.model`

from pyltp import Postagger
postagger = Postagger() # create the tagger instance
postagger.load(pos_model_path)  # load the model

postags = postagger.postag(words)  # tag each segmented word with its part of speech

print('\t'.join(postags))
postagger.release()  # release the model

nt	wp	nh	n	v	wp	j	n	v	u	r	m	q	wp	r	d	v	u	m	q	wp	v	u	m	q	v	wp	v	v	n	u	n	nd	wp	v	v	n	wp	v	v	nt	u	v	wp


## Step4 Named Entity Recognition

In [4]:
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named-entity recognition model file, named `ner.model`

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer() # create the recognizer instance
recognizer.load(ner_model_path)  # load the model

netags = recognizer.recognize(words, postags)  # named-entity tags, computed from the words and their POS tags

print('\t'.join(netags))
recognizer.release()  # release the model

O	O	S-Nh	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O


## Step5 Dependency Parsing

In [5]:
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency-parsing model file, named `parser.model`

from pyltp import Parser
parser = Parser() # create the parser instance
parser.load(par_model_path)  # load the model

arcs = parser.parse(words, postags)  # dependency parse, computed from the words and their POS tags

# Show every arc as "head:relation".
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
parser.release()  # release the model

5:ADV	1:WP	4:ATT	5:SBV	0:HED	5:WP	8:ATT	9:SBV	5:VOB	9:RAD	9:IOB	13:ATT	9:VOB	9:WP	17:SBV	17:ADV	9:COO	17:RAD	20:ATT	17:CMP	17:WP	17:COO	22:RAD	25:ATT	26:ATT	22:VOB	22:WP	33:ATT	28:CMP	32:ATT	30:RAD	28:VOB	36:ADV	33:WP	36:ADV	17:COO	36:VOB	17:WP	17:COO	39:VOB	43:ATT	41:RAD	40:VOB	5:WP


In [6]:
def build_parse_child_dict(words, postags, arcs):
    """
    Build, for every word of a sentence, a dict of its dependency children.

    Args:
        words: segmented words of the sentence.
        postags: part-of-speech tags (not used here; kept so existing callers
            keep working).
        arcs: dependency arcs; each arc exposes `head` (1-based index of the
            governing word) and `relation` (the dependency label).

    Returns:
        A list with one dict per word. Each dict maps a relation label to the
        ascending list of arc indices whose `head` points at that word. Arcs
        whose head lies outside 1..len(words) (e.g. head 0) match no word.
    """
    word_count = len(words)
    child_dict_list = [dict() for _ in range(word_count)]
    # One pass over the arcs instead of rescanning every arc for every word;
    # arcs are visited in ascending order, so each list stays sorted.
    for arc_index, arc in enumerate(arcs):
        head = arc.head
        if 1 <= head <= word_count:
            child_dict_list[head - 1].setdefault(arc.relation, []).append(arc_index)
    return child_dict_list

In [7]:
# Build the per-word child dictionaries from the words, tags and arcs computed above.
child_dict_list = build_parse_child_dict(words, postags, arcs)

In [8]:
# Show the child dictionaries: one dict per word, mapping relation label to arc indices.
print(child_dict_list)

[{'WP': [1]}, {}, {}, {'ATT': [2]}, {'ADV': [0], 'SBV': [3], 'WP': [5, 43], 'VOB': [8]}, {}, {}, {'ATT': [6]}, {'SBV': [7], 'RAD': [9], 'IOB': [10], 'VOB': [12], 'WP': [13], 'COO': [16]}, {}, {}, {}, {'ATT': [11]}, {}, {}, {}, {'SBV': [14], 'ADV': [15], 'RAD': [17], 'CMP': [19], 'WP': [20, 37], 'COO': [21, 35, 38]}, {}, {}, {'ATT': [18]}, {}, {'RAD': [22], 'VOB': [25], 'WP': [26]}, {}, {}, {'ATT': [23]}, {'ATT': [24]}, {}, {'CMP': [28], 'VOB': [31]}, {}, {'RAD': [30]}, {}, {'ATT': [29]}, {'ATT': [27], 'WP': [33]}, {}, {}, {'ADV': [32, 34], 'VOB': [36]}, {}, {}, {'VOB': [39]}, {'VOB': [42]}, {'RAD': [41]}, {}, {'ATT': [40]}, {}]


In [9]:
def complete_e(words, postags, child_dict_list, word_index):
    """
    Expand the word at `word_index` into a fuller phrase using its dependents.

    Every ATT (attributive) child is expanded recursively and placed before
    the word. When the word is tagged 'v', its first VOB child is appended
    after it and its first SBV child is placed in front of everything else.

    Args:
        words: segmented words.
        postags: part-of-speech tags, parallel to `words`.
        child_dict_list: per-word child dictionaries (relation -> indices).
        word_index: index of the word to expand.

    Returns:
        The concatenated phrase as a string.
    """
    children = child_dict_list[word_index]

    # Attributive modifiers, each expanded recursively, go in front.
    head_part = ''.join(
        complete_e(words, postags, child_dict_list, att_index)
        for att_index in children.get('ATT', [])
    )

    tail_part = ''
    if postags[word_index] == 'v':
        # Verbs also pull in their object (after) and subject (before).
        if 'VOB' in children:
            tail_part = complete_e(words, postags, child_dict_list, children['VOB'][0])
        if 'SBV' in children:
            subject = complete_e(words, postags, child_dict_list, children['SBV'][0])
            head_part = subject + head_part

    return head_part + words[word_index] + tail_part

In [10]:
def fact_triple_extract(words, postags, arcs, netags=None, sentence=None):
    """
    Print the fact triples that can be read off one parsed sentence.

    Args:
        words: segmented words.
        postags: part-of-speech tags, parallel to `words`.
        arcs: dependency arcs, parallel to `words`; each arc has `head`
            (1-based) and `relation`.
        netags: named-entity tags, parallel to `words`. When omitted, a
            module-level `netags` is used if one exists (this is what the
            function read before the parameter was added); with neither, the
            named-entity pass is skipped.
        sentence: text used for the final containment check of entity
            triples. Defaults to the words joined together.

    Returns:
        None. Each triple is printed as "(e1, r, e2)" after a label naming
        the pattern that produced it.
    """
    if netags is None:
        # Keep existing three-argument calls working: fall back to the
        # module-level `netags` that earlier versions relied on implicitly.
        netags = globals().get('netags')
    if sentence is None:
        # Previously an undefined name `sentence` was read here, which raised
        # NameError whenever the check below was reached.
        sentence = ''.join(words)

    def _entity_end(start):
        # Index of the first tag starting with 'E' at or after `start`;
        # stops at the last tag instead of running off the end.
        end = start
        last = len(netags) - 1
        while end < last and netags[end][0] != 'E':
            end += 1
        return end

    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Triples centred on a predicate (a word tagged 'v').
        if postags[index] == 'v':
            child_dict = child_dict_list[index]
            # Subject - verb - object.
            if ('SBV' in child_dict) and ('VOB' in child_dict):
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                r = words[index]
                e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                print("主语谓语宾语关系\t({}, {}, {})\n".format(e1, r, e2))

            # Verb used as an attributive (ATT) that also has an object.
            if arcs[index].relation == 'ATT':
                if 'VOB' in child_dict:
                    e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    temp_string = r+e2
                    # Strip the verb+object text from the front of e1 so it is not repeated.
                    if temp_string == e1[:len(temp_string)]:
                        e1 = e1[len(temp_string):]
                    if temp_string not in e1:
                        print("定语后置动宾关系\t({}, {}, {})\n".format(e1, r, e2))

            # Subject - verb + complement, where the complement has a POB child.
            if ('SBV' in child_dict) and ('CMP' in child_dict):
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    e2 = complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                    print("介宾关系主谓动补\t({}, {}, {})\n".format(e1, r, e2))

        # Triples anchored on a named entity (tag starting with 'S' or 'B').
        if netags is None:
            continue
        if netags[index][0] == 'S' or netags[index][0] == 'B':
            ni = index
            if netags[ni][0] == 'B':
                # Multi-word entity: join the words from the 'B' tag through the 'E' tag.
                ni = _entity_end(ni)
                e1 = ''.join(words[index:ni+1])
            else:
                e1 = words[ni]
            if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O':
                r = complete_e(words, postags, child_dict_list, arcs[ni].head-1)
                # Keep only the part of the relation phrase that follows the entity.
                if e1 in r:
                    r = r[(r.index(e1)+len(e1)):]
                if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O':
                    e2 = complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1)
                    mi = arcs[arcs[ni].head-1].head-1
                    li = mi
                    if netags[mi][0] == 'B':
                        mi = _entity_end(mi)
                        e = ''.join(words[li+1:mi+1])
                        e2 += e
                    if r in e2:
                        e2 = e2[(e2.index(r)+len(r)):]
                    if r+e2 in sentence:
                        print("人名//地名//机构\t({}, {}, {})\n".format(e1, r, e2))


In [11]:
# Run triple extraction on the parsed sentence; the triples are printed, not returned.
fact_triple_extract(words, postags, arcs)

主语谓语宾语关系	(雷先生, 说, 交警部门罚16次)

主语谓语宾语关系	(交警部门, 罚, 16次)

定语后置动宾关系	(后, 拿, 法院判决书)



## Practice

In [14]:
def extract_opinion(words, postags, arcs):
    """
    Print a (subject, predicate, statement) triple for one parsed sentence.

    The predicate is the word whose arc is labelled HED. A triple is printed
    only when that word has both an SBV and a VOB child; the statement is
    every word after the predicate, joined together.

    Args:
        words: segmented words.
        postags: part-of-speech tags, parallel to `words`.
        arcs: dependency arcs, parallel to `words`.

    Returns:
        None. The triple, if any, is printed.
    """
    child_dict_list = build_parse_child_dict(words, postags, arcs)

    # Position of the first arc labelled HED.
    index = next((i for i, arc in enumerate(arcs) if arc.relation == 'HED'), None)
    if index is None:
        # No HED arc (e.g. empty input): nothing to extract. Previously the
        # lookup below indexed past the end and raised IndexError.
        return

    # Intended for predicates that are "say"-type words; the word itself is not checked here.
    child_dict = child_dict_list[index]
    if ('SBV' in child_dict) and ('VOB' in child_dict):
        e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
        r = words[index]
        e2 = ''.join(words[index+1:])
        print("主语谓语宾语关系\t({}, {}, {})\n".format(e1, r, e2))

In [15]:
# Print the opinion triple for the parsed sentence.
extract_opinion(words, postags, arcs)

主语谓语宾语关系	(雷先生, 说, ，交警部门罚了他16次，他只认了一次，交了一次罚款，拿到法院的判决书后，会前往交警队，要求撤销此前的处罚。)



In [16]:
import re
def token(string):
    """
    Return the runs of word characters found in `string`.

    Only letters, digits and underscores (regex `\\w`) are kept; punctuation
    and whitespace act as separators. The earlier pattern `[\\d|\\w]+` also
    kept literal `|` characters, because `|` inside a character class is not
    an alternation operator.

    Args:
        string: text to tokenize.

    Returns:
        A list of matched substrings, in order of appearance.
    """
    return re.findall(r'\w+', string)

In [3]:
import pandas as pd
def get_documents(csv_path='../lesson05/sqlResult_1558435.csv', encoding='gb18030'):
    """
    Load the `content` column of a CSV file as a list of strings.

    Missing values are replaced with '' and every '\\r\\n' sequence inside a
    content string is removed.

    Args:
        csv_path: path of the CSV file; defaults to the file this notebook
            was written against.
        encoding: text encoding of the CSV file.

    Returns:
        A list with one entry per row of the `content` column.
    """
    database = pd.read_csv(csv_path, encoding=encoding)
    database = database.fillna('')
    contents = database['content'].tolist()
    return [text.replace('\r\n', '') for text in contents]

In [4]:
# Load the article bodies and show the first one.
documents = get_documents()
documents[0]

'此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/体验版内测，稳定版暂不受影响），以确保工程师可以集中全部精力进行系统优化工作。有人猜测这也是将精力主要用到MIUI 9的研发之中。MIUI 8去年5月发布，距今已有一年有余，也是时候更新换代了。当然，关于MIUI 9的确切信息，我们还是等待官方消息。'

In [5]:
import os
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from pyltp import Parser

def build_parse_child_dict(words, postags, arcs):
    """
    Build, for every word of a sentence, a dict of its dependency children.

    Args:
        words: segmented words of the sentence.
        postags: part-of-speech tags (not used here; kept so existing callers
            keep working).
        arcs: dependency arcs; each arc exposes `head` (1-based index of the
            governing word) and `relation` (the dependency label).

    Returns:
        A list with one dict per word. Each dict maps a relation label to the
        ascending list of arc indices whose `head` points at that word. Arcs
        whose head lies outside 1..len(words) (e.g. head 0) match no word.
    """
    word_count = len(words)
    child_dict_list = [dict() for _ in range(word_count)]
    # One pass over the arcs instead of rescanning every arc for every word;
    # arcs are visited in ascending order, so each list stays sorted.
    for arc_index, arc in enumerate(arcs):
        head = arc.head
        if 1 <= head <= word_count:
            child_dict_list[head - 1].setdefault(arc.relation, []).append(arc_index)
    return child_dict_list

def complete_e(words, postags, child_dict_list, word_index):
    """
    Expand the word at `word_index` into a fuller phrase using its dependents.

    ATT children are expanded recursively and placed before the word. For a
    word tagged 'v', the first VOB child follows the word and the first SBV
    child is put at the very front.

    Args:
        words: segmented words.
        postags: part-of-speech tags, parallel to `words`.
        child_dict_list: per-word child dictionaries (relation -> indices).
        word_index: index of the word to expand.

    Returns:
        The concatenated phrase as a string.
    """
    def expand(child_index):
        return complete_e(words, postags, child_dict_list, child_index)

    relations = child_dict_list[word_index]

    # Collect the phrase pieces in output order, then join once at the end.
    pieces = [expand(att) for att in relations['ATT']] if 'ATT' in relations else []
    pieces.append(words[word_index])

    if postags[word_index] == 'v':
        if 'VOB' in relations:
            pieces.append(expand(relations['VOB'][0]))
        if 'SBV' in relations:
            # The subject precedes even the attributive modifiers.
            pieces.insert(0, expand(relations['SBV'][0]))

    return ''.join(pieces)

def extract_opinion(document, ltp_data_dir=r'../../tools/ltp_data_v3.4.0/'):
    """
    Extract (subject, predicate, statement) triples from every sentence of a document.

    For each sentence the words, POS tags, named-entity tags and dependency
    arcs are computed and printed. The predicate is the word whose arc is
    labelled HED; a triple is recorded only when that word has both an SBV
    and a VOB child, with the statement being every word after the predicate.

    Args:
        document: text to analyse.
        ltp_data_dir: directory holding `cws.model`, `pos.model`, `ner.model`
            and `parser.model`; defaults to the path used by this notebook.

    Returns:
        A list of (e1, r, e2) string tuples, one per sentence that matched.
    """
    models = [
        (Segmentor(), 'cws.model'),             # word segmentation
        (Postagger(), 'pos.model'),             # part-of-speech tagging
        (NamedEntityRecognizer(), 'ner.model'), # named-entity recognition
        (Parser(), 'parser.model'),             # dependency parsing
    ]
    loaded = []
    table = []
    try:
        for model, filename in models:
            model.load(os.path.join(ltp_data_dir, filename))
            loaded.append(model)
        segmentor, postagger, recognizer, parser = loaded

        # Split into sentences and analyse each one.
        for sentence in SentenceSplitter.split(document):
            print(sentence)
            # Word segmentation.
            words = segmentor.segment(sentence)
            print('\t'.join(words))
            # Part-of-speech tags.
            postags = postagger.postag(words)
            print('\t'.join(postags))
            # Named-entity tags (printed only; not used for the triple).
            netags = recognizer.recognize(words, postags)
            print('\t'.join(netags))
            # Dependency parse.
            arcs = parser.parse(words, postags)
            print("\t".join("{}:{}".format(arc.head, arc.relation) for arc in arcs))

            child_dict_list = build_parse_child_dict(words, postags, arcs)

            # Position of the first arc labelled HED.
            index = next((i for i, arc in enumerate(arcs) if arc.relation == 'HED'), None)
            if index is None:
                # No HED arc: skip the sentence. Previously the lookup below
                # indexed past the end and raised IndexError.
                continue

            # Intended for predicates that are "say"-type words; the word itself is not checked here.
            child_dict = child_dict_list[index]
            if ('SBV' in child_dict) and ('VOB' in child_dict):
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                r = words[index]
                e2 = ''.join(words[index+1:])
                print("{} | {} | {}".format(e1,r,e2))
                table.append((e1,r,e2))
    finally:
        # Release every model that was loaded, even when analysis fails midway.
        for model in loaded:
            model.release()

    return table

In [6]:
# Extract opinion triples from the second document; intermediate analysis is printed as it runs.
table = extract_opinion(documents[1])

骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考虑性能而去屏蔽掉小核心。
骁龙	835	作为	唯一	通过	Windows	10	桌面	平台	认证	的	ARM	处理器	，	高通	强调	，	不	会	因为	只	考虑	性能	而	去	屏蔽	掉	小	核心	。
nz	m	p	b	p	ws	m	q	n	v	u	ws	n	wp	nh	v	wp	d	v	c	d	v	n	c	v	v	v	a	n	wp
O	O	O	O	O	O	O	O	O	O	O	O	O	O	S-Nh	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O
2:ATT	16:ADV	16:ADV	16:ADV	16:ADV	9:ATT	8:ATT	9:ATT	10:ATT	13:ATT	10:RAD	13:ATT	5:POB	5:WP	16:SBV	0:HED	16:WP	19:ADV	22:ADV	22:ADV	22:ADV	16:VOB	22:VOB	26:ADV	26:ADV	22:COO	26:CMP	29:ATT	26:VOB	16:WP
高通 | 强调 | ，不会因为只考虑性能而去屏蔽掉小核心。
相反，他们正联手微软，找到一种适合桌面平台的、兼顾性能和功耗的完美方案。
相反	，	他们	正	联手	微软	，	找到	一	种	适合	桌面	平台	的	、	兼顾	性能	和	功耗	的	完美	方案	。
v	wp	r	d	v	ni	wp	v	m	q	v	n	n	u	wp	v	n	c	n	u	a	n	wp
O	O	O	O	O	S-Ni	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O	O
5:ADV	1:WP	5:SBV	5:ADV	0:HED	5:VOB	5:WP	5:COO	10:ATT	22:ATT	22:ATT	13:ATT	11:VOB	11:RAD	16:WP	11:COO	16:VOB	19:LAD	17:COO	11:RAD	22:ATT	8:VOB	5:WP
他们 | 联手 | 微软，找到一种适合桌面平台的、兼顾性能和功耗的完美方案。
报道称，微软已经拿到了一些新的源码，以便Windows 10更好地理解big.little架构。
报道	称	，	微软	已经	拿	到	了	一些	新	的	源码	，	以便	Windows	10	更	好	地	理解	big.littl