In [1]:
import sys
from importlib import reload
# Keep a handle on the current stdout before reloading `sys`, then put it back
# afterwards so later print() calls write to the same stream as before.
# NOTE(review): reload(sys) looks like a leftover Python 2 encoding workaround;
# confirm whether this cell is still needed on Python 3.
stdout = sys.stdout
reload(sys)
sys.stdout = stdout

# 有监督分类的更多例子

## 句子分割
句子分割可以看作是一个标点符号的分类任务：每当我们遇到一个可能会结束一个句子的符号，如句号或问号，我们必须决定它是否终止了当前句子。

In [2]:
import nltk

# Flatten the raw Treebank sentences into one token list and record, for each
# sentence, the index of its final token within that flat list.
sents = nltk.corpus.treebank_raw.sents()
tokens = []  # Every token of every sentence, in corpus order.
boundaries = set()  # Indices into `tokens` of each sentence's last token.
offset = 0  # Number of tokens consumed so far.
for sent in sents:  # Reuse the sentences loaded above instead of reading the corpus twice.
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset - 1)

print(tokens[:10])
print(sorted(boundaries)[:10])  # sorted() accepts any iterable; no list() copy needed.
print(offset)

['.', 'START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will']
[1, 20, 36, 38, 64, 66, 102, 134, 163, 199]
101797


我们为句子中的每个标点符号提供如下数据特征：
1. next-word-capitalized：下一个单词是否首字母大写
2. prevword：上一个单词的小写形式
3. punct：当前标点符号
4. prev-word-is-one-char：上一个单词是否只有一个字符

In [3]:
def punct_features(tokens, i):
    """Build the feature dict describing the punctuation token at ``tokens[i]``.

    Args:
        tokens: Sequence of token strings.
        i: Index of the punctuation token within ``tokens``.

    Returns:
        A dict with four entries:

        * ``'next-word-capitalized'``: whether the following token starts with
          an uppercase letter (``False`` when there is no following token).
        * ``'prevword'``: the preceding token, lowercased (``''`` when there is
          no preceding token).
        * ``'punct'``: the token at position ``i`` itself.
        * ``'prev-word-is-one-char'``: whether the preceding token has length 1.
    """
    # Without these guards, i == len(tokens) - 1 raises IndexError and i == 0
    # silently wraps around to the last token through tokens[-1].
    next_word = tokens[i + 1] if i + 1 < len(tokens) else ''
    prev_word = tokens[i - 1] if i > 0 else ''
    return {
        # Slicing instead of indexing keeps an empty token from raising.
        'next-word-capitalized': next_word[:1].isupper(),
        'prevword': prev_word.lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(prev_word) == 1,
    }

基于这一特征提取器，我们可以通过选择所有的标点符号创建一个加标签的特征集链表，然后标注它们是否是边界标识符，并利用特征集训练和评估一个标点符号分类器：

In [4]:
# Label every punctuation candidate with whether its index is a known sentence
# boundary. The first and last tokens are excluded so that each candidate has
# both a previous and a next token.
featuresets = [
    (punct_features(tokens, pos), pos in boundaries)
    for pos, token in enumerate(tokens[1:-1], start=1)
    if token in '.?!'
]
print(featuresets[0])

# Hold out the first 10% of the examples for testing; train on the remainder.
size = int(0.1 * len(featuresets))
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

({'next-word-capitalized': False, 'prevword': 'nov', 'punct': '.', 'prev-word-is-one-char': False}, False)
0.936026936026936


使用这种分类器进行断句，我们只需要检查每个标点符号，看它是否是作为一个边界标识符，在边界标识符处分割词链表。

In [5]:
def segment_sentences(words):
    """Split a flat token list into sentences using the trained classifier.

    Each token that passes the ``in '.?!'`` candidate test (the same test used
    to select the training examples) is run through the notebook-level
    ``classifier`` with ``punct_features``; tokens classified as boundaries
    close the current sentence.

    Args:
        words: List of token strings.

    Returns:
        A list of token lists, one per detected sentence. Tokens remaining
        after the last detected boundary form a final sentence.
    """
    start = 0
    sents = []
    last = len(words) - 1
    for i, word in enumerate(words):
        # The final token has no successor, so building its features would
        # index past the end of `words`. Skipping it is safe: the tail flush
        # below closes the last sentence whichever way it would be classified.
        if i == last or word not in '.?!':
            continue
        # The training labels are booleans, so plain truthiness replaces `== True`.
        if classifier.classify(punct_features(words, i)):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents

# Segment only the first 1000 tokens and print how many sentences were found.
# Note: this rebinds `sents`, which an earlier cell assigned to the corpus sentences.
words = tokens[:1000]
sents = segment_sentences(words)
print(len(sents))

34


## 识别对话行为类型
识别对话中每句话语背后的对话行为，如问候、陈述、情感、说明等，是理解谈话的重要第一步。这里我们使用 NPS 聊天语料库（包含超过 10000 个来自即时消息会话的帖子以及对应的行为类型标签），建立一个分类器来识别新的即时消息帖子的对话行为类型。

In [6]:
def dialogue_act_features(post):
    """Return word-presence features for the text of one post.

    Args:
        post: The post text, tokenized with ``nltk.word_tokenize``.

    Returns:
        A dict mapping ``'contains(<lowercased word>)'`` to ``True`` for every
        distinct lowercased token in the post.
    """
    lowered = (word.lower() for word in nltk.word_tokenize(post))
    return {'contains(%s)' % word: True for word in lowered}

# Pair each post's word-presence features with its 'class' attribute as the label.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
featuresets = [
    (dialogue_act_features(entry.text), entry.get('class'))
    for entry in posts
]
print(featuresets[:1])

# Hold out the first 10% of the examples for testing; train on the remainder.
size = int(0.1 * len(featuresets))
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

[({'contains(now)': True, 'contains(im)': True, 'contains(left)': True, 'contains(with)': True, 'contains(this)': True, 'contains(gay)': True, 'contains(name)': True}, 'Statement')]
0.668


## 识别文字蕴含
识别文字蕴含是判断文本 T 的一个给定片段是否蕴含着另一个叫做“假设 H”的文本，示例如下：

> **Challenge 3, Pair 34 (True)**  
**T:** Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation
Organisation (SCO), the fledgling association that binds Russia, China and four
former Soviet republics of central Asia together to fight terrorism.  
**H:** China is a member of SCO.

我们可以把识别文字蕴含当作一个分类任务，尝试为每一对文本预测真/假标签。比较理想的假设是：如果存在蕴含关系，那么假设所表示的所有信息也应该出现在文本中；相反，如果假设中的某些信息在文本中找不到，那么就不存在蕴含关系。  
nltk 内置的 RTEFeatureExtractor 类可以帮助我们分析除去部分停用词后的文本和假设中的词汇，并计算它们之间的重叠和差异。

In [7]:
# Index 33 is the 34th pair in rte3_dev.xml; its words match the "Pair 34"
# example quoted in the markdown cell before this one.
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)
print(extractor.hyp_words)
# In the recorded output 'China' appears under overlap('ne') but not under
# overlap('word'), so the two sets look disjoint rather than nested.
# NOTE(review): confirm against the NLTK RTEFeatureExtractor source.
print(extractor.overlap('word'))  # Words shared by text and hypothesis, apparently without named-entity-like tokens
print(extractor.overlap('ne'))  # Shared words that look like named entities (all-caps or title-cased)
print(extractor.hyp_extra('word'))  # Extraneous material in the hypothesis

{'Co', 'terrorism.', 'at', 'fight', 'Iran', 'that', 'meeting', 'Shanghai', 'Organisation', 'association', 'was', 'together', 'Russia', 'Asia', 'four', 'Soviet', 'China', 'fledgling', 'republics', 'Davudi', 'central', 'representing', 'operation', 'Parviz', 'SCO', 'binds', 'former'}
{'member', 'China', 'SCO.'}
set()
{'China'}
{'member'}


使用文本和假设间的重叠和差异作为特征，我们可以训练和评估如下文字蕴含识别器：

In [8]:
def rte_features(rtepair):
    """Compute overlap and hypothesis-extra counts for one RTE pair.

    RTE stands for Recognizing Textual Entailment.

    Args:
        rtepair: A text/hypothesis pair accepted by ``nltk.RTEFeatureExtractor``.

    Returns:
        A dict with the sizes of the extractor's ``overlap`` and ``hyp_extra``
        sets for the ``'word'`` and ``'ne'`` token types.
    """
    extractor = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(extractor.overlap('word')),
        'word_hyp_extra': len(extractor.hyp_extra('word')),
        'ne_overlap': len(extractor.overlap('ne')),
        'ne_hyp_extra': len(extractor.hyp_extra('ne')),
    }

# Turn every pair in rte3_dev.xml into features, labelled with the pair's `value`.
rtepairs = nltk.corpus.rte.pairs(['rte3_dev.xml'])
featuresets = [
    (rte_features(pair), pair.value)
    for pair in rtepairs
]
print(featuresets[:1])

# Hold out the first 10% of the examples for testing; train on the remainder.
size = int(0.1 * len(featuresets))
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

[({'word_overlap': 2, 'word_hyp_extra': 0, 'ne_overlap': 1, 'ne_hyp_extra': 1}, 1)]
0.4625
