# 鉴定性别

In [1]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
from nltk.classify.naivebayes import NaiveBayesClassifier 
from nltk.classify import accuracy

def gender_features(word):
    '''特征提取器'''
    return {'last_letter': word[-1]}

names = ([(name,'male') for name in names.words('male.txt')] +
        [(name,'female') for name in names.words('female.txt')])

random.shuffle(names)

train_set = apply_features(gender_features,names[500:])
test_set = apply_features(gender_features,names[:500])

classifier = NaiveBayesClassifier.train(train_set) #训练朴素贝叶斯分类器

print (classifier.classify(gender_features('Neo'))) #获取单个姓名预测值
print (accuracy(classifier, test_set)) #获取准确率
print (classifier.show_most_informative_features(5)) #查看最有效的5个特征-似然比

male
0.736
Most Informative Features
             last_letter = 'a'            female : male   =     35.3 : 1.0
             last_letter = 'k'              male : female =     32.1 : 1.0
             last_letter = 'f'              male : female =     14.0 : 1.0
             last_letter = 'p'              male : female =     11.3 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0
None


# 选择正确的特征

In [2]:
def gender_features2(word):
    return {'last_letter1': word[-1:],
           'last_letter2': word[-2:]}

train_set2 = apply_features(gender_features2,names[500:])
test_set2 = apply_features(gender_features2,names[:500])

classifier2 = NaiveBayesClassifier.train(train_set2) #训练朴素贝叶斯分类器

print (accuracy(classifier2, test_set2)) #获取准确率

0.772


# 文档分类

In [3]:
#电影评论正负面分类任务
from nltk.corpus import movie_reviews
import random
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]
def document_features(document):
    '''特征提取器，用于提取前2000个高频词汇'''
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set3, test_set3 = featuresets[100:], featuresets[:100]
classifier3 = NaiveBayesClassifier.train(train_set3)
print (accuracy(classifier3, test_set3))

0.82


In [4]:
#基于后缀训练词性分类器
from nltk.corpus import brown
from nltk.classify.decisiontree import DecisionTreeClassifier
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1
    suffix_fdist[word[-2:]] +=1
    suffix_fdist[word[-3:]] +=1
common_suffixes = list(suffix_fdist.keys())[:100]

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
featuresets2 = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets2) * 0.02)
size2 = int(len(featuresets2) * 0.1)
train_set4, test_set4 = featuresets2[size:size2], featuresets2[:size]

In [5]:
classifier4 = DecisionTreeClassifier.train(train_set4)
print(accuracy(classifier4, test_set4))
print(classifier4.pseudocode(depth=4))#伪代码解释器

0.5952262555942317
if endswith(the) == False: 
  if endswith(s) == False: 
    if endswith(,) == False: 
      if endswith(.) == False: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(,) == True: return ','
  if endswith(s) == True: 
    if endswith(was) == False: 
      if endswith(as) == False: return 'NNS'
      if endswith(as) == True: return 'HVZ'
    if endswith(was) == True: return 'BEDZ'
if endswith(the) == True: return 'AT'



# 探索上下文语境

In [6]:
def pos_features2(sentence,i):
    '''提取句子中指定位置词的最后3位的后缀，以及它的前一个词'''
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

print(pos_features2(brown.sents()[0], 1))
tagged_sents = brown.tagged_sents(categories='news')
featuresets3 = []

for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word,tag) in enumerate(tagged_sent):
        featuresets3.append((pos_features2(untagged_sent,i),tag))

size1 = int(len(featuresets3) * 0.02)
size3 = int(len(featuresets3) * 0.1)
train_set5, test_set5 = featuresets3[size1:size3], featuresets3[:size1]
classifier5 = NaiveBayesClassifier.train(train_set5)
print (accuracy(classifier5, test_set5))#利用上下文特征提高了我们的词性标注器的性能

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}
0.7603182496270512


# 序列分类

一种序列分类器策略，称为连续分类或贪婪序列分类，是为第一个输入找到最有可能的 类标签，然后使用这个问题的答案帮助找到下一个输入的最佳的标签。这个过程可以不断重 复直到所有的输入都被贴上标签。

另一种方案是为词性标记所有可能的序列打分，选择总得分最高的序列。隐马尔可夫模 型就采取这种方法。隐马尔可夫模型类似于连续分类器，它不光看输入也看已预测标记的历 史。然而，不是简单地找出一个给定的词的单个最好的标签，而是为标记产生一个概率分布。 然后将这些概率结合起来计算标记序列的概率得分，最高概率的标记序列会被选中。

In [7]:
def pos_features3(sentence, i, history):
    '''特征提取'''
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features3(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features3(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

tagged_sents = brown.tagged_sents(categories='news')
size4 = int(len(tagged_sents) * 0.02)
size5 = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size4:size5], tagged_sents[:size4]
tagger = ConsecutivePosTagger(train_sents)
print (tagger.evaluate(test_sents))

0.7765858208955224


# 句子分割

In [8]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}
featuresets4 = [(punct_features(tokens, i), (i in boundaries))
                for i in range(1, len(tokens)-1)
                if tokens[i] in '.?!']
size = int(len(featuresets4) * 0.1)
train_set6, test_set6 = featuresets4[size:], featuresets4[:size]
classifier6 = NaiveBayesClassifier.train(train_set6)
print (accuracy(classifier6, test_set6))

def segment_sentences(words):
    '''基于分类的断句器'''
    start = 0
    sents = []
    for i, word in words:
        if word in '.?!' and classifier.classify(words, i) == True:
            sents.append(words[start: i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])

0.936026936026936


# 识别对话行为类型

In [9]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def dialogue_act_features(post):
    '''词袋模型之特征提取器'''
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features
featuresets5 = [(dialogue_act_features(post.text), post.get('class'))
                for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets5[size:], featuresets5[:size]
classifier7 = NaiveBayesClassifier.train(train_set)
print (accuracy(classifier7,test_set))

0.715


# 识别文字蕴含

rte_classify.RTEFeatureExtractor RTE语料库的简单分类器。

它计算文本和假设之间单词和命名实体的重叠，以及假设中是否有单词/命名实体未能在文本中出现，因为这是一个指标，说明假设比文本（即不受文本约束）更具信息性。

In [20]:
def rte_features(rtepair):
    extractor = nltk.classify.rte_classify.RTEFeatureExtractor(rtepair)
    features={}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.classify.rte_classify.RTEFeatureExtractor(rtepair)
print(extractor.text_words)
print(extractor.hyp_words)
print (extractor.overlap('word'))
print (extractor.overlap('ne'))
print (extractor.hyp_extra('word'))

{'fledgling', 'terrorism.', 'Shanghai', 'former', 'Organisation', 'meeting', 'Davudi', 'association', 'at', 'was', 'central', 'that', 'Asia', 'four', 'operation', 'fight', 'Parviz', 'Soviet', 'China', 'representing', 'Russia', 'republics', 'SCO', 'binds', 'Iran', 'together', 'Co'}
{'member', 'SCO.', 'China'}
set()
{'China'}
{'member'}
