# 鉴定性别

In [31]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
from nltk.classify.naivebayes import NaiveBayesClassifier 
from nltk.classify import accuracy

def gender_features(word):
    '''特征提取器'''
    return {'last_letter': word[-1]}

names = ([(name,'male') for name in names.words('male.txt')] +
        [(name,'female') for name in names.words('female.txt')])

random.shuffle(names)

train_set = apply_features(gender_features,names[500:])
test_set = apply_features(gender_features,names[:500])

classifier = NaiveBayesClassifier.train(train_set) #训练朴素贝叶斯分类器

print (classifier.classify(gender_features('Neo'))) #获取单个姓名预测值
print (accuracy(classifier, test_set)) #获取准确率
print (classifier.show_most_informative_features(5)) #查看最有效的5个特征-似然比

male
0.75
Most Informative Features
             last_letter = 'a'            female : male   =     36.9 : 1.0
             last_letter = 'k'              male : female =     31.4 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'd'              male : female =     10.0 : 1.0
None


# 选择正确的特征

In [32]:
def gender_features2(word):
    return {'last_letter1': word[-1:],
           'last_letter2': word[-2:]}

train_set2 = apply_features(gender_features2,names[500:])
test_set2 = apply_features(gender_features2,names[:500])

classifier2 = NaiveBayesClassifier.train(train_set2) #训练朴素贝叶斯分类器

print (accuracy(classifier2, test_set2)) #获取准确率

0.79


# 文档分类

In [35]:
#电影评论正负面分类任务
from nltk.corpus import movie_reviews
import random
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]
def document_features(document):
    '''特征提取器，用于提取前2000个高频词汇'''
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set3, test_set3 = featuresets[100:], featuresets[:100]
classifier3 = NaiveBayesClassifier.train(train_set3)
print (accuracy(classifier3, test_set3))

In [57]:
#基于后缀训练词性分类器
from nltk.corpus import brown
from nltk.classify.decisiontree import DecisionTreeClassifier
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1
    suffix_fdist[word[-2:]] +=1
    suffix_fdist[word[-3:]] +=1
common_suffixes = list(suffix_fdist.keys())[:100]

def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

tagged_words = brown.tagged_words(categories='news')
featuresets2 = [(pos_features(n), g) for (n,g) in tagged_words]
size = int(len(featuresets2) * 0.02)
size2 = int(len(featuresets2) * 0.1)
train_set4, test_set4 = featuresets2[size:size2], featuresets2[:size]

In [59]:
classifier4 = DecisionTreeClassifier.train(train_set4)
print(accuracy(classifier4, test_set4))
print(classifier4.pseudocode(depth=4))#伪代码解释器

0.5952262555942317
if endswith(the) == False: 
  if endswith(s) == False: 
    if endswith(,) == False: 
      if endswith(.) == False: return 'IN'
      if endswith(.) == True: return '.'
    if endswith(,) == True: return ','
  if endswith(s) == True: 
    if endswith(was) == False: 
      if endswith(as) == False: return 'NNS'
      if endswith(as) == True: return 'HVZ'
    if endswith(was) == True: return 'BEDZ'
if endswith(the) == True: return 'AT'



In [None]:
探索上下文语境