In [1]:
import nltk

from nltk.corpus import brown

In [2]:
def gender_features(word):
    return {'last_letter':word[-1]}

gender_features('shark')

{'last_letter': 'k'}

In [2]:
from nltk.corpus import names

In [3]:
import random

In [5]:
names = ([(name , 'male') for name in names.words('male.txt')]+
         [(name , 'female') for name in names.words('female.txt')])

In [6]:
random.shuffle(names)

In [7]:
featuresets = [(gender_features(n) , g) for (n,g) in names]
train_set , test_set = featuresets[500:] , featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
classifier.classify(gender_features('Neo'))

'male'

In [9]:
classifier.classify(gender_features('Alice'))

'female'

In [10]:
nltk.classify.accuracy(classifier , test_set)

0.76

In [11]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'k'              male : female =     43.9 : 1.0
             last_letter = 'a'            female : male   =     37.0 : 1.0
             last_letter = 'v'              male : female =     18.7 : 1.0
             last_letter = 'f'              male : female =     14.6 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0


In [None]:
#=========
from nltk.classify import apply_features

train_set = apply_features(gender_features , names[500:])
test_set = apply_features(gender_features , names[:500])
#=========

In [13]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

train_set = [(gender_features(n),g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
test_set = [(gender_features(n),g) for (n,g) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)


In [14]:
nltk.classify.accuracy(classifier , devtest_set)

0.78

In [15]:
errors = []

for (name , tag) in devtest_names:
    guess = classifier.classify(gender_features(names))
    
    if guess != tag:
        errors.append((tag , guess , name))

In [17]:
len(errors)

337

In [18]:
#可以观察上面的errors
#发现ch结尾通常为男
#yn结尾通常为女
#修改特征提取方法
def gender_features_2(word):
    return {'suffix1':word[-1:],
            'suffix2':word[-2:]}

In [None]:
#每次更换特征后 应该重新分割训练 验证集
#否则会 过拟合验证集

In [19]:
#电影评论的语料库
from nltk.corpus import movie_reviews

In [22]:
movie_reviews.categories()

['neg', 'pos']

In [27]:
movie_reviews.sents(categories='neg')[1]

['they', 'get', 'into', 'an', 'accident', '.']

In [28]:
documents = [(list(movie_reviews.words(fileid)) , category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids()]

random.shuffle(documents)

In [30]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

word_features = list(all_words.keys())[:2000]


#类似使用词典向量来表征一篇文档
def document_features(document):
    document_words = set(document)
    features = {}
    
    for word in word_features:
        features['contains(%s)' % word] = (word in document)
        
    return features

In [31]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [32]:
featuresets = [(document_features(d) , c) for (d,c) in documents]
#使用2000个高频词的出现与否作为一篇评论的特征向量

train_set , test_set = featuresets[100:] , featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [33]:
nltk.classify.accuracy(classifier , test_set)

0.03

In [34]:
classifier.show_most_informative_features(5)

Most Informative Features
contains(differentiates) = True              neg : pos    =      1.7 : 1.0
    contains(brutalized) = True              neg : pos    =      1.7 : 1.0
       contains(strayed) = True              pos : neg    =      1.4 : 1.0
        contains(norway) = True              pos : neg    =      1.4 : 1.0
        contains(twitch) = True              neg : pos    =      1.4 : 1.0


In [36]:
suffix_fdist = nltk.FreqDist()

for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [39]:
common_suffixes = list(suffix_fdist.keys())[:100]

In [40]:
common_suffixes

['e',
 'he',
 'the',
 'n',
 'on',
 'ton',
 'y',
 'ty',
 'nty',
 'd',
 'nd',
 'and',
 'ry',
 'ury',
 'id',
 'aid',
 'ay',
 'day',
 'an',
 'ion',
 'f',
 'of',
 's',
 "'s",
 "a's",
 't',
 'nt',
 'ent',
 'ary',
 'ed',
 'ced',
 '`',
 '``',
 'o',
 'no',
 'ce',
 'nce',
 "'",
 "''",
 'at',
 'hat',
 'ny',
 'any',
 'es',
 'ies',
 'k',
 'ok',
 'ook',
 'ace',
 '.',
 'r',
 'er',
 'her',
 'in',
 'end',
 'ts',
 'nts',
 'ity',
 've',
 'ive',
 'ee',
 'tee',
 ',',
 'h',
 'ch',
 'ich',
 'ad',
 'had',
 'l',
 'll',
 'all',
 'ge',
 'rge',
 'ves',
 'se',
 'ise',
 'ks',
 'nks',
 'a',
 'ta',
 'nta',
 'or',
 'for',
 'ner',
 'as',
 'was',
 'ted',
 'ber',
 'm',
 'rm',
 'erm',
 'en',
 'een',
 'ged',
 'by',
 'ior',
 'rt',
 'urt',
 'dge',
 'od']

In [41]:
#这次的特征是单词的末尾词
def pos_features(word):
    features = {}
    
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    
    return features

In [42]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n),g) for (n,g) in tagged_words]

size = int(len(featuresets) * 0.1)

train_set , test_set = featuresets[size:] , featuresets[:size]

classifier = nltk.DecisionTreeClassifier.train(train_set)


In [43]:
nltk.classify.accuracy(classifier , test_set)

0.5689706613625062

In [45]:
classifier.classify(pos_features('kiss'))

'NNS'

In [46]:
classifier.pseudocode(depth=4)
#对决策树进行解释显示

"if endswith(the) == False: \n  if endswith(,) == False: \n    if endswith(s) == False: \n      if endswith(.) == False: return '.'\n      if endswith(.) == True: return '.'\n    if endswith(s) == True: \n      if endswith(was) == False: return 'PP$'\n      if endswith(was) == True: return 'BEDZ'\n  if endswith(,) == True: return ','\nif endswith(the) == True: return 'AT'\n"

In [47]:
#=======
#现在考虑上下文的词性预测
#=======

In [49]:
#指定词的位置
#末尾1 2 3个子串
#前一个词
def pos_features(sentence , i):
    features = {'suffix(1)': sentence[i][-1:],
                'suffix(2)': sentence[i][-2:],
                'suffix(3)': sentence[i][-3:]}
    
    if i == 0:
        features['prev-word'] = '<START>'
    else:
        features['prev-word'] = sentence[i-1]
        
    return features


In [50]:
pos_features(brown.sents()[0] , 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [51]:
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [52]:
tagged_sents = brown.tagged_sents(categories='news')

featuresets = []

for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    
    for i , (word , tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent , i) , tag))

In [53]:
size = int(len(featuresets)*0.1)

train_set  , test_set = featuresets[size:] , featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set)


In [54]:
nltk.classify.accuracy(classifier , test_set)

0.7891596220785678

In [55]:
def pos_features(sentence , i , history):
    features = {'suffix(1)': sentence[i][-1:],
                'suffix(2)': sentence[i][-2:],
                'suffix(3)': sentence[i][-3:]}
    
    if i == 0:
        features['prev-word'] = '<START>'
        features['prev-tag'] = '<START>'
    else:
        features['prev-word'] = sentence[i-1]
        features['prev-tag'] = history[i-1]
        
    return features



In [56]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self , train_sents):
        train_set = []
        
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            
            for i , (word ,tag) in enumerate(tagged_sent):
                featuresets = pos_features(untagged_sent , i , history)
                train_set.append((featuresets , tag))
                history.append(tag)
        
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    def tag(self , sentence):
        history = []
        
        for i , word in enumerate(sentence):
            featuresets = pos_features(sentence ,i ,history)
            tag = self.classifier.classify(featuresets)
            history.append(tag)
        
        return zip(sentence , history)

In [57]:
tagged_sents = brown.tagged_sents(categories='news')

In [59]:
size = int(len(tagged_sents)*0.1)

train_set , test_set = tagged_sents[size:] , tagged_sents[:size]

tagger = ConsecutivePosTagger(tagged_sents)

In [61]:
tagger.evaluate(test_set)

0.8271673620769587

In [62]:
sents = nltk.corpus.treebank_raw.sents()

In [64]:
len(sents)

4193

In [65]:
tokens = []
boundaries = set()
offset = 0

for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    
    #用来记录句号位置
    boundaries.add(offset-1)

In [68]:
a=[1,2,3]
a.extend([4 , 5, 6])
a

[1, 2, 3, 4, 5, 6]

In [72]:
def punct_features(tokens , i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
             'prevword':tokens[i-1].lower(),
             'punct':tokens[i],
             'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [73]:
featuresets = [(punct_features(tokens , i) , (i in boundaries))
              for i in range(1 , len(tokens)-1)
              if tokens[i] in '.?!']

In [74]:
size = int(len(featuresets) * 0.1)

train_set , test_set = featuresets[size:] , featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set)


In [75]:
nltk.classify.accuracy(classifier , test_set)

0.936026936026936

In [76]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [78]:
#特征提取
def dialogue_act_features(post):
    features = {}
    
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    
    return features

In [81]:
featuresets = [(dialogue_act_features(post.text),
               post.get('class'))
              for post in posts]

In [84]:
featuresets[1]

({'contains(:)': True, 'contains(p)': True}, 'Emotion')

In [85]:
size = int(len(featuresets)*0.1)

train_set ,test_set = featuresets[size:] , featuresets[:size]

classifier = nltk.NaiveBayesClassifier.train(train_set)



In [86]:
nltk.classify.accuracy(classifier, test_set)

0.668

In [87]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    
    features = {}
    
    features['word_overlop'] = len(etractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    
    return features

In [88]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

extractor = nltk.RTEFeatureExtractor(rtepair)

LookupError: 
**********************************************************************
  Resource 'corpora/rte' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - 'C:\\Users\\qq122/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'I:\\Anaconda3\\nltk_data'
    - 'I:\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\qq122\\AppData\\Roaming\\nltk_data'
**********************************************************************

In [None]:
#nltk可以透明地调用调用别的机器学习包

In [4]:
import random

from nltk.corpus import brown

In [5]:
tagged_sents = list(brown.tagged_sents(categories = 'news'))
random.shuffle(tagged_sents)

size = int(len(tagged_sents) * 0.1)

train_set , test_set = tagged_sents[size:] , tagged_sents[:size]

In [6]:
#混淆矩阵
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word,tag) in sent]

def apply_tagger(tagger , corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [7]:
gold = tag_list(brown.tagged_sents(categories='editorial'))

test = tag_list(apply_tagger(t2 , brown.tagged_sents(categories='editorial')))

cm = nltk.ConfusionMatrix(gold , test)

NameError: name 't2' is not defined

In [8]:
#计算熵
import math

def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)]
    
    return -sum([p*math.log(p,2) for p in probs])

In [9]:
print(entropy(['male','male','male','male']))

-0.0


In [10]:
print(entropy(['female','male','female','male']))

1.0


In [11]:
print(entropy(['female','female','female','male']))

0.8112781244591328
