# 基础

In [1]:
import re
import nltk
# Keep only the all-lowercase entries of NLTK's English word list.
wordlist = list(filter(str.islower, nltk.corpus.words.words('en')))
# re.search() returns a match object (truthy) or None (falsy), not a boolean;
# here it selects words that start with "ab" and end with "ed".
searchlist = [w for w in wordlist if re.search('^ab.+ed$', w) is not None]
searchlist[:5]

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised']

. 通配符，匹配任意单个字符（默认不匹配换行符）
^abc 匹配以 abc 开始的字符串
abc$ 匹配以 abc 结尾的字符串
[abc] 匹配字符集合中的一个
[A-Z0-9] 匹配指定范围内的一个字符
ed|ing|s 匹配指定的一个字符串(析取)
* 前面的项目零个或多个，如 a*, [a-z]* (也叫 Kleene 闭包)
+ 前面的项目 1 个或多个，如 a+, [a-z]+
? 前面的项目零个或 1 个(即:可选)如:a?, [a-z]?
{n} 重复 n 次，n 为非负整数
{n,} 至少重复 n 次
{,n} 重复不多于 n 次
{m,n} 至少重复 m 次不多于 n 次
a(b|c)+ 括号表示操作符的范围

# 字符块

In [8]:
word = 'supercalifragilisticexpialidocious'
# re.findall() returns a list of all non-overlapping matches; show the first five vowels.
re.findall(r'[aeiou]', word)[:5]

['u', 'e', 'a', 'i', 'a']

In [9]:
# Count the vowels in `word`: one list entry per single-character match.
len(re.findall(r'[aeiou]', word))

16

In [10]:
# Distinct Treebank tokens, sorted, so each word type is counted once.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Tally every run of two or more consecutive lowercase vowels found in any token.
fd = nltk.FreqDist(
    run
    for token in wsj
    for run in re.findall(r'[aeiou]{2,}', token)
)
fd.items()

dict_items([('ea', 476), ('oi', 65), ('ou', 329), ('io', 549), ('ee', 217), ('ie', 331), ('ui', 95), ('ua', 109), ('ai', 261), ('ue', 105), ('ia', 253), ('ei', 86), ('iai', 1), ('oo', 174), ('au', 106), ('eau', 10), ('oa', 59), ('oei', 1), ('oe', 15), ('eo', 39), ('uu', 1), ('eu', 18), ('iu', 14), ('aii', 1), ('aiia', 1), ('ae', 11), ('aa', 3), ('oui', 6), ('ieu', 3), ('ao', 6), ('iou', 27), ('uee', 4), ('eou', 5), ('aia', 1), ('uie', 3), ('iao', 1), ('eei', 2), ('uo', 8), ('uou', 5), ('eea', 1), ('ueui', 1), ('ioa', 1), ('ooi', 1)])

In [13]:
# Matches a leading vowel run, a trailing vowel run, or any single non-vowel.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Drop word-internal vowels from ``word``.

    Vowel runs at the very start or very end are kept, as is every
    non-vowel character; the matched pieces are joined in order.
    """
    return ''.join(re.findall(regexp, word))
english_udhr = nltk.corpus.udhr.words('English-Latin1')
# Compress the first 75 tokens of the English declaration text.
tokenlist = list(map(compress, english_udhr[:75]))
# tokenwrap() returns the tokens joined into one string, with line breaks at whitespace.
nltk.tokenwrap(tokenlist)

'Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and'

In [14]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
# Every consonant+vowel pair found in any dictionary entry. Each match is a
# two-character string, which ConditionalFreqDist unpacks as (consonant, vowel).
cvs = [
    pair
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
cfd = nltk.ConditionalFreqDist(cvs)
# Rows are consonants, columns are vowels.
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [15]:
# Pair each consonant+vowel sequence with the word it was found in.
cv_word_pairs = [
    (syllable, entry)
    for entry in rotokas_words
    for syllable in re.findall(r'[ptksvr][aeiou]', entry)
]
# nltk.Index turns the (key, value) tuples into a mapping from key to a list of values.
cv_index = nltk.Index(cv_word_pairs)
cv_index['su']

['kasuari']

# 查找词干

In [14]:
def stem(word):
    """Remove the first listed suffix that ``word`` ends with.

    Suffixes are tried in the order given, so earlier entries take
    priority; a word ending in none of them is returned unchanged.
    """
    suffixes = ('ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment')
    found = next((s for s in suffixes if word.endswith(s)), None)
    if found is None:
        return word
    return word[:-len(found)]
# How grouping changes what re.findall() reports (same suffix list as stem() above).
# With one capturing group, findall() returns only the captured suffix, not the word.
print (re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing'))
# (?:...) is non-capturing, so the whole match is returned.
print (re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$','processing'))
# Two capturing groups yield a (stem, suffix) tuple.
print (re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing'))
# A non-greedy stem with an optional suffix also matches words that have no listed suffix.
print (re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$','language'))

['ing']
['processing']
[('process', 'ing')]
[('language', '')]


In [17]:
def stem(word):
    """Strip one common English suffix from ``word`` using a regular expression.

    The non-greedy ``(.*?)`` captures the shortest possible stem, and the
    optional suffix group lets a word without a listed suffix match as a whole.

    Args:
        word: The token to stem.

    Returns:
        The captured stem, or ``word`` unchanged when the pattern does not
        match at all (for example, a token containing an embedded newline).
    """
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # re.match() anchors at the start just like the leading ^, and yields None
    # instead of raising when nothing matches.
    match = re.match(regexp, word)
    if match is None:
        return word
    # Return the captured stem; returning `word` here would discard the result.
    return match.group(1)
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
# Stem every token and preview the first five results.
list(map(stem, tokens))[:5]

['DENNIS', ':', 'Listen', ',', 'strange']

# 搜索已分词文本

在下面的例子中，我们使用 <.*>，它匹配任意单个标识符(token)。第一个例子的第二条语句将它括在圆括号里，于是只输出被括号括住的词(例如:monied)，而不是整个短语(例如:a monied man)。
第二个例子找出以词“bro”结尾的三个词组成的短语。
第三个例子找出由三个或更多以字母“l”开头的词组成的序列。

In [23]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
# Text.findall() prints its matches itself and returns None, so it is called
# bare; wrapping it in print() only adds a stray "None" line to the output.
# <.*> matches any single token: show every three-token phrase "a ... man".
moby.findall(r"<a> <.*> <man>")
# Parenthesising <.*> limits the printed text to the captured middle token.
moby.findall(r"<a> (<.*>) <man>")

a monied man; a nervous man; a dangerous man; a white man; a white
man; a white man; a pious man; a queer man; a good man; a mature man;
a white man; a Cape man; a great man; a wise man; a wise man; a
butterless man; a white man; a fiendish man; a pale man; a furious
man; a better man; a certain man; a complete man; a dismasted man; a
younger man; a brave man; a brave man; a brave man; a brave man
None
monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave
None


In [25]:
chat = nltk.Text(nps_chat.words())
# Three-token phrases whose last token is "bro".
chat.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [26]:
# Runs of three or more consecutive tokens that each start with "l".
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [27]:
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
# "X and other Ys" pattern: a word, "and", "other", then a word ending in "s".
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals
