# 정규표현식

In [9]:
import nltk
import re
import os

In [10]:
# Basic Regular Expression Meta-Characters, Including Wildcards, Ranges and Closures
# .	        Wildcard, matches any character
# ^abc	    Matches some pattern abc at the start of a string
# abc$	    Matches some pattern abc at the end of a string
# [abc]	    Matches one of a set of characters
# [^abc]      Matches anything but a set of characters
# [A-Z0-9]	Matches one of a range of characters
# ed|ing|s	Matches one of the specified strings (disjunction)
# *	        Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)
# +	        One or more of previous item, e.g. a+, [a-z]+
# ?	        Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
# {n}	        Exactly n repeats where n is a non-negative integer
# {n,}	    At least n repeats
# {,n}	    No more than n repeats
# {m,n}	    At least m and no more than n repeats
# a(b|c)+	    Parentheses that indicate the scope of the operators

In [11]:
# Lowercase-only entries of the NLTK English word list.
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
print(wordlist[:10])
# First ten words that end in "ed".
result = [w for w in wordlist if re.search(r'ed$', w)][:10]
print(result)
# Eight-character words with "j" in third and "t" in sixth position.
result = [w for w in wordlist if re.search(r'^..j..t..$', w)][:10]
print(result)
# Four-character words taking one letter from each bracketed set, in order.
result = [w for w in wordlist if re.search(r'^[ghi][mno][jlk][def]$', w)][:10]
print(result)
# Words spelled only with the letters "a" and "h".
result = [w for w in wordlist if re.search(r'^[ah]+$', w)][:10]
print(result)

# Unique tokens of the NLTK treebank corpus, sorted.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Patterns are raw strings so that "\." and "\$" reach the regex engine as
# written instead of being flagged as invalid string escapes.
# Decimal numbers: digits, a literal dot, digits.
wordlist = [w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]
print(wordlist[:10])
# Uppercase letters followed by a literal dollar sign.
wordlist = [w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
print(wordlist[:10])
# Exactly four digits.
wordlist = [w for w in wsj if re.search(r'^[0-9]{4}$', w)]
print(wordlist[:10])
# Digits, a hyphen, then three to five lowercase letters.
wordlist = [w for w in wsj if re.search(r'^[0-9]+-[a-z]{3,5}$', w)]
print(wordlist[:10])
# Three hyphen-separated lowercase parts: 5+ letters, 2-3 letters, 0-6 letters.
wordlist = [w for w in wsj if re.search(r'^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
print(wordlist[:10])
# Tokens ending in "ed" or "ing".
wordlist = [w for w in wsj if re.search(r'(ed|ing)$', w)]
print(wordlist[:10])

['a', 'aa', 'aal', 'aalii', 'aam', 'aardvark', 'aardwolf', 'aba', 'abac', 'abaca']
['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded']
['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector']
['gold', 'golf', 'hold', 'hole']
['a', 'aa', 'ah', 'aha', 'h', 'ha', 'hah']
['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']
['C$', 'US$']
['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', '1934']
['10-day', '10-lap', '10-year', '100-share', '12-point', '12-year', '14-hour', '15-day', '150-point', '190-point']
['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting', 'savings-and-loan']
['62%-owned', 'Absorbed', 'According', 'Adopting', 'Advanced', 'Advancing', 'Alfred', 'Allied', 'Annualized', 'Anything']


In [46]:
# Rebuild the vocabulary: de-duplicate the treebank tokens, then sort in place.
wsj = list(set(nltk.corpus.treebank.words()))
wsj.sort()
wsj

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "'30s",
 "'40s",
 "'50s",
 "'80s",
 "'82",
 "'86",
 "'S",
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 '*',
 '*-1',
 '*-10',
 '*-100',
 '*-101',
 '*-102',
 '*-103',
 '*-104',
 '*-105',
 '*-106',
 '*-107',
 '*-108',
 '*-109',
 '*-11',
 '*-110',
 '*-111',
 '*-112',
 '*-113',
 '*-114',
 '*-115',
 '*-116',
 '*-117',
 '*-118',
 '*-119',
 '*-12',
 '*-120',
 '*-121',
 '*-122',
 '*-123',
 '*-124',
 '*-125',
 '*-126',
 '*-127',
 '*-128',
 '*-129',
 '*-13',
 '*-130',
 '*-131',
 '*-132',
 '*-133',
 '*-134',
 '*-135',
 '*-136',
 '*-137',
 '*-138',
 '*-139',
 '*-14',
 '*-140',
 '*-141',
 '*-142',
 '*-144',
 '*-145',
 '*-146',
 '*-147',
 '*-149',
 '*-15',
 '*-150',
 '*-151',
 '*-152',
 '*-153',
 '*-154',
 '*-155',
 '*-156',
 '*-157',
 '*-158',
 '*-159',
 '*-16',
 '*-160',
 '*-161',
 '*-162',
 '*-163',
 '*-164',
 '*-165',
 '*-166',
 '*-17',
 '*-18',
 '*-19',
 '*-2',
 '*-20',
 '*-21',
 '*-22',
 '*-23',
 '*-24',
 '*-25',
 '*-26',
 '*-27',
 '*-28',
 '*-29',
 

In [12]:
# Read the whole UTF-8 text file from the current working directory.
# The `with` block closes the file handle even if reading raises.
with open(os.getcwd()+r'/06_01.txt', encoding='utf8') as book_file:
    raw = book_file.read()

In [13]:
# Split on single space characters only, so line breaks stay inside tokens.
rawlist = raw.split(sep=' ')
print(rawlist[0:10])

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment,', 'by', 'Fyodor']


In [14]:
# re.search scans the token from its start and reports only the first match,
# so this keeps every token that contains "the" anywhere.
result = [token for token in rawlist if re.search('the', token) is not None]
print(result[0:10])

['the', 'the', 'the', 'the', 'the', 'they', 'their', 'father', 'mother', 'their']


In [15]:
# Keep the tokens whose text ends in "ly".
result = list(filter(lambda token: re.search('ly$', token), rawlist))
print(result[0:10])

['deeply', 'only', 'generally', 'sickly', 'himself\ninstantly', 'Suddenly', 'constantly', 'family', 'probably', 'widely']


In [16]:
# Keep the tokens that begin with a lowercase "a".
result = list(filter(re.compile('^a').search, rawlist))
print(result[0:10])

['and', 'anyone', 'anywhere', 'at', 'and', 'away', 'at', 'and', 'and', 'about']


In [17]:
# Keep the tokens where "tion" is preceded by at least one character.
result = [token for token in rawlist if re.search('.+tion', token) is not None]
print(result[0:10])

['restrictions', 'examination', 'acclamations.', 'conviction', 'revolutionist,', 'conversations', 'intention', 'execution.', 'resignation', 'devotion']


In [25]:
# First character is anything except a/b/c, second character is "n" or "e",
# and the token ends there.
result = list(filter(re.compile('^[^abc][ne]$').search, rawlist))
print(result[0:10])

['in', 'in', 'he', 'in', 'In', 'He', 'in', 'he', 'on', 'we']


In [31]:
# Keep the tokens ending in a lowercase vowel; the leading `[aeiou]*` may match
# nothing, so it does not change which tokens are selected.
result = list(filter(lambda token: re.search('[aeiou]*[aeiou]$', token), rawlist))
print(result[0:10])

['The', 'Crime', 'the', 'use', 'anyone', 'anywhere', 'no', 'no', 'You', 'give']


In [63]:
# sorted() returns a new sorted list; unlike list.sort() it leaves the
# original (rawlist) unchanged.
raw2 = sorted({*rawlist})
raw2

['',
 '#2554]\nLast',
 '$5,000)',
 '(801)',
 '(Amalia',
 '(Aniska',
 '(Cough!)',
 '(Cough,',
 '(Cough-cough-cough.)',
 '(Dounia',
 '(During',
 '(Everything',
 '(For',
 '(Ha-ha-ha!',
 '(Hang',
 '(Hard',
 '(He',
 '(I',
 '(I\nwas',
 '(It',
 '(I’ve',
 '(Katerina\nIvanovna',
 '(Let’s',
 '(Marmeladov',
 '(Oh,',
 '(Porfiry',
 '(Praskovya',
 '(Raskolnikov',
 '(She',
 '(Sonia',
 '(The',
 '(The\nlady',
 '(Yes,',
 '(You',
 '(a',
 '(a)',
 '(ach,',
 '(all',
 '(all\nher',
 '(and',
 '(any',
 '(as',
 '(at',
 '(available',
 '(b)',
 '(but',
 '(c)',
 '(do',
 '(does',
 '(everybody\nknows',
 '(for',
 '(for\nhe',
 '(has',
 '(he',
 '(he\ntapped',
 '(he\nwas',
 '(help',
 '(his',
 '(i.e.',
 '(if',
 '(in',
 '(it',
 '(it’s',
 '(more',
 '(not\ncounting',
 '(note',
 '(nothing',
 '(now',
 '(on',
 '(or',
 '(or...',
 '(ordinary),',
 '(rather',
 '(shattered',
 '(she',
 '(since',
 '(so',
 '(someone\nlaughed',
 '(sometimes,',
 '(take',
 '(that',
 '(that’s',
 '(the',
 '(they',
 '(though',
 '(though\nshe',
 '(trying',
 '(

In [64]:
# A run of letters A-G followed by a run of letters H-Z, and nothing else.
raw2list = list(filter(re.compile('^[A-G]+[H-Z]+$').search, raw2))
print(raw2list[0:10])

['ANY', 'BUT', 'EBOOK', 'EIN', 'FOR', 'FULL']


In [65]:
# Two all-uppercase words joined by a line break inside a single token.
raw2list = [token for token in raw2 if re.search('^[A-Z]+\n[A-Z]+$', token) is not None]
print(raw2list[0:10])

['BE\nLIABLE', 'LICENSE\nPLEASE', 'OR\nINCIDENTAL', 'OTHER\nWARRANTIES', 'THE\nTRADEMARK', 'THOSE\nPROVIDED', 'TO\nWARRANTIES']


In [67]:
# Digits, a literal dot, then uppercase letters (e.g. section labels).
# Raw string: "\." is passed to the regex engine as written rather than being
# treated as an (invalid) string escape.
raw2list = [w for w in raw2 if re.search(r'^[0-9]+\.[A-Z]+$', w)]
print(raw2list[:10])

['1.C', '1.E']


In [72]:
# `*` also accepts zero digits, so the pattern matches the empty prefix of
# every token and nothing is filtered out.
raw2list = [token for token in raw2 if re.search('^[0-9]*', token) is not None]
print(raw2list[0:10])

['', '#2554]\nLast', '$5,000)', '(801)', '(Amalia', '(Aniska', '(Cough!)', '(Cough,', '(Cough-cough-cough.)', '(Dounia']


In [73]:
# Tokens starting with four digits; no end anchor, so more text may follow.
raw2list = list(filter(re.compile('^[0-9]{4}').search, raw2))
print(raw2list[0:10])

['1500', '1849', '1859', '1861', '1864', '1880', '2001,', '2006', '2016\n\nLanguage:', '2554-0.txt']


In [79]:
# At least two uppercase letters, a line break, then 8 to 12 uppercase letters.
raw2list = list(filter(lambda token: re.search('^[A-Z]{2,}\n[A-Z]{8,12}$', token), raw2))
print(raw2list[0:10])

['OR\nINCIDENTAL', 'OTHER\nWARRANTIES', 'THE\nTRADEMARK', 'THOSE\nPROVIDED', 'TO\nWARRANTIES']


In [81]:
# One or more characters that are not uppercase A-Z, followed by one or more
# lowercase letters running to the end of the token.
raw2list = [token for token in raw2 if re.search('^[^A-Z]+[a-z]+$', token) is not None]
print(raw2list[0:10])

['(a', '(all', '(all\nher', '(and', '(any', '(as', '(at', '(available', '(but', '(do']


In [87]:
# Tokens containing "ed" or "tion" anywhere.
raw2list = [w for w in raw2 if re.search('(ed|tion)', w)]
print(raw2list[:10])
# Tokens can span line breaks; keep only the text after the last newline.
# NOTE: the last segment is kept even when the match sat in an earlier one.
i2 = [token.split('\n')[-1] for token in raw2list]
print(i2[:10])

['(he\ntapped', '(shattered', '(someone\nlaughed', '***\n\n\n\n\nProduced', 'Additional', 'Alexandrovna\ncried,', 'Alexandrovna\nintervened', 'Alexandrovna\nmuttered,', 'Alexandrovna’s\nquestions,', 'Archive\nFoundation\n\nThe']
['tapped', '(shattered', 'laughed', 'Produced', 'Additional', 'cried,', 'intervened', 'muttered,', 'questions,', 'The']


In [88]:
word = 'supercalifragilisticexpialidocious'
# re.findall returns every match as a list; when the pattern contains a group,
# the list holds the group's text instead of the whole match.
for vowel_pattern in (
    r'[aeiou]',             # every single vowel
    '[aeiou](..)[aeiou]',   # the two characters between a pair of vowels
    '[^aeiou].+[^aeiou]',   # ^ inside [] means "not"; greedy .+ spans the word
    '[^aeiou].+?[^aeiou]',  # lazy .+? stops at the nearest closing non-vowel
):
    result = re.findall(vowel_pattern, word)
    print(result)

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
['rc', 'fr', 'st', 'xp', 'ci']
['supercalifragilisticexpialidocious']
['sup', 'rcal', 'frag', 'lis', 'tic', 'xpial', 'doc']


In [98]:
# Collect the two characters captured between each matched pair of vowels
# across the whole text.
result = [m.group(1) for m in re.finditer('[aeiou](..)[aeiou]', raw)]
print(result[0:50])

['nb', 'k ', 'st', 's ', 'k ', 'f ', ' c', 'lm', ' r', 'ct', 'ts', ' m', 't ', 'y ', 't ', ' t', 'nb', 'ns', 's ', 'k ', 'nl', 'nb', 'tl', 'th', 'st', 'le', ' D', 'ng', 'ct', 't ', 'ck', 'st', 'nc', 'rn', 'bo', 'st', 'ms', 'ad', 'nd', 'st', ' s', 'f ', 'ct', ' v', 'rk', 'gi', 'pl', ' p', 'n ', ' r']


In [103]:
# Shortest stretches that start and end with a vowel and have at least one
# character in between.
result = [m.group() for m in re.finditer('[aeiou].+?[aeiou]', raw)]
print(result[0:10])

['e Pro', 'ect Gu', 'enbe', 'ook o', 'ime', 'and Pu', 'ishme', 'odo', 'osto', 'is e']


In [105]:
# A vowel run, then lazily at least one character, then any vowels that
# immediately follow.
result = re.compile('[aeiou]+.+?[aeiou]*').findall(raw)
print(result[0:10])

['e ', 'oje', 'ute', 'er', 'ook', 'of', 'ime', 'an', 'uni', 'en']


In [106]:
# Unique treebank tokens, sorted.
wsj = list(set(nltk.corpus.treebank.words()))
wsj.sort()
# Count every run of two or more consecutive lowercase vowels in the vocabulary.
fdist = nltk.FreqDist(
    run
    for token in wsj
    for run in re.findall(r'[aeiou]{2,}', token)
)
fdist.most_common(10)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106)]

In [108]:
# Same vowel-run count, this time over the unique tokens in raw2.
fdist = nltk.FreqDist(
    run
    for token in raw2
    for run in re.findall(r'[aeiou]{2,}', token)
)
fdist.most_common(10)

[('ou', 2654),
 ('ea', 1782),
 ('io', 1113),
 ('ai', 1060),
 ('oo', 1000),
 ('ee', 888),
 ('ie', 736),
 ('ia', 644),
 ('au', 274),
 ('ue', 242)]

In [13]:
# Flat list of every run of two or more lowercase vowels, in vocabulary order.
[run for token in wsj for run in re.findall(r'[aeiou]{2,}', token)]

['ea',
 'oi',
 'ea',
 'ou',
 'oi',
 'ea',
 'ea',
 'oi',
 'oi',
 'ea',
 'io',
 'ea',
 'ea',
 'ea',
 'oi',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ee',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'ea',
 'oi',
 'ea',
 'ea',
 'ou',
 'ou',
 'ou',
 'ie',
 'ui',
 'io',
 'ua',
 'io',
 'ai',
 'ai',
 'ai',
 'io',
 'ie',
 'ue',
 'ue',
 'ia',
 'ie',
 'ea',
 'ai',
 'ou',
 'ia',
 'ei',
 'ie',
 'ea',
 'ea',
 'ie',
 'ia',
 'ia',
 'ua',
 'ie',
 'io',
 'ea',
 'ia',
 'io',
 'ui',
 'ia',
 'ia',
 'ea',
 'iai',
 'ai',
 'ia',
 'ia',
 'ia',
 'ia',
 'ia',
 'io',
 'oo',
 'io',
 'ia',
 'ia',
 'ia',
 'ia',
 'ue',
 'ea',
 'ai',
 'ai',
 'ue',
 'ie',
 'au',
 'ea',
 'ea',
 'ea',
 'ea',
 'eau',
 'au',
 'ei',
 'ei',
 'ei',
 'ei',
 'ei',
 'ia',
 'ie',
 'io',
 'ue',
 'oa',
 'oei',
 'oe',
 'ia',
 'oo',
 'oo',
 'oo',
 'eau',
 'ou',
 'ou',
 'ai',
 'ou',
 'ai',
 'oo',
 'ea',
 'au',
 'ia',
 'ea',
 'ea',
 'ee',
 'ia',
 'ai',
 'oa',
 'oo',
 'oo',
 'oo',
 'ei',
 'ei',
 'ea',
 'ui',
 'ui',
 'eau',
 'ie',
 'ia',
 

In [110]:
# Count the three-vowel sequences found in the unique tokens of raw2.
fdist = nltk.FreqDist(
    triple
    for token in raw2
    for triple in re.findall(r'[aeiou]{3}', token)
)
fdist.most_common(10)

[('iou', 219),
 ('eou', 31),
 ('uou', 24),
 ('uee', 21),
 ('eau', 20),
 ('uie', 17),
 ('uai', 14),
 ('eei', 13),
 ('ieu', 9),
 ('uea', 6)]

In [3]:
# Alternatives, tried in order: a vowel run at the start, a vowel run at the
# end, or any single character that is not an ASCII vowel.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    """Return *word* with its word-internal vowels removed.

    Leading and trailing vowel runs and every non-vowel character are kept;
    vowels in the middle of the word match no alternative and are dropped.
    """
    return ''.join(kept.group() for kept in re.finditer(regexp, word))

# Compress the first 75 words of the English UDHR text and print them wrapped.
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(map(compress, english_udhr[:75])))


    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [27]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
# Every two-character consonant+vowel sequence found in the Rotokas words.
cvs = [
    pair
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
# ConditionalFreqDist only works on pairs; each two-character string is
# consumed as a (consonant, vowel) pair.
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [None]:
# Pair each consonant+vowel sequence with the word it occurs in, then index
# the words by that sequence.
cv_word_pairs = [
    (pair, entry)
    for entry in rotokas_words
    for pair in re.findall(r'[ptksvr][aeiou]', entry)
]
cv_index = nltk.Index(cv_word_pairs)
cv_index['po']