# 11. Managing Linguistic Data

## 11.1 Corpus Structure: a Case Study

In [1]:
import nltk
import re

In [2]:
phonetic = nltk.corpus.timit.phones('dr1-fvmh0/sa1')
phonetic[:20]

['h#',
 'sh',
 'iy',
 'hv',
 'ae',
 'dcl',
 'y',
 'ix',
 'dcl',
 'd',
 'aa',
 'kcl',
 's',
 'ux',
 'tcl',
 'en',
 'gcl',
 'g',
 'r',
 'iy']

In [3]:
nltk.corpus.timit.word_times('dr1-fvmh0/sa1')

[('she', 7812, 10610),
 ('had', 10610, 14496),
 ('your', 14496, 15791),
 ('dark', 15791, 20720),
 ('suit', 20720, 25647),
 ('in', 25647, 26906),
 ('greasy', 26906, 32668),
 ('wash', 32668, 37890),
 ('water', 38531, 42417),
 ('all', 43091, 46052),
 ('year', 46052, 50522)]

In [4]:
timitdict = nltk.corpus.timit.transcription_dict()
timitdict['greasy'] + timitdict['wash'] + timitdict['water']

['g', 'r', 'iy1', 's', 'iy', 'w', 'ao1', 'sh', 'w', 'ao1', 't', 'axr']

In [5]:
phonetic[17:30]

['g', 'r', 'iy', 's', 'iy', 'w', 'aa', 'sh', 'epi', 'w', 'aa', 'dx', 'ax']

In [6]:
nltk.corpus.timit.spkrinfo('dr1-fvmh0')

SpeakerInfo(id='VMH0', sex='F', dr='1', use='TRN', recdate='03/11/86', birthdate='01/08/60', ht='5\'05"', race='WHT', edu='BS', comments='BEST NEW ENGLAND ACCENT SO FAR')

## 11.2 The Life-Cycle of a Corpus

In [8]:
s1 = "00000010000000001000000"
s2 = "00000001000000010000000"
s3 = "00010000000000000001000"
nltk.windowdiff(s1, s1, 3)

0.0

In [9]:
nltk.windowdiff(s1, s2, 3)

0.19047619047619047

In [10]:
nltk.windowdiff(s2, s3, 3)

0.5714285714285714

## 11.3 Acquiring Data

In [12]:
from io import open
# Part-of-speech labels the dictionary is allowed to use.
legal_pos = {'n', 'v.t.', 'v.i.', 'adj', 'det'}
# A POS label is the lowercase/dotted text directly inside the 11pt span.
# The dot in the font size is escaped so it only matches a literal '.'.
pattern = re.compile(r"'font-size:11\.0pt'>([a-z.]+)<")
# Read the file inside a context manager so the handle is closed promptly.
with open("dict.htm", encoding="utf-8") as html_file:
    document = html_file.read()
used_pos = set(pattern.findall(document))
# Anything used in the document but not whitelisted is reported.
illegal_pos = used_pos.difference(legal_pos)
print(list(illegal_pos))

[]


In [13]:
document

"<p class=MsoNormal>sleep\n  <span style='mso-spacerun:yes'> </span>\n  [<span class=SpellE>sli:p</span>]\n  <span style='mso-spacerun:yes'> </span>\n  <b><span style='font-size:11.0pt'>v.i.</span></b>\n  <span style='mso-spacerun:yes'> </span>\n  <i>a condition of body and mind ...<o:p></o:p></i>\n</p>\n"

In [14]:
used_pos

{'v.i.'}

In [15]:
from bs4 import BeautifulSoup

def lexical_data(html_file, encoding="utf-8"):
    """Yield dictionary entries extracted from an HTML file.

    A sentinel is inserted before every ``<p`` so that paragraph boundaries
    survive tag stripping; the markup is then reduced to plain text, runs of
    whitespace are collapsed, and the text is split on the sentinel.

    Args:
        html_file: path of the HTML file to read.
        encoding: text encoding used to decode the file.

    Yields:
        A list of four strings per entry: the first three space-separated
        tokens followed by the remainder of the entry. Entries with fewer
        than four tokens are skipped.
    """
    SEP = '_ENTRY'
    # Close the file deterministically instead of relying on garbage collection.
    with open(html_file, encoding=encoding) as source:
        html = source.read()
    html = re.sub(r'<p', SEP + '<p', html)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    text = BeautifulSoup(html, 'html.parser').get_text()
    text = ' '.join(text.split())
    for entry in text.split(SEP):
        # Whitespace between paragraphs would otherwise leave a trailing space
        # on the last field and let three-token entries through with an empty
        # fourth field.
        entry = entry.strip()
        if entry.count(' ') > 2:
            yield entry.split(' ', 3)

In [16]:
import csv

# The csv module expects files opened with newline='' so that line breaks
# embedded in quoted fields are handled by the reader; the context manager
# closes the file once all rows have been consumed.
with open('dict.csv', newline='') as csv_file:
    lexicon = csv.reader(csv_file)
    # Rows have four columns; only the first (lexeme) and last (definition)
    # are kept. The middle two are presumably pronunciation and part of
    # speech -- TODO confirm against dict.csv.
    pairs = [(lexeme, defn) for (lexeme, _, _, defn) in lexicon]
lexemes, defns = zip(*pairs)
defn_words = {w for defn in defns for w in defn.split()}
# Words used in definitions that have no entry of their own.
sorted(defn_words.difference(lexemes))

['...',
 'a',
 'and',
 'body',
 'by',
 'cease',
 'condition',
 'down',
 'each',
 'foot',
 'lifting',
 'mind',
 'of',
 'progress',
 'setting',
 'to']

In [17]:
idx = nltk.Index((defn_word, lexeme)
                for (lexeme, defn) in pairs
                for defn_word in nltk.word_tokenize(defn)
                if len(defn_word) > 3)

In [18]:
# Write one "word: lexeme, lexeme, ..." line per index key, in sorted key order.
with open("dict.idx", "w") as idx_file:
    for word in sorted(idx):
        print("{}: {}".format(word, ', '.join(idx[word])), file=idx_file)

In [19]:
mappings = [('ph', 'f'), ('ght', 't'), ('^kn', 'n'), ('qu', 'kw'),
            ('[aeiou]+', 'a'), (r'(.)\1', r'\1')]

In [20]:
def signature(word):
    """Return a spelling-insensitive signature for ``word``.

    Every (pattern, replacement) pair in ``mappings`` is applied in order,
    then each run of non-vowel characters is sorted alphabetically, the runs
    are concatenated, and the result is cut to at most eight characters.
    """
    normalised = word
    for pattern, replacement in mappings:
        normalised = re.sub(pattern, replacement, normalised)
    letters = []
    for cluster in re.findall('[^aeiou]+', normalised):
        # Sorting inside a cluster makes transposed consonants compare equal.
        letters.extend(sorted(cluster))
    return ''.join(letters[:8])

In [21]:
signature('illefent')

'lfnt'

In [22]:
signature('ebsekwieous')

'bskws'

In [23]:
signature('nuculerr')

'nclr'

In [24]:
signatures = nltk.Index((signature(w), w) for w in nltk.corpus.words.words())
signatures[signature('nuculerr')]

['anicular',
 'inocular',
 'nucellar',
 'nuclear',
 'unicolor',
 'uniocular',
 'unocular']

In [25]:
def rank(word, wordlist):
    """Return the words of ``wordlist`` ordered by edit distance to ``word``.

    Words at the same distance are ordered alphabetically, because the
    (distance, word) pairs are sorted as tuples.
    """
    scored = [(nltk.edit_distance(word, candidate), candidate)
              for candidate in wordlist]
    scored.sort()
    return [candidate for _, candidate in scored]

def fuzzy_spell(word):
    """Suggest known words whose signature matches that of ``word``.

    Returns the matching words ordered by ``rank``, or an empty list when the
    signature is not a key of ``signatures``.
    """
    key = signature(word)
    # Membership is tested first so that a miss never indexes ``signatures``.
    if key not in signatures:
        return []
    return rank(word, signatures[key])

In [26]:
fuzzy_spell('illefent')

['olefiant', 'elephant', 'oliphant', 'elephanta']

In [27]:
fuzzy_spell('ebsekwieous')

['obsequious']

In [28]:
fuzzy_spell('nucular')

['anicular',
 'inocular',
 'nucellar',
 'nuclear',
 'unocular',
 'uniocular',
 'unicolor']

## 11.4 Working with XML

In [30]:
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
# Read through a context manager so the file handle is closed right away.
with open(merchant_file) as xml_file:
    raw = xml_file.read()
print(raw[:163])

<?xml version="1.0"?>
<?xml-stylesheet type="text/css" href="shakes.css"?>
<!-- <!DOCTYPE PLAY SYSTEM "play.dtd"> -->

<PLAY>
<TITLE>The Merchant of Venice</TITLE>


In [31]:
print(raw[1789:2006])

<TITLE>ACT I</TITLE>

<SCENE><TITLE>SCENE I.  Venice. A street.</TITLE>
<STAGEDIR>Enter ANTONIO, SALARINO, and SALANIO</STAGEDIR>

<SPEECH>
<SPEAKER>ANTONIO</SPEAKER>
<LINE>In sooth, I know not why I am so sad:</LINE>


In [32]:
from xml.etree.ElementTree import ElementTree
merchant = ElementTree().parse(merchant_file) 
merchant

<Element 'PLAY' at 0x7f1f3ec7b908>

In [33]:
merchant[0]

<Element 'TITLE' at 0x7f1f3ec7b9f8>

In [34]:
merchant[0].text

'The Merchant of Venice'

In [35]:
# Element.getchildren() was removed in Python 3.9; iterating the element
# yields the same direct children.
list(merchant)

[<Element 'TITLE' at 0x7f1f3ec7b9f8>,
 <Element 'PERSONAE' at 0x7f1f3ec7bc28>,
 <Element 'SCNDESCR' at 0x7f1f3ec81728>,
 <Element 'PLAYSUBT' at 0x7f1f3ec81778>,
 <Element 'ACT' at 0x7f1f3ec817c8>,
 <Element 'ACT' at 0x7f1f3ec1e408>,
 <Element 'ACT' at 0x7f1f3ec4ec28>,
 <Element 'ACT' at 0x7f1f3ebfc228>,
 <Element 'ACT' at 0x7f1f3eb9fc28>]

In [36]:
merchant[-2][0].text

'ACT IV'

In [37]:
merchant[-2][1]

<Element 'SCENE' at 0x7f1f3ebfc2c8>

In [38]:
merchant[-2][1][0].text

'SCENE I.  Venice. A court of justice.'

In [39]:
merchant[-2][1][54]

<Element 'SPEECH' at 0x7f1f3ec0a3b8>

In [40]:
merchant[-2][1][54][0]

<Element 'SPEAKER' at 0x7f1f3ec0a408>

In [41]:
merchant[-2][1][54][0].text

'PORTIA'

In [42]:
merchant[-2][1][54][1]

<Element 'LINE' at 0x7f1f3ec0a458>

In [43]:
merchant[-2][1][54][1].text

"The quality of mercy is not strain'd,"

In [44]:
# Report every LINE containing "music", labelled with 1-based act, scene and
# speech positions.
for act_no, act in enumerate(merchant.findall('ACT'), start=1):
    for scene_no, scene in enumerate(act.findall('SCENE'), start=1):
        for speech_no, speech in enumerate(scene.findall('SPEECH'), start=1):
            for line in speech.findall('LINE'):
                # str() keeps the test safe for LINE elements whose text is None.
                if 'music' not in str(line.text):
                    continue
                print("Act %d Scene %d Speech %d: %s" % (act_no, scene_no, speech_no, line.text))

Act 3 Scene 2 Speech 9: Let music sound while he doth make his choice;
Act 3 Scene 2 Speech 9: Fading in music: that the comparison
Act 3 Scene 2 Speech 9: And what is music then? Then music is
Act 5 Scene 1 Speech 23: And bring your music forth into the air.
Act 5 Scene 1 Speech 23: Here will we sit and let the sounds of music
Act 5 Scene 1 Speech 23: And draw her home with music.
Act 5 Scene 1 Speech 24: I am never merry when I hear sweet music.
Act 5 Scene 1 Speech 25: Or any air of music touch their ears,
Act 5 Scene 1 Speech 25: By the sweet power of music: therefore the poet
Act 5 Scene 1 Speech 25: But music for the time doth change his nature.
Act 5 Scene 1 Speech 25: The man that hath no music in himself,
Act 5 Scene 1 Speech 25: Let no such man be trusted. Mark the music.
Act 5 Scene 1 Speech 29: It is your music, madam, of the house.
Act 5 Scene 1 Speech 32: No better a musician than the wren.


In [45]:
from collections import Counter
speaker_seq = [s.text for s in merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = Counter(speaker_seq)
top5 = speaker_freq.most_common(5)
top5

[('PORTIA', 117),
 ('SHYLOCK', 79),
 ('BASSANIO', 73),
 ('GRATIANO', 48),
 ('ANTONIO', 47)]

In [46]:
from collections import defaultdict
abbreviate = defaultdict(lambda: 'OTH')
for speaker, _ in top5:
    abbreviate[speaker] = speaker[:4]

In [47]:
speaker_seq2 = [abbreviate[speaker] for speaker in speaker_seq]
cfd = nltk.ConditionalFreqDist(nltk.bigrams(speaker_seq2))
cfd.tabulate()

     ANTO BASS GRAT  OTH PORT SHYL 
ANTO    0   11    4   11    9   12 
BASS   10    0   11   10   26   16 
GRAT    6    8    0   19    9    5 
 OTH    8   16   18  153   52   25 
PORT    7   23   13   53    0   21 
SHYL   15   15    2   26   21    0 


In [48]:
from nltk.corpus import toolbox
lexicon = toolbox.xml('rotokas.dic')

In [49]:
lexicon[3][0]

<Element 'lx' at 0x7f1f3ebb9e08>

In [50]:
lexicon[3][0].tag

'lx'

In [51]:
lexicon[3][0].text

'kaa'

In [52]:
[lexeme.text.lower() for lexeme in lexicon.findall('record/lx')][:20]

['kaa',
 'kaa',
 'kaa',
 'kaakaaro',
 'kaakaaviko',
 'kaakaavo',
 'kaakaoko',
 'kaakasi',
 'kaakau',
 'kaakauko',
 'kaakito',
 'kaakuupato',
 'kaaova',
 'kaapa',
 'kaapea',
 'kaapie',
 'kaapie',
 'kaapiepato',
 'kaapisi',
 'kaapisivira']

In [53]:
import sys
from nltk.util import elementtree_indent
from xml.etree.ElementTree import ElementTree
elementtree_indent(lexicon)
tree = ElementTree(lexicon[3])
tree.write(sys.stdout, encoding='unicode')

<record>
    <lx>kaa</lx>
    <ps>N</ps>
    <pt>MASC</pt>
    <cl>isi</cl>
    <ge>cooking banana</ge>
    <tkp>banana bilong kukim</tkp>
    <pt>itoo</pt>
    <sf>FLORA</sf>
    <dt>12/Aug/2005</dt>
    <ex>Taeavi iria kaa isi kovopaueva kaparapasia.</ex>
    <xp>Taeavi i bin planim gaden banana bilong kukim tasol long paia.</xp>
    <xe>Taeavi planted banana in order to cook it.</xe>
  </record>

In [54]:
from html import escape

# Render entries 70-79 as an HTML table with lexeme, part of speech and gloss.
rows = []
for entry in lexicon[70:80]:
    # str() keeps the original rendering of a missing field ("None");
    # escape() stops '&', '<' and '>' in the data from corrupting the markup.
    lx, ps, ge = (escape(str(entry.findtext(tag)), quote=False)
                  for tag in ('lx', 'ps', 'ge'))
    rows.append("  <tr><td>%s</td><td>%s</td><td>%s</td></tr>\n" % (lx, ps, ge))
# Join once instead of growing the string with += inside the loop.
html = "<table>\n" + ''.join(rows) + "</table>"
print(html)

<table>
  <tr><td>kakae</td><td>???</td><td>small</td></tr>
  <tr><td>kakae</td><td>CLASS</td><td>child</td></tr>
  <tr><td>kakaevira</td><td>ADV</td><td>small-like</td></tr>
  <tr><td>kakapikoa</td><td>???</td><td>small</td></tr>
  <tr><td>kakapikoto</td><td>N</td><td>newborn baby</td></tr>
  <tr><td>kakapu</td><td>V</td><td>place in sling for purpose of carrying</td></tr>
  <tr><td>kakapua</td><td>N</td><td>sling for lifting</td></tr>
  <tr><td>kakara</td><td>N</td><td>arm band</td></tr>
  <tr><td>Kakarapaia</td><td>N</td><td>village name</td></tr>
  <tr><td>kakarau</td><td>N</td><td>frog</td></tr>
</table>


## 11.5 Working with Toolbox Data

In [55]:
from nltk.corpus import toolbox
lexicon = toolbox.xml('rotokas.dic')
sum(len(entry) for entry in lexicon) / len(lexicon)

13.635955056179775

In [56]:
from xml.etree.ElementTree import SubElement

def cv(s):
    """Return the consonant/vowel template of ``s``.

    The string is lower-cased; then every character outside a-z becomes '_',
    every vowel (a, e, i, o, u) becomes 'V', and every remaining character
    becomes 'C'.
    """
    template = s.lower()
    # Order matters: the final pattern treats anything that is not already
    # 'V' or '_' as a consonant.
    for pattern, symbol in ((r'[^a-z]', r'_'),
                            (r'[aeiou]', r'V'),
                            (r'[^V_]', r'C')):
        template = re.sub(pattern, symbol, template)
    return template

def add_cv_field(entry):
    """Append a ``cv`` child to ``entry`` for each of its ``lx`` children.

    The text of each new ``cv`` element is ``cv()`` applied to the text of the
    corresponding ``lx`` element. ``lx`` elements without text are skipped.

    Args:
        entry: an ElementTree element; it is modified in place.
    """
    # findall() returns a list, so the children being appended below are not
    # part of the sequence being iterated.
    for field in entry.findall('lx'):
        if field.text is None:
            # An empty <lx/> has nothing to derive a template from.
            continue
        cv_field = SubElement(entry, 'cv')
        cv_field.text = cv(field.text)

In [57]:
lexicon = toolbox.xml('rotokas.dic')
add_cv_field(lexicon[53])
print(nltk.toolbox.to_sfm_string(lexicon[53]))

\lx kaeviro
\ps V
\pt A
\ge lift off
\ge take off
\tkp go antap
\sc MOTION
\vx 1
\nt used to describe action of plane
\dt 03/Jun/2005
\ex Pita kaeviroroe kepa kekesia oa vuripierevo kiuvu.
\xp Pita i go antap na lukim haus win i bagarapim.
\xe Peter went to look at the house that the wind destroyed.
\cv CVVCVCV



In [58]:
from collections import Counter
field_sequences = Counter(':'.join(field.tag for field in entry) for entry in lexicon)
field_sequences.most_common()[:10]

[('lx:ps:pt:ge:tkp:dt:ex:xp:xe', 41),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe', 37),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 27),
 ('lx:ps:pt:ge:tkp:nt:dt:ex:xp:xe', 20),
 ('lx:ps:pt:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe', 17),
 ('lx:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 16),
 ('lx:rt:ps:pt:ge:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 12),
 ('lx:ps:pt:ge:ge:tkp:dt:ex:xp:xe', 9),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe:ex:xp:xe', 9),
 ('lx:ps:pt:ge:tkp:nt:sf:dt:ex:xp:xe', 9)]

In [59]:
grammar = nltk.CFG.fromstring('''
  S -> Head PS Glosses Comment Date Sem_Field Examples
  Head -> Lexeme Root
  Lexeme -> "lx"
  Root -> "rt" |
  PS -> "ps"
  Glosses -> Gloss Glosses |
  Gloss -> "ge" | "tkp" | "eng"
  Date -> "dt"
  Sem_Field -> "sf"
  Examples -> Example Ex_Pidgin Ex_English Examples |
  Example -> "ex"
  Ex_Pidgin -> "xp"
  Ex_English -> "xe"
  Comment -> "cmt" | "nt" |
  ''')

In [60]:
def validate_lexicon(grammar, lexicon, ignored_tags):
    """Print, for each entry, whether its field sequence fits ``grammar``.

    For every entry the tags of its fields are collected (skipping those in
    ``ignored_tags``) and parsed with a recursive descent parser built from
    ``grammar``. A line starting with "+" is printed when at least one parse
    exists, otherwise a line starting with "-"; the tags follow, joined
    by ':'.

    Args:
        grammar: grammar passed to ``nltk.RecursiveDescentParser``.
        lexicon: iterable of entries, each an iterable of fields with ``.tag``.
        ignored_tags: container of tags to leave out of the check.
    """
    rd_parser = nltk.RecursiveDescentParser(grammar)
    for entry in lexicon:
        marker_list = [field.tag for field in entry if field.tag not in ignored_tags]
        try:
            # any() stops at the first parse; the full list is not needed.
            is_valid = any(True for _ in rd_parser.parse(marker_list))
        except ValueError:
            # NLTK parsers reject input containing tokens the grammar does not
            # cover by raising ValueError; such an entry is simply invalid and
            # must not abort validation of the remaining entries.
            is_valid = False
        print("+" if is_valid else "-", ':'.join(marker_list))

In [61]:
lexicon = toolbox.xml('rotokas.dic')[10:20]
ignored_tags = ['arg', 'dcsv', 'pt', 'vx'] 
validate_lexicon(grammar, lexicon, ignored_tags)

- lx:ps:ge:tkp:sf:nt:dt:ex:xp:xe:ex:xp:xe:ex:xp:xe
- lx:rt:ps:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:tkp:nt:sf:dt
- lx:ps:ge:tkp:dt:cmt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:ge:ge:tkp:cmt:dt:ex:xp:xe
- lx:rt:ps:ge:ge:tkp:dt
- lx:rt:ps:ge:eng:eng:eng:ge:tkp:tkp:dt:cmt:ex:xp:xe:ex:xp:xe:ex:xp:xe:ex:xp:xe:ex:xp:xe
- lx:rt:ps:ge:tkp:dt:ex:xp:xe
- lx:ps:ge:ge:tkp:dt:ex:xp:xe:ex:xp:xe


In [62]:
grammar = r"""
      lexfunc: {<lf>(<lv><ln|le>*)*}
      example: {<rf|xv><xn|xe>*}
      sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
      record:   {<lx><hm><sense>+<dt>}
    """


In [98]:
# Parse the Iu Mien sample Toolbox database with the chunk grammar held in
# `grammar` and save the resulting tree as iu_mien_samp.xml.
# NOTE(review): the recorded run of this cell ended in
# "TypeError: cannot use a string pattern on a bytes-like object", and the
# inspection cells that follow show `grammar` was a bytes object in that
# session, whereas the cell defining it assigns a str. The execution counts
# are also out of order, so this looks like stale kernel state -- confirm
# with Restart Kernel -> Run All.
from xml.etree.ElementTree import ElementTree
from nltk.toolbox import ToolboxData
db = ToolboxData()
db.open(nltk.data.find('corpora/toolbox/iu_mien_samp.db'))
lexicon = db.parse(grammar, encoding='utf-8')
tree = ElementTree(lexicon)
# The file is opened in binary mode and tree.write() is called with its
# default arguments.
with open("iu_mien_samp.xml", "wb") as output:
    tree.write(output)

TypeError: cannot use a string pattern on a bytes-like object

In [92]:
type(grammar)

bytes

In [93]:
grammar

b'\n      lexfunc: {<lf>(<lv><ln|le>*)*}\n      example: {<rf|xv><xn|xe>*}\n      sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}\n      record:   {<lx><hm><sense>+<dt>}\n    '

## 11.6 Describing Language Resources using OLAC Metadata