In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview-/-requirements" data-toc-modified-id="Overview-/-requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview / requirements</a></span></li><li><span><a href="#Import-packages-and-data" data-toc-modified-id="Import-packages-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import packages and data</a></span></li><li><span><a href="#Vocabulary-normalization" data-toc-modified-id="Vocabulary-normalization-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Vocabulary normalization</a></span><ul class="toc-item"><li><span><a href="#Buckeye-unique-wordforms" data-toc-modified-id="Buckeye-unique-wordforms-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Buckeye-unique wordforms</a></span><ul class="toc-item"><li><span><a href="#Loading-the-Buckeye-orthographic-vocabulary" data-toc-modified-id="Loading-the-Buckeye-orthographic-vocabulary-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>Loading the Buckeye orthographic vocabulary</a></span></li><li><span><a href="#Loading-the-Fisher-vocabulary" data-toc-modified-id="Loading-the-Fisher-vocabulary-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>Loading the Fisher vocabulary</a></span></li><li><span><a href="#Normalize-case" data-toc-modified-id="Normalize-case-3.1.3"><span class="toc-item-num">3.1.3&nbsp;&nbsp;</span>Normalize case</a></span></li><li><span><a href="#Buckeye---main-Fisher-transcripts" data-toc-modified-id="Buckeye---main-Fisher-transcripts-3.1.4"><span class="toc-item-num">3.1.4&nbsp;&nbsp;</span>Buckeye - main Fisher transcripts</a></span></li><li><span><a href="#Buckeye---BBN-Fisher-transcripts" data-toc-modified-id="Buckeye---BBN-Fisher-transcripts-3.1.5"><span class="toc-item-num">3.1.5&nbsp;&nbsp;</span>Buckeye - BBN Fisher transcripts</a></span></li></ul></li><li><span><a href="#Interrupted-wordforms" data-toc-modified-id="Interrupted-wordforms-3.2"><span 
class="toc-item-num">3.2&nbsp;&nbsp;</span>Interrupted wordforms</a></span></li><li><span><a href="#Spelled-out-words-and-initialisms-spoken-as-letter-sequences" data-toc-modified-id="Spelled-out-words-and-initialisms-spoken-as-letter-sequences-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Spelled out words and initialisms spoken as letter sequences</a></span></li><li><span><a href="#Common-speech-collocations" data-toc-modified-id="Common-speech-collocations-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Common speech collocations</a></span><ul class="toc-item"><li><span><a href="#Conclusion" data-toc-modified-id="Conclusion-3.4.1"><span class="toc-item-num">3.4.1&nbsp;&nbsp;</span>Conclusion</a></span></li></ul></li><li><span><a href="#Interrupted" data-toc-modified-id="Interrupted-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Interrupted</a></span></li></ul></li><li><span><a href="#Utterance-segmentation-of-Buckeye" data-toc-modified-id="Utterance-segmentation-of-Buckeye-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Utterance segmentation of Buckeye</a></span><ul class="toc-item"><li><span><a href="#Buckeye" data-toc-modified-id="Buckeye-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Buckeye</a></span></li><li><span><a href="#Fisher" data-toc-modified-id="Fisher-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Fisher</a></span></li><li><span><a href="#main-transcriptions" data-toc-modified-id="main-transcriptions-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>main transcriptions</a></span></li></ul></li><li><span><a href="#Utterance-segmentation" data-toc-modified-id="Utterance-segmentation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Utterance segmentation</a></span></li></ul></div>

**Eric Meinhardt / emeinhardt@ucsd.edu**

# Overview / requirements

The goal of this notebook is to produce (and document the production of) code + data for a language model that is trained on Fisher corpus transcripts and can be applied to the Buckeye corpus.

Because these are spoken corpora and were created with different annotation conventions, two steps need to be completed before this is possible.

 1. Fisher and Buckeye need common representations for speech errors/interrupted words, for things like filled pauses and backchannels ("um" vs. "umm", "mhm" vs. "mhmm"), and for initialisms; non-speech noises also need to be removed.
 2. Buckeye does not have any pre-defined utterance segmentation. I will add utterance segmentation as described in Seyfarth (2014).

I will use Seyfarth's `buckeye` package (https://github.com/scjs/buckeye) for interfacing with the Buckeye corpus.

# Import packages and data

In [2]:
import os

In [3]:
import csv

In [4]:
from boilerplate import *
from probdist import *

In [5]:
project_dir = "/mnt/cube/home/AD/emeinhar/wr"

# Vocabulary normalization

To start, we'll look at wordforms unique to Buckeye relative to Fisher and vice versa and use that to identify what needs to be normalized and how.

## Buckeye-unique wordforms

(The vocabulary files loaded below were produced by other scripts.)

### Loading the Buckeye orthographic vocabulary

In [6]:
%ls *buckeye*

buckeye_orthography_phonemic_relation_noSyllNasals.tsv
buckeye_orthography_phonemic_relation.tsv
buckeye_pronunciation_dictionary.txt


In [7]:
%cat -n buckeye_orthography_phonemic_relation.tsv | head -20

     1	Orthography	Phonemic_Transcription
     2	'em	ɛ.m
     3	Ellimen	ɛ.l.ʌ.m.ɛ.n
     4	Ellison	ɛ.l.ɪ.s.ʌ.n
     5	Ralph	ɹ.æ.l.f
     6	a	eɪ
     7	a's	eɪ.z.z
     8	aaron's	eɪ.ɹ.ʌ.n.z
     9	abandoned	ʌ.b.æ.n.d.ʌ.n.d
    10	abercrombie	æ.b.ɚ.k.ɹ.ɑ.m.b.i
    11	abhorrent	ʌ.b.h.oʊ.ɹ.ʌ.n.t
    12	abide	ʌ.b.aɪ.d
    13	ability	ʌ.b.ɪ.l.ʌ.t.i
    14	able	eɪ.b.l̩
    15	abortion	ʌ.b.oʊ.ɹ.ʃ.ʌ.n
    16	abortions	ʌ.b.oʊ.ɹ.ʃ.ʌ.n.z
    17	about	ʌ.b.aʊ.t
    18	above	ʌ.b.ʌ.v
    19	abraham	eɪ.b.ɹ.ʌ.h.æ.m
    20	abroad	ʌ.b.ɹ.ɑ.d
cat: write error: Broken pipe


In [8]:
buckeye_orth_phon_relation_fn = 'buckeye_orthography_phonemic_relation.tsv'

In [9]:
# Load the Buckeye orthography -> phonemic-transcription relation: one dict per
# TSV row, keyed by the header fields ('Orthography', 'Phonemic_Transcription').
# The transcriptions are IPA, so the encoding is pinned to UTF-8 rather than
# left to the platform default, and newline='' is passed (as the csv module
# asks) so that the reader does its own line-ending handling.
with open(buckeye_orth_phon_relation_fn, encoding='utf-8', newline='') as csv_file:
    dict_reader = csv.DictReader(csv_file, delimiter='\t')
    buckeye_orth_phon_rel = list(dict_reader)

In [10]:
len(buckeye_orth_phon_rel)
buckeye_orth_phon_rel[:5]

7998

[OrderedDict([('Orthography', "'em"), ('Phonemic_Transcription', 'ɛ.m')]),
 OrderedDict([('Orthography', 'Ellimen'),
              ('Phonemic_Transcription', 'ɛ.l.ʌ.m.ɛ.n')]),
 OrderedDict([('Orthography', 'Ellison'),
              ('Phonemic_Transcription', 'ɛ.l.ɪ.s.ʌ.n')]),
 OrderedDict([('Orthography', 'Ralph'),
              ('Phonemic_Transcription', 'ɹ.æ.l.f')]),
 OrderedDict([('Orthography', 'a'), ('Phonemic_Transcription', 'eɪ')])]

In [11]:
# The Buckeye vocabulary: the distinct orthographic wordforms in the relation.
buckeye_vocab = {entry['Orthography'] for entry in buckeye_orth_phon_rel}

In [12]:
len(buckeye_vocab)

7998

### Loading the Fisher vocabulary

In [13]:
fisher_repo_dir = '/mnt/cube/home/AD/emeinhar/fisher-lm'

In [14]:
os.chdir(fisher_repo_dir)

In [15]:
%ls *vocab*

fisher_vocabulary_bbn.txt  fisher_vocabulary_main.txt


In [16]:
%cat -n fisher_vocabulary_bbn.txt | head -20

     1	
     2	'and
     3	'berserkly'
     4	'bout
     5	'burb
     6	'burban
     7	'burbs
     8	'cau
     9	'cause
    10	'cept
    11	'cide
    12	'cisco
    13	'cize
    14	'course
    15	'cuse
    16	'do
    17	'em
    18	'em's
    19	'ems
    20	'everybody's
cat: write error: Broken pipe


In [17]:
%cat -n fisher_vocabulary_main.txt | head -20

     1	
     2	'and
     3	'berserkly'
     4	'bout
     5	'burb
     6	'burban
     7	'burbs
     8	'cau
     9	'cause
    10	'cept
    11	'cide
    12	'cisco
    13	'cize
    14	'course
    15	'cuse
    16	'do
    17	'em
    18	'em's
    19	'ems
    20	'everybody's
cat: write error: Broken pipe


In [18]:
fisher_vocab_bbn_fn = 'fisher_vocabulary_bbn.txt'
fisher_vocab_main_fn = 'fisher_vocabulary_main.txt'

In [19]:
# Read the BBN Fisher vocabulary: one entry per line, trailing whitespace dropped.
# NOTE(review): the `cat -n` preview above shows line 1 of this file as blank, so
# the empty string presumably ends up as an entry -- confirm whether that is wanted.
with open(fisher_vocab_bbn_fn) as vocab_file:
    fisher_vocab_bbn = [entry.rstrip() for entry in vocab_file]

In [20]:
# Read the main Fisher vocabulary: one entry per line, trailing whitespace dropped.
# NOTE(review): the `cat -n` preview above shows line 1 of this file as blank, so
# the empty string presumably ends up as an entry -- confirm whether that is wanted.
with open(fisher_vocab_main_fn) as vocab_file:
    fisher_vocab_main = [entry.rstrip() for entry in vocab_file]

In [21]:
len(fisher_vocab_bbn)
len(fisher_vocab_main)

42014

44065

In [22]:
fisher_vocab_bbn = set(fisher_vocab_bbn)
fisher_vocab_main = set(fisher_vocab_main)

### Normalize case

In [23]:
def compare_sizes(before, after):
    """Print the sizes of two collections as '<len(before)> vs. <len(after)>'."""
    size_before, size_after = len(before), len(after)
    print("{0} vs. {1}".format(size_before, size_after))

In [24]:
def normalize_case(vocabulary):
    """Return the set of lowercased wordforms in `vocabulary` (an iterable of strings)."""
    return {wordform.lower() for wordform in vocabulary}

In [25]:
# Lowercased copy of each vocabulary (normalize_case already returns a set).
buckeye_vocab_lc = normalize_case(buckeye_vocab)
fisher_vocab_bbn_lc = normalize_case(fisher_vocab_bbn)
fisher_vocab_main_lc = normalize_case(fisher_vocab_main)

In [26]:
compare_sizes(buckeye_vocab, buckeye_vocab_lc)
compare_sizes(fisher_vocab_bbn, fisher_vocab_bbn_lc)
compare_sizes(fisher_vocab_main, fisher_vocab_main_lc)

7998 vs. 7998
42014 vs. 42014
44065 vs. 44065


### Buckeye - main Fisher transcripts

In [27]:
buckeye_minus_fisher_main = buckeye_vocab_lc - fisher_vocab_main_lc
len(buckeye_minus_fisher_main)

506

In [28]:
buckeye_minus_fisher_main

{'aderal',
 "adriatico's",
 'aeronautical',
 'ag',
 'agitation',
 'agler',
 'ails',
 'aline',
 'amazement',
 'ambulation',
 "amy's",
 'analyzes',
 "angela's",
 'angriest',
 'anorism',
 'apologetics',
 'apostolic',
 'appaling',
 'aroma',
 'arranges',
 'auditorially',
 'auditoriums',
 'automating',
 'autoworkers',
 'babysits',
 'backbeats',
 'backstreet',
 'backwater',
 'bainby',
 'ballplayer',
 'baptism',
 'barbarism',
 'bargained',
 'basslines',
 'beatitudes',
 'beechwold',
 'bellbottoms',
 'berth',
 'bethel',
 'bexley',
 'biff',
 'biggin',
 'bioinformatics',
 'blisters',
 'bloodpressure',
 'bons',
 'booklets',
 'borden',
 "borden's",
 'botany',
 'boxspring',
 'brailles',
 'brandenberg',
 'brink',
 'butterball',
 'byrds',
 'c.a.h.s.',
 'cahs',
 'canberra',
 'canvassing',
 'caregivers',
 'carryable',
 'cartoony',
 'carwash',
 'ccd',
 "center'd",
 'cesareans',
 'chants',
 'charleton',
 'chauvinist',
 'cheapskate',
 'check-up',
 'cheetohs',
 'chested',
 'chilicothe',
 'chillers',
 'chocol

Of interest/potentially worth following up on: `mm-hmm` and (maaaaybe, as an example of how initialisms are handled) `c.a.h.s.`

### Buckeye - BBN Fisher transcripts

In [29]:
buckeye_minus_fisher_bbn = buckeye_vocab_lc - fisher_vocab_bbn_lc
len(buckeye_minus_fisher_bbn)

545

In [30]:
buckeye_minus_fisher_bbn - buckeye_minus_fisher_main

{'appreciable',
 'buts',
 'clearinghouse',
 'compartment',
 'concerted',
 'deads',
 'displacing',
 'emory',
 'epidural',
 'faintest',
 'foolishly',
 'glam',
 'godliness',
 "grandparent's",
 'gravesite',
 'henderson',
 'inequitable',
 'inequity',
 'infuse',
 'intangibles',
 'janelle',
 'jubilee',
 'marysville',
 'oasis',
 'pistols',
 'referendum',
 'richland',
 'salvage',
 'segregating',
 'shawn',
 'silence',
 'tardy',
 'timeless',
 'twenty-five',
 'um-huh',
 'ushered',
 'watertown',
 'westerville',
 'wrestlers'}

Of interest/potentially worth following up on: `um-huh`

## Interrupted wordforms

Wordforms that are broken off (or resumed in the middle) by the speaker are indicated in the Fisher corpus transcripts by a dash at the end (or beginning) of the (orthographic) word, e.g.

 - from the main transcription: 
 
```
20.20 27.12 B: me neither my best friend's not worth a mill- my best friend's worth than a billion dollars would not trade her for the whole world

26.23 32.09 A: [noise] right i wouldn't um do you kid- esp- uh today i mean
```

 - from the BBN transcription:
 
 ```
 [NOISE] RIGHT I WOULDN'T UM DO YOU KID- ESP- UH TODAY I MEAN  (fe_03_05863-A-0008)
PEOPLE LOOK AT MONEY MONEY IS EVERYTHING IT'S LIKE THEY THINK THAT MONEY'S GOING TO RESOLVE ALL THEIR PROBLEMS AND A MILLION DOLLARS TODAY IS NOT A LOT OF MONEY AND I WOULD AND THAT'S A BETRAYAL TO ME [NOISE]  (fe_03_05863-A-0009)
 ```

In [31]:
def hasDash(w):
    """Return True if the wordform `w` contains a dash ('-') anywhere."""
    return '-' in w
len(set(filter(hasDash, fisher_vocab_bbn_lc)))
len(set(filter(hasDash, fisher_vocab_main_lc)))
len(set(filter(hasDash, buckeye_vocab_lc)))

301

606

24

In [32]:
set(filter(hasDash, buckeye_vocab_lc))

{'check-up',
 'closed-minded',
 'cop-out',
 'cross-trained',
 'huh-uh',
 'hum-um',
 'im-ing',
 'la-dee-da',
 'mm-hmm',
 'out-r-inn',
 'pooh-poohing',
 'posi-',
 'pre-college',
 're-replace',
 'shut-outs',
 'tom-boy',
 'twenty-five',
 'uh-huh',
 'uh-hum',
 'uh-oh',
 'uh-uh',
 'um-hmm',
 'um-huh',
 'um-hum'}

In [33]:
def hasBrackets(w):
    """Return True if `w` contains an opening or a closing square bracket."""
    if '[' in w:
        return True
    return ']' in w
def hasParens(w):
    """Return True if `w` contains an opening or a closing parenthesis."""
    if '(' in w:
        return True
    return ')' in w
set(filter(hasBrackets, fisher_vocab_bbn_lc))
set(filter(hasParens, fisher_vocab_bbn_lc))
set(filter(hasBrackets, fisher_vocab_main_lc))
set(filter(hasParens, fisher_vocab_main_lc))

set()

set()

set()

set()

## Spelled out words and initialisms spoken as letter sequences

 - Initialisms (and spelled out words) in Fisher were transcribed using underscores and periods.
 - According to the Buckeye corpus manual (p. 9 https://buckeyecorpus.osu.edu/BuckeyeCorpusmanual.pdf), initialisms that are spoken letter by letter were supposed to be transcribed with spaces between each of the (capitalized) letters, except for `OK` (which is supposed to be rendered as `okay`) and `TV` (which is supposed to be rendered as `TV`).
   - In fact, it looks like each letter of each initialism was annotated as a separate word (with the exception of `c.a.h.s.` and `TV` as `tv`).

In [34]:
def hasUnderscore(w):
    """Return True if the wordform `w` contains an underscore ('_') anywhere."""
    return '_' in w
set(filter(hasUnderscore, buckeye_vocab_lc)) #empty
bbn_initialisms = set(filter(hasUnderscore, fisher_vocab_bbn_lc))
len(bbn_initialisms)
main_initialisms = set(filter(hasUnderscore, fisher_vocab_main_lc))
len(main_initialisms)
bbn_initialisms - main_initialisms
main_initialisms

set()

824

906

set()

{'a._a.',
 'a._a._r._p.',
 'a._a._u.',
 'a._b.',
 'a._b._c.',
 "a._b._c.'s",
 'a._b._c._d.',
 'a._b._c.s',
 'a._c.',
 'a._c._c.',
 'a._c._d._c.',
 'a._c._l.',
 'a._c._l._u.',
 'a._c._m._e.',
 'a._c._n.',
 'a._c._o._a.',
 'a._c._p.',
 'a._c._s.',
 'a._c._t.',
 'a._c._t.s',
 'a._d.',
 'a._d._a.',
 'a._d._d.',
 'a._d._h._d.',
 'a._d._s.',
 'a._f._c.',
 'a._f._d._c.',
 'a._f._l.',
 'a._g._p.',
 'a._h._d.',
 'a._i.',
 'a._i._d._s.',
 'a._i._t.',
 'a._j.',
 'a._j._c.',
 'a._k.',
 'a._k._a.',
 'a._k._c.',
 'a._l.',
 'a._l._o.',
 'a._l._p._a.',
 'a._m.',
 "a._m.'s",
 'a._m._c.',
 "a._m._c.'s",
 'a._m._c.s',
 'a._m._d.',
 'a._n._r.',
 'a._o.',
 'a._o._l.',
 'a._p.',
 'a._p._a.',
 'a._p._n.',
 'a._p._o.',
 'a._r.',
 'a._s.',
 'a._s._a._m.',
 'a._s._c.',
 'a._s._l.',
 'a._s._p._c._a.',
 'a._s._t._u.',
 'a._t.',
 'a._t._c.',
 'a._t._m.',
 "a._t._m.'s",
 'a._t._m.s',
 'a._t._p.',
 'a._t._s.',
 'a._t._v.',
 "a._t._v.'s",
 'a._t._v.s',
 'a._u.',
 'a._w._a._c._s.',
 'a._z._n.',
 'b._a.',
 'b._a._d.',


In [35]:
q = 'a._a.'

In [36]:
q.replace('.', '')

'a_a'

In [37]:
def undoInitialismRep(s):
    """Strip the initialism markup from `s` by deleting every '.' and every '_'."""
    without_periods = s.replace('.','')
    return without_periods.replace('_','')
undoneInitalisms = set(map(undoInitialismRep, main_initialisms))
len(undoneInitalisms)

900

In [38]:
undoneInitialismsAlsoInFisherVocab = {i for i in undoneInitalisms if i in fisher_vocab_main_lc}
len(undoneInitialismsAlsoInFisherVocab)
list(undoneInitialismsAlsoInFisherVocab)[:10]
len(undoneInitialismsAlsoInFisherVocab & buckeye_vocab_lc)
undoneInitialismsAlsoInFisherVocab & buckeye_vocab_lc

251

['its', 'ku', 'fisher', 'wwe', 'pcs', 'ac', 'al', 'us', 'unt', 'um']

68

{'act',
 'acts',
 'ad',
 'add',
 'aids',
 'al',
 'am',
 'as',
 'at',
 'bad',
 'bet',
 'cap',
 'cat',
 'cd',
 'co',
 'da',
 'dare',
 'do',
 'dos',
 'dot',
 'em',
 'er',
 'fit',
 'flu',
 'go',
 'hi',
 'in',
 'ins',
 'it',
 "it's",
 'its',
 'jet',
 'ken',
 'la',
 'lan',
 'las',
 'lax',
 'me',
 'meat',
 'mist',
 'mm',
 'mud',
 'noise',
 'oh',
 'ok',
 'pal',
 'phd',
 'programs',
 'sad',
 'sat',
 'scott',
 'sea',
 'so',
 'tee',
 'the',
 'tv',
 "tv's",
 'uh',
 'um',
 'up',
 'ups',
 'us',
 'visa',
 'what',
 'wick',
 'yard',
 'yes',
 'ymca'}

In [39]:
'OK' in buckeye_vocab_lc
'okay' in buckeye_vocab_lc

False

True

In [40]:
'TV' in buckeye_vocab #????
'tv' in buckeye_vocab
'tv' in buckeye_vocab_lc

False

True

True

In [41]:
{w for w in buckeye_vocab_lc if '.' in w}

{'c.a.h.s.'}

## Common speech collocations

In [42]:
'gonna' in fisher_vocab_bbn_lc
'gonna' in fisher_vocab_main_lc

'wanna' in fisher_vocab_bbn_lc
'wanna' in fisher_vocab_main_lc

'yknow' in fisher_vocab_bbn_lc #False
'yknow' in fisher_vocab_main_lc #False


'kinda' in fisher_vocab_bbn_lc
'kinda' in fisher_vocab_main_lc

'sorta' in fisher_vocab_bbn_lc
'sorta' in fisher_vocab_main_lc

'hafta' in fisher_vocab_bbn_lc #False
'hafta' in fisher_vocab_main_lc #False

True

True

True

True

False

False

True

True

True

True

False

False

In [72]:
'gonna' in buckeye_minus_fisher_main
'wanna' in buckeye_minus_fisher_main
'yknow' in buckeye_minus_fisher_main
'kinda' in buckeye_minus_fisher_main
'sorta' in buckeye_minus_fisher_main
'hafta' in buckeye_minus_fisher_main

False

False

True

False

False

True

In [43]:
len(buckeye_vocab)
len(buckeye_vocab_lc)
' '
len( buckeye_vocab - fisher_vocab_bbn )      #bbn vocab is monocase
len( buckeye_vocab_lc - fisher_vocab_bbn_lc )
' '
len( buckeye_vocab - fisher_vocab_main )
len( buckeye_vocab_lc - fisher_vocab_main_lc )

7998

7998

' '

546

545

' '

507

506

In [44]:
# Wordforms that are missing from main Fisher only because of letter case:
# lowercase the case-sensitive difference, then remove the case-insensitive one.
normalize_case(buckeye_vocab - fisher_vocab_main) - (buckeye_vocab_lc - fisher_vocab_main_lc)

{'ralph'}

### Conclusion

**TODO (unfinished conclusion):** After removing tokens from Fisher transcripts and …

## Interrupted

# Utterance segmentation of Buckeye

## Buckeye

In [48]:
os.chdir(project_dir)

In [49]:
%ls *buckeye*

buckeye_orthography_phonemic_relation_noSyllNasals.tsv
buckeye_orthography_phonemic_relation.tsv
buckeye_pronunciation_dictionary.txt


In [50]:
# see https://github.com/scjs/buckeye 
import buckeye as b

In [51]:
buckeye_dir = '/mnt/cube/home/AD/emeinhar/buckeye-zips'

In [52]:
os.chdir(buckeye_dir)
os.getcwd()
print(os.listdir())

'/mnt/cube/home/AD/emeinhar/buckeye-zips'

['s05.zip', 's02.zip', 's39.zip', 's37.zip', 's30.zip', 's21.zip', 's26.zip', 's28.zip', 's31.zip', 's36.zip', 's38.zip', 's03.zip', 's04.zip', 's15.zip', 's12.zip', 's29.zip', 's27.zip', 's16.zip', 's18.zip', 's24.zip', 's35.zip', 's40.zip', 's09.zip', 's07.zip', 's19.zip', 's17.zip', 's10.zip', 's01.zip', 's06.zip', 's34.zip']


In [53]:
# Load every speaker archive in buckeye_dir and gather all of their tracks.
# Only *.zip entries are used, in sorted order, and each is opened by its full
# path, so the cell does not depend on the current working directory, on stray
# files in that directory, or on the arbitrary order os.listdir() returns.
files = sorted(fn for fn in os.listdir(buckeye_dir) if fn.endswith('.zip'))
speakers = [b.Speaker.from_zip(os.path.join(buckeye_dir, file)) for file in files]
len(speakers)
# NOTE(review): `union` is presumably provided by the star-imports at the top of
# the notebook and merges a list of sets into one set -- confirm. Because the
# tracks pass through sets, the order of `tracks` may differ between runs.
tracks = list(union([set(s.tracks) for s in speakers]))
len(tracks)

30

193

In [59]:
t = tracks[0]

In [60]:
t.words

[Pause('{B_TRANS}', 0.0, 2.405),
 Pause('<IVER>', 2.405, 2.505),
 Pause('<IVER-well_but>', 2.505, 2.831),
 Pause('<IVER>', 2.831, 4.087),
 Word('all', 4.087, 4.332, ['aa', 'l'], ['ao', 'l'], 'DT'),
 Word('the', 4.332, 4.446, ['dh', 'iy'], ['dh', 'ah'], 'DT'),
 Word('politicians', 4.446, 5.8, ['p', 'aa', 'l', 'ih', 't', 'ih', 'sh', 'ah', 'n', 'z'], ['p', 'aa', 'l', 'ih', 't', 'iy', 'sh', 'ih', 'n', 'z'], 'NNS'),
 Pause('<SIL>', 5.8, 6.124),
 Word('all', 6.124, 6.206213, ['aa', 'l'], ['ao'], 'DT'),
 Word('right', 6.206213, 6.384, ['r', 'ay', 't'], ['r', 'ay', 't'], 'RB'),
 Word('take', 6.384, 6.615324, ['t', 'ey', 'k'], ['t', 'ey', 'k'], 'VB'),
 Word('the', 6.615324, 6.693521, ['dh', 'iy'], ['dh', 'ah'], 'DT'),
 Word('governor', 6.693521, 7.19, ['g', 'ah', 'v', 'er', 'n', 'er'], ['g', 'ah', 'v', 'n', 'er'], 'NN'),
 Word('or', 7.19, 7.262, ['ow', 'r'], ['er'], 'CC'),
 Word('take', 7.262, 7.471, ['t', 'ey', 'k'], ['t', 'ey', 'k'], 'VB'),
 Word('the', 7.471, 7.524776, ['dh', 'iy'], ['dh', '

In [67]:
# Materialize the utterances that words_to_utterances yields for this track's words.
u = list(b.utterance.words_to_utterances(t.words))

In [68]:
len(u)

22

In [70]:
u[0]

Utterance([Word('all', 4.087, 4.332, ['aa', 'l'], ['ao', 'l'], 'DT'), Word('the', 4.332, 4.446, ['dh', 'iy'], ['dh', 'ah'], 'DT'), Word('politicians', 4.446, 5.8, ['p', 'aa', 'l', 'ih', 't', 'ih', 'sh', 'ah', 'n', 'z'], ['p', 'aa', 'l', 'ih', 't', 'iy', 'sh', 'ih', 'n', 'z'], 'NNS'), Pause('<SIL>', 5.8, 6.124), Word('all', 6.124, 6.206213, ['aa', 'l'], ['ao'], 'DT'), Word('right', 6.206213, 6.384, ['r', 'ay', 't'], ['r', 'ay', 't'], 'RB'), Word('take', 6.384, 6.615324, ['t', 'ey', 'k'], ['t', 'ey', 'k'], 'VB'), Word('the', 6.615324, 6.693521, ['dh', 'iy'], ['dh', 'ah'], 'DT'), Word('governor', 6.693521, 7.19, ['g', 'ah', 'v', 'er', 'n', 'er'], ['g', 'ah', 'v', 'n', 'er'], 'NN'), Word('or', 7.19, 7.262, ['ow', 'r'], ['er'], 'CC'), Word('take', 7.262, 7.471, ['t', 'ey', 'k'], ['t', 'ey', 'k'], 'VB'), Word('the', 7.471, 7.524776, ['dh', 'iy'], ['dh', 'ah'], 'DT'), Word('president', 7.524776, 8.008, ['p', 'r', 'eh', 'z', 'ih', 'd', 'ah', 'n', 't'], ['p', 'r', 'eh', 'z', 'en'], 'NN'), Word(

In [54]:
def getOrthography(word):
    """Return the `orthography` attribute of `word`."""
    orthography = word.orthography
    return orthography

In [55]:
def isWord(container):
    """Return True if `container` is a buckeye `Word` (as opposed to, e.g., a `Pause`).

    Uses isinstance() rather than comparing type objects directly, which is the
    idiomatic type check and also accepts subclasses of `Word`.
    """
    return isinstance(container, b.containers.Word)

def getWords(track):
    """Return a list of the entries of `track.words` that satisfy isWord(), in order."""
    return list(filter(isWord, track.words))

In [56]:
words = list(union([set(getWords(t)) for t in tracks]))
len(words)

216062

In [57]:
orthographic_wordform_tokens = list(map(getOrthography, words))
len(orthographic_wordform_tokens)
orthographic_wordform_types = set(orthographic_wordform_tokens)
len(orthographic_wordform_types)

216062

7999

In [58]:
from collections import Counter

In [13]:
orthographic_wordform_counts = Counter(orthographic_wordform_tokens)

In [14]:
threshold = 5
uncommon_wordform_types = [w for w in orthographic_wordform_types if orthographic_wordform_counts[w] <= threshold]
len(uncommon_wordform_types)

6030

In [15]:
threshold = 1
uncommon_wordform_types = [w for w in orthographic_wordform_types if orthographic_wordform_counts[w] <= threshold]
len(uncommon_wordform_types)

3441

## Fisher

In [21]:
# fisher_dir = "/mnt/truffle/corpora/fisher_english_transcripts/fe_03_p2_tran"

## main transcriptions

In [22]:
# fisher_main_data_dir = os.path.join(fisher_dir, 'data' + '/' + 'trans')

In [20]:
# os.chdir(fisher_main_data_dir)
# print(os.listdir())

['058', '059', '060', '061', '062', '063', '064', '065', '066', '067', '068', '069', '070', '071', '072', '073', '074', '075', '076', '077', '078', '079', '080', '081', '082', '083', '084', '085', '086', '087', '088', '089', '090', '091', '092', '093', '094', '095', '096', '097', '098', '099', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116']


In [None]:
# def filenameToRelation_orig(fn, curr_dir):
#     return {'filename':fn,
#             'call_id':fn[6:11],
#             'location':os.path.join(main_data_dir, curr_dir + '/' + fn)}

In [None]:
# main_call_relation = []
# for folder in trans_folders:
#     os.chdir(folder)
#     for file in os.listdir():
#         main_call_relation.append(filenameToRelation_orig(file, folder))
#     os.chdir('..')

In [None]:
# def get_raw_transcription_lines(callfile_fn):
#     lines = []
#     with open(callfile_fn, 'r', encoding='latin-1') as the_file:
#         for line in the_file:
#             if line[0] != '#' and line.rstrip() != '':
#                 lines.append(line.rstrip())
#     return lines

# Utterance segmentation