In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Overview</a></span><ul class="toc-item"><li><span><a href="#Preprocessing-steps" data-toc-modified-id="Preprocessing-steps-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Preprocessing steps</a></span></li><li><span><a href="#Dependencies" data-toc-modified-id="Dependencies-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Dependencies</a></span></li><li><span><a href="#Outputs" data-toc-modified-id="Outputs-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Outputs</a></span></li></ul></li><li><span><a href="#Imports-/-loading-data" data-toc-modified-id="Imports-/-loading-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports / loading data</a></span><ul class="toc-item"><li><span><a href="#Overview-of-what's-in-the-2003-release-of-Switchboard" data-toc-modified-id="Overview-of-what's-in-the-2003-release-of-Switchboard-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Overview of what's in the 2003 release of Switchboard</a></span></li><li><span><a href="#Retrieving-files-from-conversation-ids" data-toc-modified-id="Retrieving-files-from-conversation-ids-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Retrieving files from conversation ids</a></span></li></ul></li><li><span><a href="#Parsing-and-pre-processing" data-toc-modified-id="Parsing-and-pre-processing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Parsing and pre-processing</a></span><ul class="toc-item"><li><span><a href="#Parsing-the-utterance-files" data-toc-modified-id="Parsing-the-utterance-files-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Parsing the utterance files</a></span></li><li><span><a href="#Parsing-the-word-files" data-toc-modified-id="Parsing-the-word-files-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Parsing the word files</a></span></li><li><span><a href="#Organizing-all-utterances-and-some-summary-stats" data-toc-modified-id="Organizing-all-utterances-and-some-summary-stats-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Organizing all utterances and some summary stats</a></span><ul class="toc-item"><li><span><a href="#Summary" data-toc-modified-id="Summary-3.3.1"><span class="toc-item-num">3.3.1&nbsp;&nbsp;</span>Summary</a></span></li></ul></li></ul></li><li><span><a href="#Comparison-with-Fisher" data-toc-modified-id="Comparison-with-Fisher-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Comparison with Fisher</a></span></li><li><span><a href="#Relating-the-word-and-utterance-relations" data-toc-modified-id="Relating-the-word-and-utterance-relations-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Relating the word and utterance relations</a></span><ul class="toc-item"><li><span><a href="#Mapping-each-utterance-ID-to-a-sequence-of-word-relations" data-toc-modified-id="Mapping-each-utterance-ID-to-a-sequence-of-word-relations-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Mapping each utterance ID to a sequence of word relations</a></span></li><li><span><a href="#Mapping-each-utterance-ID-to-an-orthographic-word-sequence-+-word-duration-sequence-pair" data-toc-modified-id="Mapping-each-utterance-ID-to-an-orthographic-word-sequence-+-word-duration-sequence-pair-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Mapping each utterance ID to an orthographic word sequence + word duration sequence pair</a></span></li><li><span><a href="#Making-a-word-relation-for-confusability-analysis" data-toc-modified-id="Making-a-word-relation-for-confusability-analysis-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Making a word relation for confusability analysis</a></span></li><li><span><a href="#Identifying-n-gram-contexts" data-toc-modified-id="Identifying-n-gram-contexts-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Identifying n-gram contexts</a></span></li></ul></li><li><span><a href="#Writing-things-to-file" data-toc-modified-id="Writing-things-to-file-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Writing things to file</a></span><ul class="toc-item"><li><span><a href="#Utterance-relation" data-toc-modified-id="Utterance-relation-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Utterance relation</a></span></li><li><span><a href="#Word-relation" data-toc-modified-id="Word-relation-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Word relation</a></span></li><li><span><a href="#Corpus-for-a-language-model" data-toc-modified-id="Corpus-for-a-language-model-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Corpus for a language model</a></span></li><li><span><a href="#Vocabulary-for-a-language-model" data-toc-modified-id="Vocabulary-for-a-language-model-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Vocabulary for a language model</a></span></li><li><span><a href="#Word-analysis-relation" data-toc-modified-id="Word-analysis-relation-6.5"><span class="toc-item-num">6.5&nbsp;&nbsp;</span>Word analysis relation</a></span></li><li><span><a href="#N-gram-contexts" data-toc-modified-id="N-gram-contexts-6.6"><span class="toc-item-num">6.6&nbsp;&nbsp;</span>N-gram contexts</a></span></li></ul></li></ul></div>

# Overview

The goal of this notebook is to produce a version of the 2003 release of the Switchboard corpus whose vocabulary has been normalized with respect to the Fisher corpus. The motivation for doing this is applying a language model trained on (a slightly processed version of) the Fisher corpus to Switchboard.

## Preprocessing steps

To that end, 
 1. Interrupted or broken-off wordforms - wordforms that speakers didn't finish production of or that they resumed production somewhere in the middle of have been replaced with `<rem>`.
 2. Non-speech noises (e.g. `[laughter]` or `[silence]`) are removed. Note that 'utterances' that in the original contained only things like this are not included in any post-processed data meant to be input to a language model.
 3. Words that transcribers weren't sure of(?) or that seemed like (whole words but) production errors or new coinages are apparently transcribed in curly braces. All such curly braces have been removed.
 4. The following tokens appear in the corpus with an underscore:
  - `{'<b_aside>', '<e_aside>', 'about_1', 'because_1', 'depends_1', 'especially_1', 'okay_1', "them's_1", 'them_1', "them_1's"}`
  - `_aside` tokens have been removed altogether and the trailing `_1` has been removed from all others.
 5. All characters are lower-cased.

In [2]:
import re

In [3]:
from more_itertools import replace
from funcy import compose

In [4]:
# unk = '<unk>'
unk = '<rem>'

In [5]:
interrupted_word_pattern = ".*-$"
resumed_word_pattern = "^-.*"

def isInterrupted(wordform):
    return re.match(interrupted_word_pattern, wordform) is not None

def isResumed(wordform):
    return re.match(resumed_word_pattern, wordform) is not None

def isBroken(wordform):
    return isInterrupted(wordform) or isResumed(wordform)

def hasBrokenWords(speech):
    speech_word_seq = speech.split(' ')
    broken_words = list(filter(isBroken, speech_word_seq))
    return len(broken_words) > 0

def remove_broken_words(speech, insertUnk=True):
    if insertUnk:
        replacement = unk
    else:
        replacement = ""
        
    speech_word_seq = speech.split(' ')
    speech_word_seq_fixed = ' '.join(replace(speech_word_seq, isBroken, (replacement,)))
    speech_fixed = ' '.join([w for w in speech_word_seq_fixed.split(' ') if len(w) > 0])
    return speech_fixed

remove_broken_words("we[ll]- well in our area we just introduced the um citywide")
remove_broken_words('[noise] -[o]kay')
remove_broken_words("o[kay]- [noise] -[o]kay well i think this is going to probably be the most difficult topic that i've discussed [laughter-so] [laughter-far] yes well have you returned anything lately")
remove_broken_words("well i just buy those California peeled -[to]ma[toes]- tomatoes")

'<rem> well in our area we just introduced the um citywide'

'[noise] <rem>'

"<rem> [noise] <rem> well i think this is going to probably be the most difficult topic that i've discussed [laughter-so] [laughter-far] yes well have you returned anything lately"

'well i just buy those California peeled <rem> tomatoes'

In [6]:
def remove_non_speech(speech):
    speech_filtered_words = [w for w in speech.split(' ') if len(w) > 0 and w[0] != '[' and w[-1] != ']']
    speech_out = ' '.join(speech_filtered_words)
    speech_out = speech_out.rstrip()
    return speech_out

remove_non_speech('[noise] -[o]kay')
remove_non_speech("o[kay]- [noise] -[o]kay well i think this is going to probably be the most difficult topic that i've discussed [laughter-so] [laughter-far] yes well have you returned anything lately")
remove_non_speech("n[ot]- not big[oted]- or very m[ore]- m[ore]- you know more in the twentieth century [laughter-now] let's say or or very [vocalized-noise] {unbigoted} and they even have added uh this uh other")

'-[o]kay'

"o[kay]- -[o]kay well i think this is going to probably be the most difficult topic that i've discussed yes well have you returned anything lately"

"n[ot]- not big[oted]- or very m[ore]- m[ore]- you know more in the twentieth century let's say or or very {unbigoted} and they even have added uh this uh other"

In [7]:
def hasCurlyBraces(wordform):
    return '{' in wordform or '}' in wordform

def isCurlyBraced(wordform):
    if len(wordform) == 0:
        return False
    return wordform[0] == '{' and wordform[-1] == '}'

def removeCurlyBraces(wordform):
    if not isCurlyBraced(wordform):
        return wordform
    return wordform[1:-1]

def remove_curly_braces(speech):
    speech_word_seq = speech.split(' ')
    speech_word_seq_fixed = ' '.join(list(map(removeCurlyBraces, speech_word_seq)))
    speech_fixed = ' '.join([w for w in speech_word_seq_fixed.split(' ') if len(w) > 0])
    return speech_fixed

remove_curly_braces("n[ot]- not big[oted]- or very m[ore]- m[ore]- you know more in the twentieth century [laughter-now] let's say or or very [vocalized-noise] {unbigoted} and they even have added uh this uh other")
remove_curly_braces("yeah yes i i know there's a long scientific name but it's like {polytechnochloride} and all that fun stuff um i like")
remove_curly_braces("well it it it'll it'll catch on pretty soon i'll i'll i'll {betcha} i i have a sixteen year old almost sixteen year old Golden Retriever and she's gone the other way because she's so old she doesn't have much control")
remove_curly_braces("that works all right th[ey]- they don't uh w[ant]- want the mothers to be with the little {fishies}")

"n[ot]- not big[oted]- or very m[ore]- m[ore]- you know more in the twentieth century [laughter-now] let's say or or very [vocalized-noise] unbigoted and they even have added uh this uh other"

"yeah yes i i know there's a long scientific name but it's like polytechnochloride and all that fun stuff um i like"

"well it it it'll it'll catch on pretty soon i'll i'll i'll betcha i i have a sixteen year old almost sixteen year old Golden Retriever and she's gone the other way because she's so old she doesn't have much control"

"that works all right th[ey]- they don't uh w[ant]- want the mothers to be with the little fishies"

In [8]:
def hasUnderscore(wordform):
    return '_' in wordform

def fixUnderscore(wordform):
    if not hasUnderscore(wordform):
        return wordform
    fixed = wordform.replace('_1', '')
    return fixed

def fix_underscores(speech):
    speech_word_seq = speech.split(' ')
    speech_word_seq_filtered = [w for w in speech_word_seq if w != '<b_aside>' and w != '<e_aside>']
    speech_word_seq_fixed = ' '.join(list(map(fixUnderscore, speech_word_seq_filtered)))
    speech_fixed = ' '.join([w for w in speech_word_seq_fixed.split(' ') if len(w) > 0])
    return speech_fixed

fix_underscores("because_1 yes dogs need training but cats usually don't even need any training seem like they automatically go to their little litter box they think yeah")
fix_underscores('to feed me or to pick me up and love me <b_aside> yes you sweet thing <e_aside>')

"because yes dogs need training but cats usually don't even need any training seem like they automatically go to their little litter box they think yeah"

'to feed me or to pick me up and love me yes you sweet thing'

In [9]:
def lowercase(speech):
    return speech.lower()

In [10]:
process = compose(lowercase, fix_underscores, remove_curly_braces, remove_non_speech, remove_broken_words)

process("we[ll]- well in our area we just introduced the um citywide")
process('[noise] -[o]kay')
process("o[kay]- [noise] -[o]kay well i think this is going to probably be the most difficult topic that i've discussed [laughter-so] [laughter-far] yes well have you returned anything lately")
process("well i just buy those California peeled -[to]ma[toes]- tomatoes")
process("n[ot]- not big[oted]- or very m[ore]- m[ore]- you know more in the twentieth century [laughter-now] let's say or or very [vocalized-noise] {unbigoted} and they even have added uh this uh other")
process("yeah yes i i know there's a long scientific name but it's like {polytechnochloride} and all that fun stuff um i like")
process("well it it it'll it'll catch on pretty soon i'll i'll i'll {betcha} i i have a sixteen year old almost sixteen year old Golden Retriever and she's gone the other way because she's so old she doesn't have much control")
process("that works all right th[ey]- they don't uh w[ant]- want the mothers to be with the little {fishies}")
process("because_1 yes dogs need training but cats usually don't even need any training seem like they automatically go to their little litter box they think yeah")
process('to feed me or to pick me up and love me <b_aside> yes you sweet thing <e_aside>')

'<rem> well in our area we just introduced the um citywide'

'<rem>'

"<rem> <rem> well i think this is going to probably be the most difficult topic that i've discussed yes well have you returned anything lately"

'well i just buy those california peeled <rem> tomatoes'

"<rem> not <rem> or very <rem> <rem> you know more in the twentieth century let's say or or very unbigoted and they even have added uh this uh other"

"yeah yes i i know there's a long scientific name but it's like polytechnochloride and all that fun stuff um i like"

"well it it it'll it'll catch on pretty soon i'll i'll i'll betcha i i have a sixteen year old almost sixteen year old golden retriever and she's gone the other way because she's so old she doesn't have much control"

"that works all right <rem> they don't uh <rem> want the mothers to be with the little fishies"

"because yes dogs need training but cats usually don't even need any training seem like they automatically go to their little litter box they think yeah"

'to feed me or to pick me up and love me yes you sweet thing'

## Dependencies
 - `more_itertools`, `funcy`, `joblib` and Unix shell command cell/line magics are used throughout, though none have essential functionality that couldn't relatively easily be replaced by something else if necessary.

## Outputs

If run successfully, this notebook will create nine files as outputs:
 1. A .json file containing a list of objects (Python dictionaries), where each object is a finitary relation describing an utterance (and associated metadata) in the Switchboard corpus.
 2. A .json file containing a list of objects (Python dictionaries), where each object is a finitary relation describing a wordform token (and associated metadata) in the Switchboard corpus.
 3. A .txt file containing one utterance from Switchboard per line.
 4. A .txt file containing the vocabulary (one wordform per line) of the previous file.
 5. A .json file containing a further-annotated version of the word relation (#2 above) for other research purposes.
 6. A set of .txt files containing the unigram, bigram, trigram, and fourgram contexts that occur in file #3.

# Imports / loading data

In [11]:
import os
import csv
import json

In [12]:
from itertools import filterfalse, chain

In [13]:
from collections import Counter

In [14]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [15]:
switchboard_lm_dir = '/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [16]:
from joblib import Parallel, delayed

J = 30
BACKEND = 'multiprocessing'
# BACKEND = 'loky'
V = 10
PREFER = 'processes'
# PREFER = 'threads'

def par(gen_expr):
    return Parallel(n_jobs=J, backend=BACKEND, verbose=V, prefer=PREFER)(gen_expr)

## Overview of what's in the 2003 release of Switchboard

See https://www.isip.piconepress.com/projects/switchboard/.

In [17]:
swbd2003_dir = "/mnt/truffle/corpora/switchboard_word_alignments/swb_ms98_transcriptions"

In [18]:
os.chdir(swbd2003_dir)
os.listdir()

['20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 'AAREADME.text',
 'sw-ms98-dict.text']

In [19]:
%cat -n AAREADME.text | head -100

     1	This release contains manually corrected word alignments and transcriptions,
     2	and an updated dictionary.  
     3	
     4	Word alignments were generated by using the most recent release of the
     5	transcriptions and performing forced alignments using ISIP's Hub 5E
     6	recognition system (which is also publicly available). These
     7	alignments were manually reviewed for accurate discrimination
     8	between speech and non-speech.
     9	
    10	The released data is available at:
    11	
    12	 http://www.isip.msstate.edu/projects/switchboard/releases/switchboard_word_alignments.tar.gz
    13	
    14	As always, let us know if you have any questions. 
    15	
    16	Files contained in this release:
    17	
    18	2001
    19	 sw2001A-ms98-a-trans.text
    20	 sw2001A-ms98-a-word.text
    21	 sw2001B-ms98-a-trans.text
    22	 sw2001B-ms98-a-word.text
    23	2005
    24	 sw2005A-ms98-a-trans.text
    25	 sw2005A-ms98-a-word.text
    26	 sw200

In [20]:
swbd2003_folders = list(filterfalse(lambda s: s[-5:] == '.text', os.listdir()))
swbd2003_folders

['20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49']

In [21]:
%ls 20/2001

[0m[01;32msw2001A-ms98-a-trans.text[0m*  [01;32msw2001B-ms98-a-trans.text[0m*
[01;32msw2001A-ms98-a-word.text[0m*   [01;32msw2001B-ms98-a-word.text[0m*


In [22]:
# swbd2003_files = []
conversation_ids = []
for folder in swbd2003_folders:
    os.chdir(folder)
    conversation_ids.extend(os.listdir())
    os.chdir('..')
len(conversation_ids)
conversation_ids[:30]

2438

['2001',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2012',
 '2013',
 '2014',
 '2015',
 '2017',
 '2018',
 '2019',
 '2020',
 '2022',
 '2023',
 '2024',
 '2025',
 '2027',
 '2028',
 '2032',
 '2035',
 '2036',
 '2038',
 '2039',
 '2040',
 '2041',
 '2044',
 '2045']

In [23]:
%cat -n ./20/2001/sw2001A-ms98-a-trans.text | head -50

     1	sw2001A-ms98-a-0001 0.000000 0.977625 [silence]
     2	sw2001A-ms98-a-0002 0.977625 11.561375 hi um yeah i'd like to talk about how you dress for work and and um what do you normally what type of outfit do you normally have to wear
     3	sw2001A-ms98-a-0003 11.561375 19.804875 [silence]
     4	sw2001A-ms98-a-0004 19.804875 21.312375 um-hum
     5	sw2001A-ms98-a-0005 21.312375 27.362000 [silence]
     6	sw2001A-ms98-a-0006 27.362000 28.932750 and is
     7	sw2001A-ms98-a-0007 28.932750 33.898250 [silence]
     8	sw2001A-ms98-a-0008 33.898250 40.123250 right right is there is there um an[y]- is there a like a code of dress where you work do they ask
     9	sw2001A-ms98-a-0009 40.123250 41.552625 right
    10	sw2001A-ms98-a-0010 41.552625 50.129500 [silence]
    11	sw2001A-ms98-a-0011 50.129500 51.567500 [noise] right
    12	sw2001A-ms98-a-0012 51.567500 56.957000 [silence]
    13	sw2001A-ms98-a-0013 56.957000 65.593625 right right and does it does it change i guess um

In [24]:
%cat -n ./20/2001/sw2001A-ms98-a-word.text | head -100

     1	sw2001A-ms98-a-0001 0.000000 0.977625 [silence]
     2	sw2001A-ms98-a-0002 0.977625 1.215250 [silence]
     3	sw2001A-ms98-a-0002 1.215250 1.724625 hi
     4	sw2001A-ms98-a-0002 1.724625 2.273625 [silence]
     5	sw2001A-ms98-a-0002 2.273625 2.927625 um
     6	sw2001A-ms98-a-0002 2.927625 3.221500 [silence]
     7	sw2001A-ms98-a-0002 3.221500 3.661750 yeah
     8	sw2001A-ms98-a-0002 3.661750 3.957625 i'd
     9	sw2001A-ms98-a-0002 3.957625 4.107625 like
    10	sw2001A-ms98-a-0002 4.107625 4.267625 to
    11	sw2001A-ms98-a-0002 4.267625 4.527625 talk
    12	sw2001A-ms98-a-0002 4.527625 4.941625 about
    13	sw2001A-ms98-a-0002 4.941625 5.126125 [silence]
    14	sw2001A-ms98-a-0002 5.126125 5.307625 how
    15	sw2001A-ms98-a-0002 5.307625 5.437625 you
    16	sw2001A-ms98-a-0002 5.437625 5.735375 dress
    17	sw2001A-ms98-a-0002 5.735375 5.901125 [silence]
    18	sw2001A-ms98-a-0002 5.901125 6.077625 for
    19	sw2001A-ms98-a-0002 6.077625 6.477625 work
    20	sw

For each of ≈2400 conversations, there is 
 - a sequence of utterances for speaker A.
 - a sequence of utterances for speaker B.
 - a sequence of orthographic wordforms (and non-speech events) for speaker A.
 - a sequence of orthographic wordforms (and non-speech events) for speaker B.
 
For each utterance, there is a start and end time.

For each orthographic wordform or non-speech event, there is a duration.

## Retrieving files from conversation ids

In [25]:
%ls 20/2001

[0m[01;32msw2001A-ms98-a-trans.text[0m*  [01;32msw2001B-ms98-a-trans.text[0m*
[01;32msw2001A-ms98-a-word.text[0m*   [01;32msw2001B-ms98-a-word.text[0m*


In [26]:
conversation_ids = set(conversation_ids)

In [27]:
def getDir(conversation_id):
    assert conversation_id in conversation_ids, '{0} is not a conversation id'.format(conversation_id)
    firstTwoDigits = conversation_id[:2]
    return os.path.join(swbd2003_dir, firstTwoDigits + '/' + conversation_id)

def getFilenames(conversation_id):
    assert conversation_id in conversation_ids, '{0} is not a conversation id'.format(conversation_id)
    return os.listdir(getDir(conversation_id))

getDir('2045')
getFilenames('2045')

'/mnt/truffle/corpora/switchboard_word_alignments/swb_ms98_transcriptions/20/2045'

['sw2045A-ms98-a-trans.text',
 'sw2045A-ms98-a-word.text',
 'sw2045B-ms98-a-trans.text',
 'sw2045B-ms98-a-word.text']

In [28]:
def getUtteranceFP(conversation_id, speaker):
    fp_prefix = getDir(conversation_id)
    
    assert speaker in {'A', 'B'}, 'Speaker must be either "A" or "B", got {0}'.format(speaker)
    
    fn_prefix = 'sw'
    fn_suffix = '-ms98-a-trans.text'
    fn = fn_prefix + conversation_id + speaker + fn_suffix
    assert fn in set(getFilenames(conversation_id))
    
    return os.path.join(fp_prefix, fn)

def getWordFP(conversation_id, speaker):
    fp_prefix = getDir(conversation_id)
    
    assert speaker in {'A', 'B'}, 'Speaker must be either "A" or "B", got {0}'.format(speaker)
    
    fn_prefix = 'sw'
    fn_suffix = '-ms98-a-word.text'
    fn = fn_prefix + conversation_id + speaker + fn_suffix
    assert fn in set(getFilenames(conversation_id))
    
    return os.path.join(fp_prefix, fn)

getUtteranceFP('2045','A')
getWordFP('2045','A')

'/mnt/truffle/corpora/switchboard_word_alignments/swb_ms98_transcriptions/20/2045/sw2045A-ms98-a-trans.text'

'/mnt/truffle/corpora/switchboard_word_alignments/swb_ms98_transcriptions/20/2045/sw2045A-ms98-a-word.text'

In [29]:
%cat -n /mnt/truffle/corpora/switchboard_word_alignments/swb_ms98_transcriptions/20/2045/sw2045A-ms98-a-trans.text | head -10

     1	sw2045A-ms98-a-0001 0.000000 1.411625 all right
     2	sw2045A-ms98-a-0002 1.411625 3.668625 i play volleyball and
     3	sw2045A-ms98-a-0003 3.668625 6.593875 softball and ceramics
     4	sw2045A-ms98-a-0004 6.593875 19.817125 [silence]
     5	sw2045A-ms98-a-0005 19.817125 21.945500 i didn't think of working out as a hobby
     6	sw2045A-ms98-a-0006 21.945500 22.259750 [laughter]
     7	sw2045A-ms98-a-0007 22.259750 25.479625 or playing with the computer i do both at home
     8	sw2045A-ms98-a-0008 25.479625 35.911625 [silence]
     9	sw2045A-ms98-a-0009 35.911625 43.178125 uh we've gotten a little Atari computer uh husband describes it as a a computer with training wheels
    10	sw2045A-ms98-a-0010 43.178125 44.533500 uh


# Parsing and pre-processing

## Parsing the utterance files

In [30]:
def parse_swbd_utterance_line(line):
    parts = line.split(' ')
#     parts = [e for e in line.split(' ') if e != '']
#     return parts
    utterance_id = parts[0]
    
    start = float(parts[1])
    end = float(parts[2])
    duration = end - start
    
    if len(parts) > 3:
        speech_raw_words = parts[3:]
        speech_raw = ' '.join( speech_raw_words )
    else:
#         speech_raw_words = 
        speech_raw = ''
    
    if speech_raw == '':
        speech_OnlyWords = ''
        has_word_internal_brackets = False
    else:
        speech_wordforms = speech_raw.split(' ')
        speech_wordforms = [w for w in speech_wordforms if len(w) > 0 and w[0] != '[' and w[-1] != ']']
        speech_OnlyWords = ' '.join(speech_wordforms)
        hasBrackets = lambda w: "[" in w or "]" in w
        has_word_internal_brackets = True if any(list(map(hasBrackets, speech_wordforms))) else False
        speech = process(speech_raw)
#         if any(list(map(hasBrackets, speech_wordforms))):
#             raise Exception("Word-internal brackets found in\n\t{0}".format(speech_OnlyWords))
    
#     has_words = False if len(speech_raw) == 0 or (speech_raw[0] == '[' and speech_raw[-1] == ']') else True
    has_words = False if speech == '' else True
    
    line_rel = {'utterance_id':utterance_id,
                'conversation_id':utterance_id[2:6],
                'speaker':utterance_id[6:7],
                'start':start,
                'end':end,
                'duration':duration,
                'speech_raw':speech_raw,
#                 'speech_words':speech_OnlyWords,
                'speech':speech,
#                 'n_words':len(speech_OnlyWords.split(' ')),
                'n_words':0 if not has_words else len(speech.split(' ')),
                'has_words':has_words,
                'has_word_internal_brackets':has_word_internal_brackets
#                 'speech_noquotes':speech_noquotes
               }
    
    return line_rel

In [31]:
def read_swbd_file(swbd_fp):
    lines = []
    with open(swbd_fp, 'r') as the_file:
        for line in the_file:
            lines.append(line.rstrip())
    return lines

def get_utterance_relations(conversation_id, speaker):
    utterance_fp = getUtteranceFP(conversation_id, speaker)
    file_lines = read_swbd_file(utterance_fp)
    parsed_rel = tuple([parse_swbd_utterance_line(l) for l in file_lines])
    return parsed_rel

In [32]:
utts_2045A = get_utterance_relations('2045', 'A')
utts_2045A[:10]

({'utterance_id': 'sw2045A-ms98-a-0001',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 0.0,
  'end': 1.411625,
  'duration': 1.411625,
  'speech_raw': 'all right',
  'speech': 'all right',
  'n_words': 2,
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0002',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 1.411625,
  'end': 3.668625,
  'duration': 2.257,
  'speech_raw': 'i play volleyball and',
  'speech': 'i play volleyball and',
  'n_words': 4,
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0003',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 3.668625,
  'end': 6.593875,
  'duration': 2.9252499999999997,
  'speech_raw': 'softball and ceramics',
  'speech': 'softball and ceramics',
  'n_words': 3,
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0004',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 6.

## Parsing the word files

In [33]:
%cat -n ./20/2001/sw2001A-ms98-a-word.text | head -10

     1	sw2001A-ms98-a-0001 0.000000 0.977625 [silence]
     2	sw2001A-ms98-a-0002 0.977625 1.215250 [silence]
     3	sw2001A-ms98-a-0002 1.215250 1.724625 hi
     4	sw2001A-ms98-a-0002 1.724625 2.273625 [silence]
     5	sw2001A-ms98-a-0002 2.273625 2.927625 um
     6	sw2001A-ms98-a-0002 2.927625 3.221500 [silence]
     7	sw2001A-ms98-a-0002 3.221500 3.661750 yeah
     8	sw2001A-ms98-a-0002 3.661750 3.957625 i'd
     9	sw2001A-ms98-a-0002 3.957625 4.107625 like
    10	sw2001A-ms98-a-0002 4.107625 4.267625 to


In [34]:
def parse_swbd_word_line(line):
#     parts = line.split(' ')
    #SOME of the word files are tab separated
    #SOME use four spaces
    #SOME use one space
    # !!!!! who the hell let this happen ???
    if '\t' in line:
        parts = line.split('\t')
    else:
        parts = [e for e in line.split(' ') if e != '']
#     return parts
    utterance_id = parts[0]
    
    start = float(parts[1])
    end = float(parts[2])
    duration = end - start
    
    if len(parts) > 3:
        speech_raw_words = parts[3:]
        speech_raw = ' '.join( speech_raw_words )
    else:
#         speech_raw_words = 
        speech_raw = ''
    
    if speech_raw == '':
        speech_OnlyWords = ''
        has_word_internal_brackets = False
    else:
        speech_wordforms = speech_raw.split(' ')
        speech_wordforms = [w for w in speech_wordforms if len(w) > 0 and w[0] != '[' and w[-1] != ']']
        speech_OnlyWords = ' '.join(speech_wordforms)
        hasBrackets = lambda w: "[" in w or "]" in w
        has_word_internal_brackets = True if any(list(map(hasBrackets, speech_wordforms))) else False
        speech = process(speech_raw)
#         if any(list(map(hasBrackets, speech_wordforms))):
#             raise Exception("Word-internal brackets found in\n\t{0}".format(speech_OnlyWords))
    
#     has_words = False if len(speech_raw) == 0 or (speech_raw[0] == '[' and speech_raw[-1] == ']') else True
    has_words = False if speech == '' else True
    
    line_rel = {'utterance_id':utterance_id,
                'conversation_id':utterance_id[2:6],
                'speaker':utterance_id[6:7],
                'start':start,
                'end':end,
                'duration':duration,
                'speech_raw':speech_raw,
#                 'speech_words':speech_OnlyWords,
                'speech':speech,
                'has_words':has_words,
                'has_word_internal_brackets':has_word_internal_brackets
#                 'speech_noquotes':speech_noquotes
               }
    
    return line_rel

In [35]:
def get_word_relations(conversation_id, speaker):
    word_fp = getWordFP(conversation_id, speaker)
    file_lines = read_swbd_file(word_fp)
    parsed_rel = tuple([parse_swbd_word_line(l) for l in file_lines])
    return parsed_rel

In [36]:
words_2045A = get_word_relations('2045', 'A')
words_2045A[:10]

({'utterance_id': 'sw2045A-ms98-a-0001',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 0.0,
  'end': 0.47125,
  'duration': 0.47125,
  'speech_raw': '[silence]',
  'speech': '',
  'has_words': False,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0001',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 0.47125,
  'end': 0.640625,
  'duration': 0.169375,
  'speech_raw': 'all',
  'speech': 'all',
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0001',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 0.640625,
  'end': 0.909125,
  'duration': 0.26849999999999996,
  'speech_raw': 'right',
  'speech': 'right',
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2045A-ms98-a-0001',
  'conversation_id': '2045',
  'speaker': 'A',
  'start': 0.909125,
  'end': 1.411625,
  'duration': 0.5025,
  'speech_raw': '[silence]',
  'speech': '',
  'has_words': False,
  'has_

In [37]:
# words = [get_word_relations(convo_id, spkr) for convo_id in conversation_ids for spkr in ('A','B')]

#~6s w/ J = 30 on wittgenstein
words = par(delayed(get_word_relations)(convo_id, spkr) for convo_id in conversation_ids for spkr in ('A','B'))
len(words)

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=30)]: Batch computation too fast (0.0220s.) Setting batch_size=18.
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    0.1s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=30)]: Done 204 tasks      | elapsed:    1.0s
[Parallel(n_jobs=30)]: Done 510 tasks      | elapsed:    1.6s
[Parallel(n_jobs=30)]: Done 816 tasks      | elapsed:    2.2s
[Parallel(n_jobs=30)]: Batch computation too slow (2.0016s.) Setting batch_size=9.
[Parallel(n_jobs=30)]: Done 1158 tasks      | elapsed:    2.6s
[Parallel(n_jobs=30)]: Done 1500 tasks      | elapsed:    3.2s
[Parallel(n_jobs=30)]: Done 1851 tasks      | elapsed:    3.6s
[Parallel(n_jobs=30)]: Done 2103 tasks      | elapsed:    4.

4876

In [38]:
words = list(chain.from_iterable(words))
len(words)

4051206

In [39]:
words_with_brackets = [w for w in words if '[' in w['speech_raw'] or ']' in w['speech_raw']]
len(words_with_brackets)

1002688

In [40]:
len([w for w in words if w['has_words']])

3071988

## Organizing all utterances and some summary stats

In [41]:
len(conversation_ids)

2438

In [42]:
#46.1 s on wittgenstein
utterances = []
for conversation_id in conversation_ids:
    utterances.extend(get_utterance_relations(conversation_id, 'A'))
    utterances.extend(get_utterance_relations(conversation_id, 'B'))
len(utterances)

391593

In [43]:
utterances_with_some_words = [u for u in utterances if u['has_words']]
len(utterances_with_some_words)

248826

In [44]:
def getWordformTokens(utterance_rel):
#     return utterance_rel['speech_words'].split(' ')
    return utterance_rel['speech'].split(' ')

In [45]:
wordformTokens = []
for u in utterances_with_some_words:
    wordformTokens.extend(getWordformTokens(u))
len(wordformTokens)

3071988

In [46]:
orthographic_wordform_counter_swbd = Counter(wordformTokens)
orthographic_wordform_counter_swbd.most_common()

[('i', 118103),
 ('and', 108691),
 ('the', 97331),
 ('you', 81057),
 ('a', 72629),
 ('to', 70475),
 ('uh', 69814),
 ('that', 66114),
 ('it', 55323),
 ('of', 55011),
 ('know', 48002),
 ('yeah', 47322),
 ('in', 39968),
 ('they', 32992),
 ('have', 29470),
 ('but', 28913),
 ('<rem>', 28531),
 ("it's", 27382),
 ('so', 27145),
 ('we', 25379),
 ('is', 24448),
 ('was', 23850),
 ('like', 23386),
 ('well', 22578),
 ('just', 22286),
 ('um', 21196),
 ("that's", 20848),
 ('do', 20671),
 ('for', 19691),
 ('think', 19149),
 ("don't", 18897),
 ('oh', 18825),
 ('or', 16928),
 ('on', 16749),
 ('right', 16732),
 ('uh-huh', 16294),
 ('um-hum', 16006),
 ('my', 14924),
 ('what', 14923),
 ('really', 14798),
 ('not', 14395),
 ('be', 14094),
 ('with', 13949),
 ('are', 13291),
 ('if', 13233),
 ('there', 12842),
 ('one', 12657),
 ("i'm", 12356),
 ('about', 12098),
 ('all', 11926),
 ('get', 11738),
 ('because', 11372),
 ('out', 10932),
 ('had', 10912),
 ('at', 10698),
 ('them', 10371),
 ('as', 10040),
 ('up', 982

In [47]:
orthographic_wordform_types = set(wordformTokens)
len(orthographic_wordform_types)

27559

In [48]:
hasSquareBrackets = lambda w: "[" in w or "]" in w
orthographic_wordforms_with_brackets = {w for w in orthographic_wordform_types if hasSquareBrackets(w)}
len(orthographic_wordforms_with_brackets)
orthographic_wordforms_with_brackets

0

set()

In [49]:
{w for w in orthographic_wordforms_with_brackets if w[0] == '-'}

set()

In [50]:
utts_with_brackets = [u for u in utterances_with_some_words if u['has_word_internal_brackets']]
len(utts_with_brackets)

20459

In [51]:
utts_with_brackets[:10]

[{'utterance_id': 'sw3411A-ms98-a-0015',
  'conversation_id': '3411',
  'speaker': 'A',
  'start': 60.398125,
  'end': 65.451375,
  'duration': 5.0532499999999985,
  'speech_raw': "yeah i've always liked that i liked the the one year they had or the couple o[f]- years they had were uh",
  'speech': "yeah i've always liked that i liked the the one year they had or the couple <rem> years they had were uh",
  'n_words': 22,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw3411A-ms98-a-0027',
  'conversation_id': '3411',
  'speaker': 'A',
  'start': 115.585125,
  'end': 123.488625,
  'duration': 7.903499999999994,
  'speech_raw': "right now they've got a uh a unique fea[ture]- uh feature in them_1 now if uh if you have a front end accident",
  'speech': "right now they've got a uh a unique <rem> uh feature in them now if uh if you have a front end accident",
  'n_words': 23,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw341

In [52]:
utts_with_word_initial_brackets = [u for u in utts_with_brackets 
                                   if any(map(lambda w: w[0] == '-', 
                                              u['speech_raw'].split(' ')))]
len(utts_with_word_initial_brackets)

470

In [53]:
utts_with_word_initial_brackets[:10]

[{'utterance_id': 'sw2744B-ms98-a-0019',
  'conversation_id': '2744',
  'speaker': 'B',
  'start': 69.227875,
  'end': 76.106875,
  'duration': 6.879000000000005,
  'speech_raw': "[noise] -[be]cau[se]- you know up to what the effort they put into it you know i've seen that for years yeah",
  'speech': "<rem> you know up to what the effort they put into it you know i've seen that for years yeah",
  'n_words': 20,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw3547A-ms98-a-0001',
  'conversation_id': '3547',
  'speaker': 'A',
  'start': 0.0,
  'end': 0.56025,
  'duration': 0.56025,
  'speech_raw': '[noise] -[o]kay',
  'speech': '<rem>',
  'n_words': 1,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw4228A-ms98-a-0046',
  'conversation_id': '4228',
  'speaker': 'A',
  'start': 259.963,
  'end': 261.437875,
  'duration': 1.4748749999999973,
  'speech_raw': "-[th]at's interesting",
  'speech': '<rem> interesting',
  'n_words

In [54]:
utts_with_word_initial_brackets

[{'utterance_id': 'sw2744B-ms98-a-0019',
  'conversation_id': '2744',
  'speaker': 'B',
  'start': 69.227875,
  'end': 76.106875,
  'duration': 6.879000000000005,
  'speech_raw': "[noise] -[be]cau[se]- you know up to what the effort they put into it you know i've seen that for years yeah",
  'speech': "<rem> you know up to what the effort they put into it you know i've seen that for years yeah",
  'n_words': 20,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw3547A-ms98-a-0001',
  'conversation_id': '3547',
  'speaker': 'A',
  'start': 0.0,
  'end': 0.56025,
  'duration': 0.56025,
  'speech_raw': '[noise] -[o]kay',
  'speech': '<rem>',
  'n_words': 1,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw4228A-ms98-a-0046',
  'conversation_id': '4228',
  'speaker': 'A',
  'start': 259.963,
  'end': 261.437875,
  'duration': 1.4748749999999973,
  'speech_raw': "-[th]at's interesting",
  'speech': '<rem> interesting',
  'n_words

In [55]:
utts_with_braces = [u for u in utterances_with_some_words if any(map(hasCurlyBraces, 
                                                                     u['speech_raw'].split(' ')))]
len(utts_with_braces)

352

In [56]:
utts_with_braces[:10]

[{'utterance_id': 'sw2662B-ms98-a-0077',
  'conversation_id': '2662',
  'speaker': 'B',
  'start': 385.564,
  'end': 390.141625,
  'duration': 4.577624999999955,
  'speech_raw': 'yeah great well it was n[ice]- nice talking to you {alrighty} bye-bye',
  'speech': 'yeah great well it was <rem> nice talking to you alrighty bye-bye',
  'n_words': 12,
  'has_words': True,
  'has_word_internal_brackets': True},
 {'utterance_id': 'sw2258A-ms98-a-0051',
  'conversation_id': '2258',
  'speaker': 'A',
  'start': 202.75375,
  'end': 211.038375,
  'duration': 8.284625000000005,
  'speech_raw': 'yeah i recall a {snirtstorm} out there one day where about this time of year that there was a big cold front coming in and they got the dirt up in the air and then it snowed a little bit',
  'speech': 'yeah i recall a snirtstorm out there one day where about this time of year that there was a big cold front coming in and they got the dirt up in the air and then it snowed a little bit',
  'n_words': 40,
  'h

In [57]:
w_with_underscore = {w for w in orthographic_wordform_types if hasUnderscore(w)}
len(w_with_underscore)
w_with_underscore

0

set()

In [58]:
utts_with_underscore = [u for u in utterances_with_some_words if any(map(hasUnderscore, 
                                                                         u['speech'].split()))]
len(utts_with_underscore)

0

In [59]:
utts_with_underscore[:10]

[]

### Summary

There are
 - ≈2400 conversations.
 - ≈250k (pre-segmented) utterances.
 - ≈3m wordform tokens.
 - about 28k wordform types. (≥6k "orthographic wordforms" - not included in the 28k count - are interrupted or repaired wordforms.)
 
Other notes:
 - every utterance and wordform token has a duration associated with it.

# Comparison with Fisher

In [60]:
fisher_repo_dir = '/mnt/cube/home/AD/emeinhar/fisher-lm'

In [61]:
os.chdir(fisher_repo_dir)

In [62]:
%ls *vocab*

fisher_vocabulary_bbn.txt  fisher_vocabulary_main.txt


In [63]:
%cat -n fisher_vocabulary_bbn.txt | head -20

     1	'and
     2	'berserkly'
     3	'bout
     4	'burb
     5	'burban
     6	'burbs
     7	'cau
     8	'cause
     9	'cept
    10	'cide
    11	'cisco
    12	'cize
    13	'course
    14	'cuse
    15	'do
    16	'em
    17	'em's
    18	'ems
    19	'everybody's
    20	'fess
cat: write error: Broken pipe


In [64]:
%cat -n fisher_vocabulary_main.txt | head -20

     1	'and
     2	'berserkly'
     3	'bout
     4	'burb
     5	'burban
     6	'burbs
     7	'cau
     8	'cause
     9	'cept
    10	'cide
    11	'cisco
    12	'cize
    13	'course
    14	'cuse
    15	'do
    16	'em
    17	'em's
    18	'ems
    19	'everybody's
    20	'fess
cat: write error: Broken pipe


In [65]:
fisher_vocab_bbn_fn = 'fisher_vocabulary_bbn.txt'
fisher_vocab_main_fn = 'fisher_vocabulary_main.txt'

In [66]:
fisher_vocab_bbn = []
with open(fisher_vocab_bbn_fn) as file:
    for line in file:
        fisher_vocab_bbn.append(line.rstrip())


fisher_vocab_main = []
with open(fisher_vocab_main_fn) as file:
    for line in file:
        fisher_vocab_main.append(line.rstrip())

In [67]:
len(fisher_vocab_bbn)
len(fisher_vocab_main)

42013

44064

In [68]:
fisher_vocab_bbn = set(fisher_vocab_bbn)
fisher_vocab_main = set(fisher_vocab_main)

In [69]:
def compare_sizes(before, after):
    print("{0} vs. {1}".format(len(before), len(after)))

In [70]:
def normalize_case(vocabulary):
    return set(map(lambda w: w.lower(),
                   vocabulary))

In [71]:
swbd_vocab = orthographic_wordform_types
swbd_vocab_lc = set(normalize_case(swbd_vocab))
fisher_vocab_bbn_lc = set(normalize_case(fisher_vocab_bbn))
fisher_vocab_main_lc = set(normalize_case(fisher_vocab_main))

In [72]:
compare_sizes(swbd_vocab, swbd_vocab_lc)
compare_sizes(fisher_vocab_bbn, fisher_vocab_bbn_lc)
compare_sizes(fisher_vocab_main, fisher_vocab_main_lc)

27559 vs. 27559
42013 vs. 42013
44064 vs. 44064


In [73]:
vocab_unique_to_fisher = fisher_vocab_main_lc - swbd_vocab_lc
len(vocab_unique_to_fisher)

22846

In [74]:
vocab_unique_to_swbd = swbd_vocab_lc - fisher_vocab_main_lc
len(vocab_unique_to_swbd)

6341

In [75]:
vocab_unique_to_swbd

{'dawes',
 'obgyn',
 'carbide',
 'movers',
 'formality',
 'snoot',
 'reshingled',
 'vdb',
 'bramalea',
 "julio's",
 'skimmers',
 'windham',
 'juarez',
 'tangerine',
 'tussle',
 'snazzier',
 'applicator',
 'pondering',
 'insufficiency',
 'foreground',
 'edmunds',
 'clemson',
 'inconsistency',
 'scribes',
 'pathologically',
 'colloquium',
 'undertook',
 'sandblasted',
 "neiman's",
 'hilltops',
 'mucky',
 "bradshaw's",
 'electrode',
 'patchwork',
 'kingpins',
 'staffs',
 "texan's",
 'borger',
 'neurosurgeon',
 'tilling',
 'rewelded',
 "sportsman's",
 'digitize',
 'agape',
 'skillet',
 'playthings',
 'furrier',
 'notifies',
 'jigs',
 'whacks',
 "lennon's",
 'utd',
 'anti-drunk',
 'expressways',
 'self-made',
 'tarten',
 "commission's",
 "market'll",
 'polluters',
 'pollutant',
 'towered',
 'deltaing',
 'celltrex',
 'quixote',
 'freidman',
 'steams',
 'elicited',
 'amigas',
 "designer's",
 'hewett',
 'ergotisms',
 'subroutine',
 'inhering',
 "siberian's",
 "linebacker's",
 'sequentially',
 

In [76]:
swbd_unique_vocab_by_freq = Counter({w:orthographic_wordform_counter_swbd[w] for w in vocab_unique_to_swbd})
swbd_unique_vocab_by_freq.most_common()

[('ibm', 117),
 ('hum-um', 96),
 ('bins', 85),
 ('hmo', 66),
 ('recyclable', 48),
 ('vcr', 45),
 ('play-offs', 44),
 ('lewisville', 40),
 ('freon', 38),
 ('nra', 31),
 ('pbs', 30),
 ('gm', 29),
 ('nfl', 27),
 ('addison', 27),
 ('switchboard', 26),
 ('byu', 26),
 ('cpa', 26),
 ('dukakis', 23),
 ("ti's", 23),
 ('nc', 23),
 ('unix', 22),
 ('oldsmobile', 22),
 ('aetna', 22),
 ('nolan', 22),
 ('nba', 22),
 ('recyclables', 22),
 ('amiga', 22),
 ('eds', 21),
 ('clarion', 21),
 ('fajitas', 21),
 ('tex-mex', 20),
 ('tarpley', 20),
 ('republics', 20),
 ('epa', 18),
 ("cd's", 18),
 ('bmw', 18),
 ('tvs', 18),
 ('irs', 18),
 ('denton', 17),
 ('molding', 17),
 ('muffler', 17),
 ('aikman', 17),
 ('fords', 17),
 ('gasket', 16),
 ('coda', 16),
 ('rowlett', 16),
 ('smu', 16),
 ('flatliners', 16),
 ('framing', 16),
 ('attleboro', 16),
 ('yeltsin', 16),
 ('utd', 15),
 ('tole', 15),
 ('illustrated', 15),
 ('unlv', 14),
 ('doctoral', 14),
 ('subaru', 14),
 ('crappie', 14),
 ('20/20', 14),
 ('semiautomatic',

In [77]:
swbd_wfs_w_braces = {w for w in vocab_unique_to_swbd if '{' in w}
len(swbd_wfs_w_braces)
swbd_wfs_w_braces

0

set()

In [78]:
[w for w in vocab_unique_to_swbd if hasCurlyBraces(w) and not isCurlyBraced(w)]

[]

# Relating the word and utterance relations

In [79]:
utterances[10]
utterances[11]

{'utterance_id': 'sw3411A-ms98-a-0011',
 'conversation_id': '3411',
 'speaker': 'A',
 'start': 41.79075,
 'end': 45.87,
 'duration': 4.079249999999995,
 'speech_raw': '[silence]',
 'speech': '',
 'n_words': 0,
 'has_words': False,
 'has_word_internal_brackets': False}

{'utterance_id': 'sw3411A-ms98-a-0012',
 'conversation_id': '3411',
 'speaker': 'A',
 'start': 45.87,
 'end': 50.7335,
 'duration': 4.863500000000002,
 'speech_raw': "the Seville that's a sharp looking car that really is it it always has been though",
 'speech': "the seville that's a sharp looking car that really is it it always has been though",
 'n_words': 16,
 'has_words': True,
 'has_word_internal_brackets': False}

In [80]:
[u for u in utterances if u['utterance_id'] == 'sw2923A-ms98-a-0012']

[{'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 44.795875,
  'end': 49.45625,
  'duration': 4.660374999999995,
  'speech_raw': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'speech': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'n_words': 16,
  'has_words': True,
  'has_word_internal_brackets': False}]

## Mapping each utterance ID to a sequence of word relations

Given an utterance id, we want to be able to retrieve the corresponding sequence of elements in the word relation.

In [81]:
def get_word_relation_seq(utterance_id, remove_non_speech=True):
    w_rels = [w for w in words if w['utterance_id'] == utterance_id]
    if remove_non_speech:
        w_rels = [w for w in w_rels if w['has_words']]
    w_rels_sorted = sorted(w_rels, key=lambda w_rel: w_rel['start'])
    return w_rels

In [82]:
[u for u in utterances if u['utterance_id'] == 'sw2923A-ms98-a-0012']

[{'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 44.795875,
  'end': 49.45625,
  'duration': 4.660374999999995,
  'speech_raw': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'speech': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'n_words': 16,
  'has_words': True,
  'has_word_internal_brackets': False}]

In [83]:
words_in_the_utt = get_word_relation_seq('sw2923A-ms98-a-0012', False)
len(words_in_the_utt)
only_words = [w for w in words_in_the_utt if w['has_words']]
len(only_words)
' '.join([w['speech'] for w in only_words])
' '.join([w['speech'] for w in only_words]) == [u for u in utterances if u['utterance_id'] == 'sw2923A-ms98-a-0012'][0]['speech']

18

16

"it's just music by different groups it's you know things you you wouldn't you wouldn't uh"

True

In [84]:
get_word_relation_seq('sw2923A-ms98-a-0012', True)

[{'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 45.565875,
  'end': 45.725875,
  'duration': 0.1600000000000037,
  'speech_raw': "it's",
  'speech': "it's",
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 45.725875,
  'end': 46.105875,
  'duration': 0.37999999999999545,
  'speech_raw': 'just',
  'speech': 'just',
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 46.105875,
  'end': 46.375875,
  'duration': 0.2700000000000031,
  'speech_raw': 'music',
  'speech': 'music',
  'has_words': True,
  'has_word_internal_brackets': False},
 {'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 46.375875,
  'end': 46.475875,
  'duration': 0.10000000000000142,
  'speech_raw': 'by',
  '

## Mapping each utterance ID to an orthographic word sequence + word duration sequence pair

In [85]:
def get_words_and_durations(utterance_id):
    w_rels = get_word_relation_seq(utterance_id)
    orth_wfs = tuple([w['speech'] for w in w_rels])
    durations = tuple([w['duration'] for w in w_rels])
#     assert len(orth_wfs) == len(durations)
    return orth_wfs, durations

In [86]:
get_words_and_durations('sw2923A-ms98-a-0012')

(("it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things',
  'you',
  'you',
  "wouldn't",
  'you',
  "wouldn't",
  'uh'),
 (0.1600000000000037,
  0.37999999999999545,
  0.2700000000000031,
  0.10000000000000142,
  0.3200000000000003,
  0.3499999999999943,
  0.20000000000000284,
  0.060000000000002274,
  0.22999999999999687,
  0.18312500000000398,
  0.13787499999999397,
  0.08900000000000574,
  0.17999999999999972,
  0.0899999999999963,
  0.3100000000000023,
  0.1700000000000017))

In [87]:
tuple(zip(*get_words_and_durations('sw2923A-ms98-a-0012')))

(("it's", 0.1600000000000037),
 ('just', 0.37999999999999545),
 ('music', 0.2700000000000031),
 ('by', 0.10000000000000142),
 ('different', 0.3200000000000003),
 ('groups', 0.3499999999999943),
 ("it's", 0.20000000000000284),
 ('you', 0.060000000000002274),
 ('know', 0.22999999999999687),
 ('things', 0.18312500000000398),
 ('you', 0.13787499999999397),
 ('you', 0.08900000000000574),
 ("wouldn't", 0.17999999999999972),
 ('you', 0.0899999999999963),
 ("wouldn't", 0.3100000000000023),
 ('uh', 0.1700000000000017))

## Making a word relation for confusability analysis

For each word in the corpus, we want
 - the orthographic wordform
 - all preceding wordforms in the utterance
 - the 4 preceding wordforms in the utterance
 - whether the word is adjacent to any disfluencies, pauses, or filled pauses
 - whether the word begins or ends with a clitic
 - the duration
 - the utterance ID
 - the conversation ID
 - the speaker

In [88]:
def isPauseOrDisfluency(speech_word_token):
    return speech_word_token == unk
#     if type(container) == b.containers.Word:
#         return False
#     return True


filledPauses = {
 'ah',
 'hm',
 'hmm',
#  'huh',
#  'huh-uh',
#  'hum',
#  'hum-um',
#  'mm',
#  'mm-hmm',
 'uh',
#  'uh-huh',
#  'uh-hum',
#  'uh-uh',
 'um',
#  'um-hmm',
#  'um-huh',
#  'um-hum'
}

def isFilledPause(speech_word_token):
    return speech_word_token in filledPauses

In [89]:
vocabulary = orthographic_wordform_types

In [90]:
"yknow" in vocabulary
"y'know" in vocabulary
"ya" in vocabulary

False

False

False

In [91]:
clitic_suffixes = {"n't", "'ve", "'ll", "'s", "s'","'d", "'re", "'m", "'em", "'d've"}

In [92]:
words_with_clitic_suffix = {w for w in vocabulary 
                            if any({suff in w for suff in clitic_suffixes})}
len(vocabulary)
len(words_with_clitic_suffix)
len(words_with_clitic_suffix - {w for w in words_with_clitic_suffix
                                if "'s" in w})

27559

1350

169

In [93]:
words_with_clitic_suffix - {w for w in words_with_clitic_suffix
                                if "'s" in w or "s'" in w}

{"ain't",
 "aren't",
 "ashley'll",
 "ball'll",
 "baltimore'll",
 "brother'll",
 "can't",
 "can'ts",
 "car'll",
 "cat'll",
 "children'll",
 "color'd",
 "could've",
 "couldn't",
 "day'll",
 "didn't",
 "doesn't",
 "don't",
 "don'ts",
 "duke'll",
 "everybody'd",
 "grandmother'll",
 "hadn't",
 "hasn't",
 "haven't",
 "he'd",
 "he'll",
 "how'd",
 "husband'll",
 "i'd",
 "i'll",
 "i'm",
 "i've",
 "isn't",
 "it'd",
 "it'll",
 "kid'll",
 "lady'll",
 "market'll",
 "may've",
 "might've",
 "mine'd",
 "must've",
 "needn't",
 "neighbor'll",
 "nobody'll",
 "nothing'll",
 "one'll",
 "people'll",
 "physician'll",
 "rest'll",
 "shan't",
 "she'd",
 "she'll",
 "should've",
 "shouldn't",
 "shouldn'ts",
 "some'll",
 "somebody'd",
 "somebody'll",
 "someone'll",
 "that'd",
 "that'll",
 "there'd",
 "there'll",
 "there're",
 "there've",
 "they'd",
 "they'll",
 "they're",
 "they've",
 "to've",
 "town'd",
 "tracy'd",
 "valentine'll",
 "wasn't",
 "we'd",
 "we'll",
 "we're",
 "we've",
 "weren't",
 "what'd",
 "what'll

In [94]:
def hasClitic(orthographic_wordform_token):
    w = orthographic_wordform_token
    return w in words_with_clitic_suffix

In [95]:
s = "it's just music by different groups it's you know things you you wouldn't you wouldn't uh"
ws = s.split(' ')
len(ws)
i = 3
j = 8

ws[j]
ws[j-4 if j-4 > 0 else 0:j]

ws[i]
ws[i-4 if i-4 > 0 else 0:i]

def get_ngram_context(word_seq, i, n=None):
    if n is None:
        n = 4
    assert n > 0
    assert i > -1
    assert i < len(word_seq)
    return word_seq[i-n if i-n > 0 else 0:i]

get_ngram_context(ws, 8)
get_ngram_context(ws, 3)

def get_ngram_contexts(word_seq, n=None):
    if n is None:
        n = 4
    assert n > 0
    return tuple(map(lambda i: get_ngram_context(word_seq, i),
                     range(len(word_seq))))

def get_all_prefixes(word_seq):
    return [word_seq[0:i] for i in range(len(word_seq)+1)]

def get_all_contexts(word_seq):
    return get_all_prefixes(word_seq)[:-1]

get_ngram_contexts(ws)
get_all_contexts(ws)

16

'know'

['different', 'groups', "it's", 'you']

'by'

["it's", 'just', 'music']

['different', 'groups', "it's", 'you']

["it's", 'just', 'music']

([],
 ["it's"],
 ["it's", 'just'],
 ["it's", 'just', 'music'],
 ["it's", 'just', 'music', 'by'],
 ['just', 'music', 'by', 'different'],
 ['music', 'by', 'different', 'groups'],
 ['by', 'different', 'groups', "it's"],
 ['different', 'groups', "it's", 'you'],
 ['groups', "it's", 'you', 'know'],
 ["it's", 'you', 'know', 'things'],
 ['you', 'know', 'things', 'you'],
 ['know', 'things', 'you', 'you'],
 ['things', 'you', 'you', "wouldn't"],
 ['you', 'you', "wouldn't", 'you'],
 ['you', "wouldn't", 'you', "wouldn't"])

[[],
 ["it's"],
 ["it's", 'just'],
 ["it's", 'just', 'music'],
 ["it's", 'just', 'music', 'by'],
 ["it's", 'just', 'music', 'by', 'different'],
 ["it's", 'just', 'music', 'by', 'different', 'groups'],
 ["it's", 'just', 'music', 'by', 'different', 'groups', "it's"],
 ["it's", 'just', 'music', 'by', 'different', 'groups', "it's", 'you'],
 ["it's", 'just', 'music', 'by', 'different', 'groups', "it's", 'you', 'know'],
 ["it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things'],
 ["it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things',
  'you'],
 ["it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things',
  'you',
  'you'],
 ["it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things',
  'you',
  'you',
  "wouldn't"],
 ["it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'kno

In [96]:
def get_words_durations_and_contexts(utterance_id, n=None):
    if n is None:
        n = 4
    
    w_rels = get_word_relation_seq(utterance_id)
    orth_wfs = tuple([w['speech'] for w in w_rels])
    durations = tuple([w['duration'] for w in w_rels])
    local_contexts = get_ngram_contexts(orth_wfs)
    full_contexts = get_all_contexts(orth_wfs)
#     assert len(orth_wfs) == len(durations)
    return orth_wfs, durations, local_contexts, full_contexts

In [97]:
get_words_durations_and_contexts('sw2923A-ms98-a-0012')

(("it's",
  'just',
  'music',
  'by',
  'different',
  'groups',
  "it's",
  'you',
  'know',
  'things',
  'you',
  'you',
  "wouldn't",
  'you',
  "wouldn't",
  'uh'),
 (0.1600000000000037,
  0.37999999999999545,
  0.2700000000000031,
  0.10000000000000142,
  0.3200000000000003,
  0.3499999999999943,
  0.20000000000000284,
  0.060000000000002274,
  0.22999999999999687,
  0.18312500000000398,
  0.13787499999999397,
  0.08900000000000574,
  0.17999999999999972,
  0.0899999999999963,
  0.3100000000000023,
  0.1700000000000017),
 ((),
  ("it's",),
  ("it's", 'just'),
  ("it's", 'just', 'music'),
  ("it's", 'just', 'music', 'by'),
  ('just', 'music', 'by', 'different'),
  ('music', 'by', 'different', 'groups'),
  ('by', 'different', 'groups', "it's"),
  ('different', 'groups', "it's", 'you'),
  ('groups', "it's", 'you', 'know'),
  ("it's", 'you', 'know', 'things'),
  ('you', 'know', 'things', 'you'),
  ('know', 'things', 'you', 'you'),
  ('things', 'you', 'you', "wouldn't"),
  ('you', 'y

In [98]:
tuple(zip(*get_words_durations_and_contexts('sw2923A-ms98-a-0012')))

(("it's", 0.1600000000000037, (), ()),
 ('just', 0.37999999999999545, ("it's",), ("it's",)),
 ('music', 0.2700000000000031, ("it's", 'just'), ("it's", 'just')),
 ('by',
  0.10000000000000142,
  ("it's", 'just', 'music'),
  ("it's", 'just', 'music')),
 ('different',
  0.3200000000000003,
  ("it's", 'just', 'music', 'by'),
  ("it's", 'just', 'music', 'by')),
 ('groups',
  0.3499999999999943,
  ('just', 'music', 'by', 'different'),
  ("it's", 'just', 'music', 'by', 'different')),
 ("it's",
  0.20000000000000284,
  ('music', 'by', 'different', 'groups'),
  ("it's", 'just', 'music', 'by', 'different', 'groups')),
 ('you',
  0.060000000000002274,
  ('by', 'different', 'groups', "it's"),
  ("it's", 'just', 'music', 'by', 'different', 'groups', "it's")),
 ('know',
  0.22999999999999687,
  ('different', 'groups', "it's", 'you'),
  ("it's", 'just', 'music', 'by', 'different', 'groups', "it's", 'you')),
 ('things',
  0.18312500000000398,
  ('groups', "it's", 'you', 'know'),
  ("it's",
   'just',


In [99]:
[u for u in utterances if u['utterance_id'] == 'sw2923A-ms98-a-0012']

[{'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'start': 44.795875,
  'end': 49.45625,
  'duration': 4.660374999999995,
  'speech_raw': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'speech': "it's just music by different groups it's you know things you you wouldn't you wouldn't uh",
  'n_words': 16,
  'has_words': True,
  'has_word_internal_brackets': False}]

In [100]:
len(ws)
ws[-1]
ws[len(ws)-1]
((len(ws)-15)-1)
ws[15]
' '
ws[0]
ws[len(ws)-len(ws)]

ws[3]
(len(ws) - 3)
len(ws) - (len(ws) - 3)
ws[len(ws) - (len(ws) - 3)]
ws[-13]

16

'uh'

'uh'

0

'uh'

' '

"it's"

"it's"

'by'

13

3

'by'

'by'

In [101]:
def hasAdjacentPauseOrDisfluency(left_rel, right_rel):
    if left_rel is not None:
        return isPauseOrDisfluency(left_rel['orthographic_wordform'])
    if right_rel is not None:
        return isPauseOrDisfluency(right_rel['orthographic_wordform'])
    return False

def hasAdjacentFilledPause(left_rel, right_rel):
    if left_rel is not None:
        return isFilledPause(left_rel['orthographic_wordform'])
    if right_rel is not None:
        return isFilledPause(right_rel['orthographic_wordform'])
    return False

def make_analysis_relations(utterance_id):
    utt_rel_matches = [u for u in utterances if u['utterance_id'] == utterance_id]
    assert len(utt_rel_matches) > 0, 'No matching utterance found for id {0}'.format(utterance_id)
    assert len(utt_rel_matches) == 1, 'Should be exactly one match for an utterance id, got these instead\n{0}'.format(utt_rel_matches)
    utt_rel = utt_rel_matches[0]
    
    orth_wfs, durations, local_contexts, full_contexts = get_words_durations_and_contexts(utterance_id)
    zipped = tuple(zip(orth_wfs, durations, local_contexts, full_contexts))
    make_rel = lambda triple: {'utterance_id':utterance_id,
                               'conversation_id':utt_rel['conversation_id'],
                               'speaker':utt_rel['speaker'],
                               'orthographic_wordform':triple[0],
                               'duration':triple[1],
                               'preceding_4_wordforms':triple[2],
#                                'Preceding_4_Wordforms':list(triple[2]),
                               'preceding_wordforms':triple[3]
                              }
    as_rels = list(map(make_rel, zipped))
    for i, w_rel in enumerate(as_rels):
        w_rel.update({'dist_from_left_edge':i})
        w_rel.update({'dist_from_right_edge':len(as_rels)-i-1})
        
        if i > 0:
            left_rel = as_rels[i-1]
        else:
            left_rel = None
        if i < len(as_rels) - 2:
            right_rel = as_rels[i+1]
        else:
            right_rel = None
        w_rel.update({'hasAdjacentPauseOrDisfluency':hasAdjacentPauseOrDisfluency(left_rel, right_rel)})
        w_rel.update({'hasAdjacentFilledPause':hasAdjacentFilledPause(left_rel, right_rel)})
        
        w_rel.update({'hasClitic':hasClitic(w_rel['orthographic_wordform'])})
        
    return as_rels

make_analysis_relations('sw2923A-ms98-a-0012')

[{'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'orthographic_wordform': "it's",
  'duration': 0.1600000000000037,
  'preceding_4_wordforms': (),
  'preceding_wordforms': (),
  'dist_from_left_edge': 0,
  'dist_from_right_edge': 15,
  'hasAdjacentPauseOrDisfluency': False,
  'hasAdjacentFilledPause': False,
  'hasClitic': True},
 {'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'orthographic_wordform': 'just',
  'duration': 0.37999999999999545,
  'preceding_4_wordforms': ("it's",),
  'preceding_wordforms': ("it's",),
  'dist_from_left_edge': 1,
  'dist_from_right_edge': 14,
  'hasAdjacentPauseOrDisfluency': False,
  'hasAdjacentFilledPause': False,
  'hasClitic': False},
 {'utterance_id': 'sw2923A-ms98-a-0012',
  'conversation_id': '2923',
  'speaker': 'A',
  'orthographic_wordform': 'music',
  'duration': 0.2700000000000031,
  'preceding_4_wordforms': ("it's", 'just'),
  'preceding_wordforms': ("it's", '

In [102]:
utterance_ids = [u['utterance_id'] for u in utterances]
len(utterance_ids)

391593

In [103]:
V = 10

In [105]:
# analysis_relations = list(map(make_analysis_relations,
#                               utterance_ids))

#used to take 305m on wittgenstein with J = 30
# with extra exclusion annotations, this now takes ~355m
analysis_relations = par(delayed(make_analysis_relations)(utterance_id) for utterance_id in utterance_ids)
len(analysis_relations)

[Parallel(n_jobs=30)]: Using backend MultiprocessingBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=30)]: Done  12 tasks      | elapsed:    2.5s
[Parallel(n_jobs=30)]: Done  25 tasks      | elapsed:    2.9s
[Parallel(n_jobs=30)]: Done  38 tasks      | elapsed:    3.3s
[Parallel(n_jobs=30)]: Done  53 tasks      | elapsed:    4.3s
[Parallel(n_jobs=30)]: Done  68 tasks      | elapsed:    5.2s
[Parallel(n_jobs=30)]: Done  85 tasks      | elapsed:    6.0s
[Parallel(n_jobs=30)]: Done 102 tasks      | elapsed:    7.0s
[Parallel(n_jobs=30)]: Done 121 tasks      | elapsed:    7.6s
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:    8.7s
[Parallel(n_jobs=30)]: Done 161 tasks      | elapsed:    9.9s
[Parallel(n_jobs=30)]: Done 182 tasks      | elapsed:   11.0s
[Parallel(n_jobs=30)]: Done 205 tasks      | elapsed:   12.1s
[Parallel(n_jobs=30)]: Done 228 tasks      | elapsed:   13.6s
[Parallel(n_jobs=30)]: Done 253 tasks      |

[Parallel(n_jobs=30)]: Done 9881 tasks      | elapsed:  8.8min
[Parallel(n_jobs=30)]: Done 10022 tasks      | elapsed:  9.0min
[Parallel(n_jobs=30)]: Done 10165 tasks      | elapsed:  9.1min
[Parallel(n_jobs=30)]: Done 10308 tasks      | elapsed:  9.2min
[Parallel(n_jobs=30)]: Done 10453 tasks      | elapsed:  9.3min
[Parallel(n_jobs=30)]: Done 10598 tasks      | elapsed:  9.5min
[Parallel(n_jobs=30)]: Done 10745 tasks      | elapsed:  9.6min
[Parallel(n_jobs=30)]: Done 10892 tasks      | elapsed:  9.7min
[Parallel(n_jobs=30)]: Done 11041 tasks      | elapsed:  9.9min
[Parallel(n_jobs=30)]: Done 11190 tasks      | elapsed: 10.0min
[Parallel(n_jobs=30)]: Done 11341 tasks      | elapsed: 10.2min
[Parallel(n_jobs=30)]: Done 11492 tasks      | elapsed: 10.3min
[Parallel(n_jobs=30)]: Done 11645 tasks      | elapsed: 10.5min
[Parallel(n_jobs=30)]: Done 11798 tasks      | elapsed: 10.6min
[Parallel(n_jobs=30)]: Done 11953 tasks      | elapsed: 10.7min
[Parallel(n_jobs=30)]: Done 14390 tasks  

[Parallel(n_jobs=30)]: Done 40268 tasks      | elapsed: 36.1min
[Parallel(n_jobs=30)]: Done 40553 tasks      | elapsed: 36.4min
[Parallel(n_jobs=30)]: Done 40838 tasks      | elapsed: 36.6min
[Parallel(n_jobs=30)]: Done 41125 tasks      | elapsed: 36.9min
[Parallel(n_jobs=30)]: Done 41412 tasks      | elapsed: 37.1min
[Parallel(n_jobs=30)]: Done 41701 tasks      | elapsed: 37.4min
[Parallel(n_jobs=30)]: Done 41990 tasks      | elapsed: 37.7min
[Parallel(n_jobs=30)]: Done 42281 tasks      | elapsed: 37.9min
[Parallel(n_jobs=30)]: Done 42572 tasks      | elapsed: 38.2min
[Parallel(n_jobs=30)]: Done 42865 tasks      | elapsed: 38.4min
[Parallel(n_jobs=30)]: Done 43158 tasks      | elapsed: 38.7min
[Parallel(n_jobs=30)]: Done 43453 tasks      | elapsed: 39.0min
[Parallel(n_jobs=30)]: Done 43748 tasks      | elapsed: 39.3min
[Parallel(n_jobs=30)]: Done 44045 tasks      | elapsed: 39.5min
[Parallel(n_jobs=30)]: Done 44342 tasks      | elapsed: 39.8min
[Parallel(n_jobs=30)]: Done 44641 tasks 

[Parallel(n_jobs=30)]: Done 85225 tasks      | elapsed: 76.6min
[Parallel(n_jobs=30)]: Done 85638 tasks      | elapsed: 77.0min
[Parallel(n_jobs=30)]: Done 86053 tasks      | elapsed: 77.4min
[Parallel(n_jobs=30)]: Done 86468 tasks      | elapsed: 77.7min
[Parallel(n_jobs=30)]: Done 86885 tasks      | elapsed: 78.1min
[Parallel(n_jobs=30)]: Done 87302 tasks      | elapsed: 78.5min
[Parallel(n_jobs=30)]: Done 87721 tasks      | elapsed: 78.9min
[Parallel(n_jobs=30)]: Done 88140 tasks      | elapsed: 79.2min
[Parallel(n_jobs=30)]: Done 88561 tasks      | elapsed: 79.6min
[Parallel(n_jobs=30)]: Done 88982 tasks      | elapsed: 80.0min
[Parallel(n_jobs=30)]: Done 89405 tasks      | elapsed: 80.4min
[Parallel(n_jobs=30)]: Done 89828 tasks      | elapsed: 80.8min
[Parallel(n_jobs=30)]: Done 90253 tasks      | elapsed: 81.2min
[Parallel(n_jobs=30)]: Done 90678 tasks      | elapsed: 81.6min
[Parallel(n_jobs=30)]: Done 91105 tasks      | elapsed: 81.9min
[Parallel(n_jobs=30)]: Done 91532 tasks 

[Parallel(n_jobs=30)]: Done 145201 tasks      | elapsed: 130.8min
[Parallel(n_jobs=30)]: Done 145740 tasks      | elapsed: 131.3min
[Parallel(n_jobs=30)]: Done 146281 tasks      | elapsed: 131.8min
[Parallel(n_jobs=30)]: Done 146822 tasks      | elapsed: 132.2min
[Parallel(n_jobs=30)]: Done 147365 tasks      | elapsed: 132.7min
[Parallel(n_jobs=30)]: Done 147908 tasks      | elapsed: 133.2min
[Parallel(n_jobs=30)]: Done 148453 tasks      | elapsed: 133.7min
[Parallel(n_jobs=30)]: Done 148998 tasks      | elapsed: 134.2min
[Parallel(n_jobs=30)]: Done 149545 tasks      | elapsed: 134.7min
[Parallel(n_jobs=30)]: Done 150092 tasks      | elapsed: 135.2min
[Parallel(n_jobs=30)]: Done 150641 tasks      | elapsed: 135.7min
[Parallel(n_jobs=30)]: Done 151190 tasks      | elapsed: 136.2min
[Parallel(n_jobs=30)]: Done 151741 tasks      | elapsed: 136.7min
[Parallel(n_jobs=30)]: Done 152292 tasks      | elapsed: 137.2min
[Parallel(n_jobs=30)]: Done 152845 tasks      | elapsed: 137.7min
[Parallel(

[Parallel(n_jobs=30)]: Done 220388 tasks      | elapsed: 199.1min
[Parallel(n_jobs=30)]: Done 221053 tasks      | elapsed: 199.7min
[Parallel(n_jobs=30)]: Done 221718 tasks      | elapsed: 200.4min
[Parallel(n_jobs=30)]: Done 222385 tasks      | elapsed: 201.0min
[Parallel(n_jobs=30)]: Done 223052 tasks      | elapsed: 201.6min
[Parallel(n_jobs=30)]: Done 223721 tasks      | elapsed: 202.3min
[Parallel(n_jobs=30)]: Done 224390 tasks      | elapsed: 202.9min
[Parallel(n_jobs=30)]: Done 225061 tasks      | elapsed: 203.5min
[Parallel(n_jobs=30)]: Done 225732 tasks      | elapsed: 204.1min
[Parallel(n_jobs=30)]: Done 226405 tasks      | elapsed: 204.7min
[Parallel(n_jobs=30)]: Done 227078 tasks      | elapsed: 205.3min
[Parallel(n_jobs=30)]: Done 227753 tasks      | elapsed: 206.0min
[Parallel(n_jobs=30)]: Done 228428 tasks      | elapsed: 206.6min
[Parallel(n_jobs=30)]: Done 229105 tasks      | elapsed: 207.2min
[Parallel(n_jobs=30)]: Done 229782 tasks      | elapsed: 207.9min
[Parallel(

[Parallel(n_jobs=30)]: Done 311201 tasks      | elapsed: 281.7min
[Parallel(n_jobs=30)]: Done 311990 tasks      | elapsed: 282.5min
[Parallel(n_jobs=30)]: Done 312781 tasks      | elapsed: 283.2min
[Parallel(n_jobs=30)]: Done 313572 tasks      | elapsed: 283.9min
[Parallel(n_jobs=30)]: Done 314365 tasks      | elapsed: 284.6min
[Parallel(n_jobs=30)]: Done 315158 tasks      | elapsed: 285.3min
[Parallel(n_jobs=30)]: Done 315953 tasks      | elapsed: 286.0min
[Parallel(n_jobs=30)]: Done 316748 tasks      | elapsed: 286.7min
[Parallel(n_jobs=30)]: Done 317545 tasks      | elapsed: 287.4min
[Parallel(n_jobs=30)]: Done 318342 tasks      | elapsed: 288.2min
[Parallel(n_jobs=30)]: Done 319141 tasks      | elapsed: 289.0min
[Parallel(n_jobs=30)]: Done 319940 tasks      | elapsed: 289.7min
[Parallel(n_jobs=30)]: Done 320741 tasks      | elapsed: 290.5min
[Parallel(n_jobs=30)]: Done 321542 tasks      | elapsed: 291.2min
[Parallel(n_jobs=30)]: Done 322345 tasks      | elapsed: 291.9min
[Parallel(

391593

In [106]:
word_analysis_relation = list(chain.from_iterable(analysis_relations))
len(word_analysis_relation)

3071988

In [107]:
word_analysis_relation[0]

{'utterance_id': 'sw3411A-ms98-a-0002',
 'conversation_id': '3411',
 'speaker': 'A',
 'orthographic_wordform': 'what',
 'duration': 0.1395,
 'preceding_4_wordforms': (),
 'preceding_wordforms': (),
 'dist_from_left_edge': 0,
 'dist_from_right_edge': 9,
 'hasAdjacentPauseOrDisfluency': False,
 'hasAdjacentFilledPause': False,
 'hasClitic': False}

In [108]:
[r for r in word_analysis_relation if r['orthographic_wordform'] == '<rem>']

[{'utterance_id': 'sw3411A-ms98-a-0015',
  'conversation_id': '3411',
  'speaker': 'A',
  'orthographic_wordform': '<rem>',
  'duration': 0.030000000000001137,
  'preceding_4_wordforms': ('had', 'or', 'the', 'couple'),
  'preceding_wordforms': ('yeah',
   "i've",
   'always',
   'liked',
   'that',
   'i',
   'liked',
   'the',
   'the',
   'one',
   'year',
   'they',
   'had',
   'or',
   'the',
   'couple'),
  'dist_from_left_edge': 16,
  'dist_from_right_edge': 5,
  'hasAdjacentPauseOrDisfluency': False,
  'hasAdjacentFilledPause': False,
  'hasClitic': False},
 {'utterance_id': 'sw3411A-ms98-a-0027',
  'conversation_id': '3411',
  'speaker': 'A',
  'orthographic_wordform': '<rem>',
  'duration': 0.1980000000000075,
  'preceding_4_wordforms': ('a', 'uh', 'a', 'unique'),
  'preceding_wordforms': ('right',
   'now',
   "they've",
   'got',
   'a',
   'uh',
   'a',
   'unique'),
  'dist_from_left_edge': 8,
  'dist_from_right_edge': 14,
  'hasAdjacentPauseOrDisfluency': False,
  'hasAd

## Identifying n-gram contexts

In [109]:
local_contexts = [r['preceding_4_wordforms'] for r in word_analysis_relation]
len(local_contexts)

3071988

In [110]:
fourgram_contexts = [tuple(c) for c in local_contexts if len(c) == 4]
len(fourgram_contexts)

2314779

In [111]:
Counter(fourgram_contexts).most_common()

[(('i', "don't", 'know', 'if'), 757),
 (('i', "don't", 'know', 'i'), 729),
 (('a', 'lot', 'of', 'people'), 608),
 (('a', 'lot', 'of', 'the'), 493),
 (('i', "don't", 'know', 'what'), 457),
 (("there's", 'a', 'lot', 'of'), 402),
 (('have', 'a', 'lot', 'of'), 370),
 (('and', 'uh', 'you', 'know'), 368),
 (('and', 'things', 'like', 'that'), 355),
 (('i', "don't", 'i', "don't"), 353),
 (('uh', 'i', "don't", 'know'), 343),
 (('i', "don't", 'know', 'how'), 338),
 (('what', 'do', 'you', 'think'), 335),
 (('or', 'something', 'like', 'that'), 311),
 (('a', 'little', 'bit', 'of'), 298),
 (('to', 'be', 'able', 'to'), 291),
 (('you', 'know', 'i', 'mean'), 290),
 (('you', 'know', 'i', 'think'), 276),
 (('it', 'was', 'it', 'was'), 275),
 (('you', 'know', 'i', "don't"), 265),
 (('and', 'i', "don't", 'know'), 258),
 (('i', 'mean', 'you', 'know'), 242),
 (('and', 'i', 'think', 'that'), 240),
 (('one', 'of', 'the', 'things'), 239),
 (('and', 'a', 'lot', 'of'), 230),
 (('do', 'a', 'lot', 'of'), 228),
 (('<

In [112]:
trigram_contexts = [tuple(c) for c in local_contexts if len(c) == 3]
len(trigram_contexts)

156659

In [113]:
Counter(trigram_contexts).most_common()

[(('i', "don't", 'know'), 1027),
 (('yeah', 'i', 'think'), 493),
 (('you', 'know', 'i'), 449),
 (('well', 'i', 'think'), 396),
 (('and', 'you', 'know'), 334),
 (('you', 'know', 'and'), 333),
 (('and', 'i', 'think'), 332),
 (('and', 'uh', 'i'), 299),
 (('yeah', 'well', 'i'), 286),
 (('uh', 'you', 'know'), 277),
 (('well', 'you', 'know'), 251),
 (('i', "don't", 'think'), 215),
 (('what', 'do', 'you'), 197),
 (('well', 'i', "don't"), 193),
 (('yeah', 'yeah', 'i'), 191),
 (('you', 'know', 'the'), 190),
 (('yeah', '<rem>', 'i'), 189),
 (('you', 'know', "it's"), 183),
 (('but', 'uh', 'i'), 182),
 (('and', 'uh', 'you'), 182),
 (('do', 'you', 'have'), 181),
 (('yeah', 'i', 'i'), 181),
 (('a', 'lot', 'of'), 178),
 (('yeah', 'i', "don't"), 177),
 (('yeah', 'i', 'know'), 175),
 (('well', 'i', 'guess'), 172),
 (('i', 'think', 'that'), 171),
 (('and', 'it', 'was'), 167),
 (('but', 'i', 'think'), 161),
 (('and', 'i', "don't"), 159),
 (('and', 'uh', 'so'), 159),
 (('yeah', 'i', 'mean'), 158),
 (('uh'

In [114]:
bigram_contexts = [tuple(c) for c in local_contexts if len(c) == 2]
len(bigram_contexts)

166939

In [115]:
Counter(bigram_contexts).most_common()

[(('you', 'know'), 4791),
 (('yeah', 'i'), 2940),
 (('and', 'uh'), 2762),
 (('well', 'i'), 2257),
 (('and', 'i'), 2025),
 (('i', "don't"), 1909),
 (('i', 'think'), 1673),
 (('yeah', 'well'), 1319),
 (('oh', 'yeah'), 1260),
 (('oh', 'i'), 1141),
 (('yeah', 'yeah'), 1139),
 (('yeah', "that's"), 1034),
 (('but', 'uh'), 989),
 (('yeah', 'and'), 975),
 (('but', 'i'), 950),
 (('and', 'then'), 947),
 (('i', 'mean'), 908),
 (('so', 'i'), 902),
 (('uh', 'i'), 855),
 (('well', "that's"), 834),
 (('do', 'you'), 789),
 (('and', 'so'), 785),
 (('and', 'they'), 709),
 (('i', 'know'), 681),
 (('yeah', 'it'), 669),
 (('and', 'you'), 652),
 (('and', 'it'), 648),
 (('i', 'guess'), 640),
 (('no', 'i'), 616),
 (('yeah', "it's"), 608),
 (('um', 'i'), 604),
 (('oh', "that's"), 591),
 (("that's", 'right'), 575),
 (('and', 'the'), 558),
 (('<rem>', 'i'), 554),
 (('it', 'was'), 507),
 (('and', 'and'), 503),
 (('well', 'you'), 491),
 (('oh', 'well'), 456),
 (('and', 'we'), 456),
 (('yeah', '<rem>'), 445),
 (('a

In [116]:
unigram_contexts = [tuple(c) for c in local_contexts if len(c) == 1]
len(unigram_contexts)
len(set(unigram_contexts))
len(set(vocabulary))

184785

4521

27559

In [117]:
Counter(unigram_contexts).most_common()

[(('yeah',), 18375),
 (('and',), 17982),
 (('oh',), 12699),
 (('i',), 12686),
 (('well',), 10727),
 (('uh',), 8556),
 (('you',), 6928),
 (('but',), 6374),
 (('so',), 5657),
 (('um',), 3868),
 (("that's",), 3801),
 (('right',), 3167),
 (('no',), 2654),
 (('<rem>',), 2494),
 (('the',), 2491),
 (("it's",), 2396),
 (('it',), 2362),
 (('uh-huh',), 2199),
 (('that',), 2135),
 (('um-hum',), 1996),
 (('they',), 1837),
 (('okay',), 1828),
 (('yes',), 1708),
 (('we',), 1521),
 (('what',), 1333),
 (("i'm",), 1211),
 (('because',), 1209),
 (('a',), 1091),
 (('do',), 1089),
 (('in',), 1001),
 (('is',), 939),
 (('to',), 836),
 (('if',), 775),
 (('my',), 735),
 (('how',), 731),
 (('all',), 666),
 (('or',), 657),
 (('now',), 641),
 (("i've",), 589),
 (('just',), 552),
 (('for',), 531),
 (('when',), 530),
 (('he',), 530),
 (('not',), 503),
 (('are',), 495),
 (('have',), 483),
 (('which',), 473),
 (("they're",), 469),
 (('like',), 432),
 (('there',), 431),
 (('really',), 399),
 (('as',), 396),
 (('where

# Writing things to file

We want on-disk representations of 
 1. the pre-processed utterance relation
 2. the pre-processed word relation
 3. a file with just the pre-processed utterances
 4. the vocabulary in the pre-processed utterances
 5. the word analysis relation

## Utterance relation

In [118]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/fisher-lm'

In [119]:
os.chdir(switchboard_lm_dir)
switchboard_lm_dir
os.listdir()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

['.gitignore',
 '.ipynb_checkpoints',
 'swbd2003_words_relation.json',
 'Switchboard annotation exploration.ipynb',
 'README.md',
 'swbd2003_utterances_relation.json',
 'swbd2003_vocabulary.txt',
 'swbd2003_words_analysis_relation.json',
 'swbd2003_utterances.txt',
 'Preprocessing Switchboard (2003 release) corpus transcriptions for ease of processing and use with kenlm.ipynb',
 '.git']

In [120]:
swbd_utterances_relation_fn = 'swbd2003_utterances_relation.json'

swbd_utterances_relation_path = os.path.join(switchboard_lm_dir, swbd_utterances_relation_fn)

#~9s on wittgenstein
with open(swbd_utterances_relation_path, 'w', encoding='utf8') as json_file:
    json.dump(utterances, json_file, indent=1, ensure_ascii=False)

In [121]:
swbd_utterances_relation_path

'/mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_utterances_relation.json'

In [122]:
%cat -n /mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_utterances_relation.json | head -60

     1	[
     2	 {
     3	  "utterance_id": "sw3411A-ms98-a-0001",
     4	  "conversation_id": "3411",
     5	  "speaker": "A",
     6	  "start": 0.0,
     7	  "end": 0.197,
     8	  "duration": 0.197,
     9	  "speech_raw": "[noise]",
    10	  "speech": "",
    11	  "n_words": 0,
    12	  "has_words": false,
    13	  "has_word_internal_brackets": false
    14	 },
    15	 {
    16	  "utterance_id": "sw3411A-ms98-a-0002",
    17	  "conversation_id": "3411",
    18	  "speaker": "A",
    19	  "start": 0.197,
    20	  "end": 2.56175,
    21	  "duration": 2.36475,
    22	  "speech_raw": "what kind of car would you like to buy next",
    23	  "speech": "what kind of car would you like to buy next",
    24	  "n_words": 10,
    25	  "has_words": true,
    26	  "has_word_internal_brackets": false
    27	 },
    28	 {
    29	  "utterance_id": "sw3411A-ms98-a-0003",
    30	  "conversation_id": "3411",
    31	  "speaker": "A",
    32	  "start": 2.56175,
    33	  "en

## Word relation

In [123]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [124]:
os.chdir(switchboard_lm_dir)
switchboard_lm_dir
os.listdir()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

['.gitignore',
 '.ipynb_checkpoints',
 'swbd2003_words_relation.json',
 'Switchboard annotation exploration.ipynb',
 'README.md',
 'swbd2003_utterances_relation.json',
 'swbd2003_vocabulary.txt',
 'swbd2003_words_analysis_relation.json',
 'swbd2003_utterances.txt',
 'Preprocessing Switchboard (2003 release) corpus transcriptions for ease of processing and use with kenlm.ipynb',
 '.git']

In [125]:
words[0]

{'utterance_id': 'sw3411A-ms98-a-0001',
 'conversation_id': '3411',
 'speaker': 'A',
 'start': 0.0,
 'end': 0.001875,
 'duration': 0.001875,
 'speech_raw': '[silence]',
 'speech': '',
 'has_words': False,
 'has_word_internal_brackets': False}

In [126]:
swbd_words_relation_fn = 'swbd2003_words_relation.json'

swbd_words_relation_path = os.path.join(switchboard_lm_dir, swbd_words_relation_fn)

#~80s on wittgenstein
with open(swbd_words_relation_path, 'w', encoding='utf8') as json_file:
    json.dump(words, json_file, indent=1, ensure_ascii=False)

In [127]:
swbd_words_relation_path

'/mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_words_relation.json'

In [128]:
%cat -n /mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_words_relation.json | head -60

     1	[
     2	 {
     3	  "utterance_id": "sw3411A-ms98-a-0001",
     4	  "conversation_id": "3411",
     5	  "speaker": "A",
     6	  "start": 0.0,
     7	  "end": 0.001875,
     8	  "duration": 0.001875,
     9	  "speech_raw": "[silence]",
    10	  "speech": "",
    11	  "has_words": false,
    12	  "has_word_internal_brackets": false
    13	 },
    14	 {
    15	  "utterance_id": "sw3411A-ms98-a-0001",
    16	  "conversation_id": "3411",
    17	  "speaker": "A",
    18	  "start": 0.001875,
    19	  "end": 0.073125,
    20	  "duration": 0.07125,
    21	  "speech_raw": "[noise]",
    22	  "speech": "",
    23	  "has_words": false,
    24	  "has_word_internal_brackets": false
    25	 },
    26	 {
    27	  "utterance_id": "sw3411A-ms98-a-0001",
    28	  "conversation_id": "3411",
    29	  "speaker": "A",
    30	  "start": 0.073125,
    31	  "end": 0.197,
    32	  "duration": 0.12387500000000001,
    33	  "speech_raw": "[silence]",
    34	  "speech": "",

## Corpus for a language model

In [129]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [130]:
os.chdir(switchboard_lm_dir)
switchboard_lm_dir
os.listdir()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

['.gitignore',
 '.ipynb_checkpoints',
 'swbd2003_words_relation.json',
 'Switchboard annotation exploration.ipynb',
 'README.md',
 'swbd2003_utterances_relation.json',
 'swbd2003_vocabulary.txt',
 'swbd2003_words_analysis_relation.json',
 'swbd2003_utterances.txt',
 'Preprocessing Switchboard (2003 release) corpus transcriptions for ease of processing and use with kenlm.ipynb',
 '.git']

In [131]:
utterances[0]
[u for u in utterances if u['has_words'] and u['speech'] == '']
[u for u in utterances if not u['has_words'] and u['speech'] != '']

{'utterance_id': 'sw3411A-ms98-a-0001',
 'conversation_id': '3411',
 'speaker': 'A',
 'start': 0.0,
 'end': 0.197,
 'duration': 0.197,
 'speech_raw': '[noise]',
 'speech': '',
 'n_words': 0,
 'has_words': False,
 'has_word_internal_brackets': False}

[]

[]

In [132]:
just_utts_w_words = [u['speech'] for u in utterances if u['has_words']]
len(just_utts_w_words)
num_words = sum([u['n_words'] for u in utterances if u['has_words']])
num_words

248826

3071988

In [133]:
%%capture
#cell magic above suppresses useless output of file.write

swbd2003_utterances_fn = 'swbd2003_utterances.txt'

swbd2003_utterances_path = os.path.join(switchboard_lm_dir, swbd2003_utterances_fn)


with open(swbd2003_utterances_path, 'w') as file:
    #write all lines at once using writelines and this takes <1s
    utterances_w_linebreaks = list(map(lambda l: l + "\n", just_utts_w_words))
    file.writelines(utterances_w_linebreaks)

    #if someone else is using the machine heavily doing other things, this can take upwards of 2m
#     for utterance in just_utts_w_words:
#         file.write("{0}\n".format(utterance));

In [134]:
swbd2003_utterances_path

'/mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_utterances.txt'

In [135]:
%cat -n /mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_utterances.txt | head -60

     1	what kind of car would you like to buy next
     2	huh why would that be a
     3	okay well that's good if you had uh no financial requirements if you could buy any car in the entire world no matter what it cost what would you buy
     4	say what
     5	which type of cadillac uh is your favorite
     6	the seville that's a sharp looking car that really is it it always has been though
     7	you know it doesn't have the coup de ville or the sedan de ville squareness it never really has it's always had it's own unique look
     8	yeah i've always liked that i liked the the one year they had or the couple <rem> years they had were uh
     9	the trunk head would look like belt buckles across the back of it
    10	i thought now that looks sharp that looks real sharp
    11	um-hum
    12	yeah and uh the i i've never really uh i've never ridden in one recently um but they're supposed to be just real smooth
    13	just a nice comfortable ride
    14	uh-huh
    15	right now

## Vocabulary for a language model

In [136]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [137]:
os.chdir(switchboard_lm_dir)
switchboard_lm_dir
os.listdir()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

['.gitignore',
 '.ipynb_checkpoints',
 'swbd2003_words_relation.json',
 'Switchboard annotation exploration.ipynb',
 'README.md',
 'swbd2003_utterances_relation.json',
 'swbd2003_vocabulary.txt',
 'swbd2003_words_analysis_relation.json',
 'swbd2003_utterances.txt',
 'Preprocessing Switchboard (2003 release) corpus transcriptions for ease of processing and use with kenlm.ipynb',
 '.git']

In [138]:
word_rels_with_speech = [w_rel for w_rel in words if w_rel['has_words']]
len(word_rels_with_speech)
word_rels_with_speech[0]

3071988

{'utterance_id': 'sw3411A-ms98-a-0002',
 'conversation_id': '3411',
 'speaker': 'A',
 'start': 0.3275,
 'end': 0.467,
 'duration': 0.1395,
 'speech_raw': 'what',
 'speech': 'what',
 'has_words': True,
 'has_word_internal_brackets': False}

In [139]:
[w for w in word_rels_with_speech if w['speech'] == '']

[]

In [140]:
just_word_tokens = [w_rel['speech'] for w_rel in word_rels_with_speech]
len(just_word_tokens)
just_word_tokens[:10]

3071988

['what', 'kind', 'of', 'car', 'would', 'you', 'like', 'to', 'buy', 'next']

In [141]:
word_types = set(just_word_tokens)
len(word_types)

27559

In [142]:
{w for w in word_types if ('ah' in w or 'um' in w or 'mm' in w) and len(w) < 5}

{'ah',
 'aha',
 'alum',
 'bah',
 'bahr',
 'blah',
 'bum',
 'bump',
 'bums',
 'crum',
 'cum',
 'dah',
 'drum',
 'dumb',
 'dump',
 'emmy',
 'gah',
 'gum',
 'hum',
 'hump',
 'jump',
 'lump',
 'mums',
 'nah',
 'numb',
 'oahu',
 'onum',
 'plum',
 'pump',
 'rum',
 'scum',
 'shah',
 'sum',
 'sump',
 'sums',
 'swum',
 'um',
 'utah',
 'yeah'}

In [143]:
sorted_vocabulary = sorted(list(word_types))

In [144]:
%%capture
#cell magic above suppresses useless output of file.write

swbd2003_words_fn = 'swbd2003_vocabulary.txt'

swbd2003_words_path = os.path.join(switchboard_lm_dir, swbd2003_words_fn)


with open(swbd2003_words_path, 'w') as file:
    #write all lines at once using writelines and this takes <1s
    words_w_linebreaks = list(map(lambda l: l + "\n", sorted_vocabulary))
    file.writelines(words_w_linebreaks)

#     for w in list(word_types):
#         file.write("{0}\n".format(w));

In [145]:
swbd2003_words_path

'/mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_vocabulary.txt'

In [146]:
%cat -n /mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_vocabulary.txt | head -60

     1	007
     2	1
     3	100
     4	101
     5	10a
     6	1200
     7	125k
     8	128
     9	13th
    10	13ths
    11	150
    12	1500
    13	2
    14	20/20
    15	2000
    16	21
    17	260
    18	286
    19	287
    20	2ci
    21	302
    22	365
    23	380
    24	386
    25	4-runner
    26	401k
    27	401k's
    28	48
    29	486
    30	49er
    31	49ers
    32	4h
    33	5
    34	500
    35	60
    36	635
    37	635's
    38	69
    39	6s
    40	7-eleven
    41	7-up
    42	7094
    43	747
    44	8088
    45	90210
    46	9050
    47	911
    48	990
    49	<rem>
    50	a
    51	a&e
    52	a&m
    53	a's
    54	a1
    55	aames
    56	aaron
    57	aarp
    58	aba
    59	aback
    60	abacus
cat: write error: Broken pipe


## Word analysis relation

In [147]:
os.getcwd()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

In [148]:
os.chdir(switchboard_lm_dir)
switchboard_lm_dir
os.listdir()

'/mnt/cube/home/AD/emeinhar/switchboard-lm'

['.gitignore',
 '.ipynb_checkpoints',
 'swbd2003_words_relation.json',
 'Switchboard annotation exploration.ipynb',
 'README.md',
 'swbd2003_utterances_relation.json',
 'swbd2003_vocabulary.txt',
 'swbd2003_words_analysis_relation.json',
 'swbd2003_utterances.txt',
 'Preprocessing Switchboard (2003 release) corpus transcriptions for ease of processing and use with kenlm.ipynb',
 '.git']

In [149]:
swbd_words_analysis_relation_fn = 'swbd2003_words_analysis_relation.json'

swbd_words_analysis_relation_path = os.path.join(switchboard_lm_dir, swbd_words_analysis_relation_fn)

#~60s on wittgenstein
with open(swbd_words_analysis_relation_path, 'w', encoding='utf8') as json_file:
    json.dump(word_analysis_relation, json_file, indent=1, ensure_ascii=False)

In [150]:
swbd_words_analysis_relation_path

'/mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_words_analysis_relation.json'

In [151]:
%cat -n /mnt/cube/home/AD/emeinhar/switchboard-lm/swbd2003_words_analysis_relation.json | head -60

     1	[
     2	 {
     3	  "utterance_id": "sw3411A-ms98-a-0002",
     4	  "conversation_id": "3411",
     5	  "speaker": "A",
     6	  "orthographic_wordform": "what",
     7	  "duration": 0.1395,
     8	  "preceding_4_wordforms": [],
     9	  "preceding_wordforms": [],
    10	  "dist_from_left_edge": 0,
    11	  "dist_from_right_edge": 9,
    12	  "hasAdjacentPauseOrDisfluency": false,
    13	  "hasAdjacentFilledPause": false,
    14	  "hasClitic": false
    15	 },
    16	 {
    17	  "utterance_id": "sw3411A-ms98-a-0002",
    18	  "conversation_id": "3411",
    19	  "speaker": "A",
    20	  "orthographic_wordform": "kind",
    21	  "duration": 0.16999999999999998,
    22	  "preceding_4_wordforms": [
    23	   "what"
    24	  ],
    25	  "preceding_wordforms": [
    26	   "what"
    27	  ],
    28	  "dist_from_left_edge": 1,
    29	  "dist_from_right_edge": 8,
    30	  "hasAdjacentPauseOrDisfluency": false,
    31	  "hasAdjacentFilledPause": false,
    

## N-gram contexts

In [152]:
%%capture
#cell magic above suppresses useless output of file.write

fourgrams_switchboard_alpha = sorted(list(set(fourgram_contexts)))

fourgrams_switchboard_fn = 'swbd2003_fourgram_contexts.txt'

fourgrams_switchboard_fp = os.path.join(switchboard_lm_dir, fourgrams_switchboard_fn)

with open(fourgrams_switchboard_fp, 'w') as file:
    for w in fourgrams_switchboard_alpha:
        file.write("{0}\n".format(w));
        
len(fourgrams_switchboard_alpha)

In [153]:
%%capture
#cell magic above suppresses useless output of file.write

trigrams_switchboard_alpha = sorted(list(set(trigram_contexts)))

trigrams_switchboard_fn = 'swbd2003_trigram_contexts.txt'

trigrams_switchboard_fp = os.path.join(switchboard_lm_dir, trigrams_switchboard_fn)

with open(trigrams_switchboard_fp, 'w') as file:
    for w in trigrams_switchboard_alpha:
        file.write("{0}\n".format(w));
        
len(trigrams_switchboard_alpha)

In [154]:
%%capture
#cell magic above suppresses useless output of file.write

bigrams_switchboard_alpha = sorted(list(set(bigram_contexts)))

bigrams_switchboard_fn = 'swbd2003_bigram_contexts.txt'

bigrams_switchboard_fp = os.path.join(switchboard_lm_dir, bigrams_switchboard_fn)

with open(bigrams_switchboard_fp, 'w') as file:
    for w in bigrams_switchboard_alpha:
        file.write("{0}\n".format(w));
        
len(bigrams_switchboard_alpha)

In [155]:
%%capture
#cell magic above suppresses useless output of file.write

unigrams_switchboard_alpha = sorted(list(set(unigram_contexts)))

unigrams_switchboard_fn = 'swbd2003_unigram_contexts.txt'

unigrams_switchboard_fp = os.path.join(switchboard_lm_dir, unigrams_switchboard_fn)

with open(unigrams_switchboard_fp, 'w') as file:
    for w in unigrams_switchboard_alpha:
        file.write("{0}\n".format(w));
        
len(unigrams_switchboard_alpha)

In [156]:
total_num_contexts = len(unigrams_switchboard_alpha) + len(bigrams_switchboard_alpha) + len(trigrams_switchboard_alpha) + len(fourgrams_switchboard_alpha)
total_num_contexts

1958397

In [157]:
len(sorted_vocabulary)

27559