Skip to content

Commit

Permalink
Cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
dimazest committed Nov 19, 2013
1 parent 31f9637 commit 24d28c8
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 44 deletions.
28 changes: 3 additions & 25 deletions fowler/switchboard/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
from collections import Counter

from .util import (
tokens,
ContextBefore,
WordUtterance,
writer,
)
from .util import WordUtterance, writer
from .options import Dispatcher


Expand Down Expand Up @@ -49,23 +44,6 @@ def tags(


@writer(command)
def word_document(utterances, ngram_len):
def word_document(utterances, ngram_len, verbose):
"""Word document."""
return WordUtterance(utterances, ngram_len=ngram_len)


@writer(command)
def inner(utterances, ngram_len):
return tokens(utterances, n=ngram_len)


@writer(
command,
extra_options=(
('c', 'context-len', 3, 'Length of the context in "before mode.'),
)
)
def before(utterances, ngram_len, context_len):
return ContextBefore(utterances, context_len, ngram_len=ngram_len)


return WordUtterance(utterances, ngram_len=ngram_len, verbose=verbose)
3 changes: 3 additions & 0 deletions fowler/switchboard/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def utterances_iter():
if 'corpus' in f_args:
kwargs['corpus'] = corpus

if 'limit' in f_args:
kwargs['limit'] = limit

return func(*args, **kwargs)

return wrapper
52 changes: 33 additions & 19 deletions fowler/switchboard/util.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
import sys
from itertools import chain
from collections import Counter, deque
from functools import wraps

from .io import write_cooccurrence_matrix


def tokens(utterances, n=1):
for utterance in utterances:
ngram = deque([], n)
for w, _ in utterance.pos_lemmas():
ngram.append(w)
yield utterance.act_tag, '_'.join(ngram)
def utterance_ngrams(utterance, ngram_len=1):
ngram = deque(['<BEGIN>'], ngram_len)

words = chain(utterance.pos_words(), ['<END>'])

def ContextBefore(utterances, context_len=3, ngram_len=1):
context = deque([], context_len)
for w in words:
ngram.append(w)

for utterance in utterances:
context.append(utterance)
yield '_'.join(ngram)

for token in tokens(context, ngram_len):
yield token


def WordUtterance(utterances, ngram_len, verbose=False):
    """Yield ``(ngram, document_id)`` pairs, one document per utterance.

    The document id is the zero-based position of the utterance in
    ``utterances``.

    :param utterances: iterable of utterance objects accepted by
        ``utterance_ngrams``.
    :param ngram_len: n-gram length, forwarded to ``utterance_ngrams``.
    :param verbose: when true, write the document id and its n-grams to
        standard error before the pairs for that document are yielded.
    """
    for document_id, utterance in enumerate(utterances):
        # utterance_ngrams is a generator. Materialise it once: otherwise the
        # verbose dump below exhausts it and nothing is yielded afterwards,
        # and the emptiness check is meaningless because a generator object
        # is always truthy.
        ngrams = list(utterance_ngrams(utterance, ngram_len=ngram_len))

        if verbose:
            sys.stderr.write(
                'Document id: {document_id}\n'
                'Words: {ngrams}\n'
                '\n'.format(
                    document_id=document_id,
                    ngrams=' '.join(ngrams),
                )
            )

        # TODO: it would be nice to treat utterances that don't
        # contain any word differently.
        # NOTE(review): utterance_ngrams chains an '<END>' marker after the
        # utterance's words, so this list looks like it can never be empty --
        # confirm whether the check should be made on the words instead.
        if not ngrams:
            yield '<NON_VERBAL>', document_id
        for ngram in ngrams:
            yield ngram, document_id


def writer(
Expand All @@ -40,7 +46,8 @@ def writer(
def wrapper(f):
options = extra_options + (
('n', 'ngram_len', 1, 'Length of the tokens (bigrams, ngrams).'),
('o', 'output', 'out.h5', 'The output file.'),
('o', 'output', 'swda-{limit}items-{ngram_len}gram.h5', 'The output file.'),
('v', 'verbose', False, 'Be verbose.'),
)

@command(options=options)
Expand All @@ -49,9 +56,16 @@ def wrapped(
utterances_iter,
output,
corpus,
limit,
**context
):
counter = Counter(f(utterances_iter(), **context))

output = output.format(
limit=limit,
ngram_len=context['ngram_len'],
)

return write_cooccurrence_matrix(counter, output, utterances_iter())

return wrapped
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def run_tests(self):
'pandas',
'python-dateutil',
'scikit-learn',
'scipy',
'setuptools',
'tables',
],
Expand Down

0 comments on commit 24d28c8

Please sign in to comment.