Skip to content

Commit

Permalink
chore: update ngrams
Browse files (browse the repository at this point in the history)
  • Loading branch information
severinsimmler committed Sep 1, 2018
1 parent a122d1b commit 6e7a46a
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/cophi/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import collections
import itertools

import pandas as pd
import regex as re
Expand All @@ -18,7 +19,9 @@ def construct_ngrams(tokens, n=2, sep=" "):
n (int): Number of tokens per ngram.
sep (str): Separator between tokens within an ngram.
"""
return (sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)]))
return (sep.join(ngram)
for ngram in zip(*(itertools.islice(i, token, None)
for token, i in enumerate(itertools.tee(tokens, 2)))))

def find_tokens(document, token_pattern=r"\p{L}+\p{P}?\p{L}+", maximum=None):
"""
Expand Down

0 comments on commit 6e7a46a

Please sign in to comment.