Skip to content

Commit

Permalink
v1.0.7
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Nov 24, 2018
2 parents d4e302f + d97ee7e commit ff5fc06
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 6)
+VERSION = (1, 0, 7)

__version__ = ".".join(map(str, VERSION))
14 changes: 12 additions & 2 deletions src/cophi/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import collections
import itertools
import logging
import math
import pathlib

Expand All @@ -18,6 +19,8 @@
import cophi.utils
import cophi.complexity

logger = logging.getLogger(__name__)


class Textfile:
"""Model class for a Textfile.
Expand Down Expand Up @@ -351,8 +354,15 @@ def __init__(self, documents, sparse=False):
else:
matrix = pd.DataFrame
self.documents = documents
-self.dtm = matrix({document.title: document.bow
-for document in self.documents})
+def count_corpus(documents):
+corpus = dict()
+for document in documents:
+logger.info("Processing '{}'...".format(document.title))
+corpus[document.title] = document.bow
+return corpus
+counts = count_corpus(self.documents)
+logger.info("Constructing document-term matrix...")
+self.dtm = matrix(counts)
self.dtm = self.dtm.T.fillna(0).astype(int)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion src/cophi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def construct_ngrams(tokens, n=2, sep=" "):
return (sep.join(ngram)
for ngram in zip(*(itertools.islice(i, token, None)
for token, i in enumerate(itertools.tee(tokens,
-2)))))
+n)))))


def find_tokens(document, token_pattern=r"\p{L}+\p{P}?\p{L}+", maximum=None):
Expand Down

0 comments on commit ff5fc06

Please sign in to comment.