Commit ad026a0
release: v1.0.8
severinsimmler committed Dec 23, 2018
2 parents ff5fc06 + 9c12e8b commit ad026a0
Showing 4 changed files with 66 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 7)
+VERSION = (1, 0, 8)

__version__ = ".".join(map(str, VERSION))
7 changes: 4 additions & 3 deletions src/cophi/api.py
@@ -33,7 +33,7 @@ def document(filepath, **kwargs):
return cophi.model.Document(textfile.content, **kwargs)


def corpus(directory, filepath_pattern="*.*", treat_as=None, encoding="utf-8",
def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
lowercase=True, n=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
maximum=None):
"""Pipe a collection of text files and create a Corpus object.
@@ -53,11 +53,12 @@ def corpus(directory, filepath_pattern="*.*", treat_as=None, encoding="utf-8",
"""
if not isinstance(directory, pathlib.Path):
directory = pathlib.Path(directory)
-    filepaths = directory.glob(filepath_pattern)
+    filepaths = directory.rglob(filepath_pattern)

def lazy_reading(filepaths):
for filepath in filepaths:
-            yield cophi.model.Textfile(filepath, treat_as, encoding)
+            if filepath.is_file() and ".git" not in str(filepath):
+                yield cophi.model.Textfile(filepath, treat_as, encoding)

metadata = cophi.model.Metadata()
documents = pd.Series()
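
Taken together, the api.py changes make corpus() walk subdirectories (rglob() with the broader "*" default instead of glob("*.*")) while the new is_file() check keeps directories and anything under .git out of the pipeline. A minimal usage sketch, assuming corpus() is re-exported at package level and returns the Corpus together with its Metadata, as the setup above suggests; the directory path is hypothetical:

    import cophi

    # rglob() now also finds text files nested in subdirectories.
    corpus, metadata = cophi.corpus("data/novels", filepath_pattern="*.txt")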
53 changes: 45 additions & 8 deletions src/cophi/model.py
@@ -38,6 +38,7 @@ class Textfile:
parent (str): Parent path of text file.
        encoding (str): Encoding to use when reading the file.
"""

def __init__(self, filepath, treat_as=None, encoding="utf-8"):
if isinstance(filepath, str):
filepath = pathlib.Path(filepath)
@@ -46,7 +47,7 @@ def __init__(self, filepath, treat_as=None, encoding="utf-8"):
self.suffix = self.filepath.suffix
self.parent = str(self.filepath.parent)
self.encoding = encoding
-        if treat_as and treat_as not in {".txt", ".xml"}:
+        if treat_as is not None and treat_as not in {".txt", ".xml"}:
raise ValueError("The file format '{}' is not supported. "
"Try '.txt', or '.xml'.".format(treat_as))
else:
@@ -83,13 +84,9 @@ def stringify(tree):
def content(self):
"""Content of text file.
"""
-        if (not self.treat_as) and\
-                (self.suffix == ".txt") or\
-                (self.treat_as == ".txt"):
+        if (self.treat_as is None and self.suffix == ".txt") or (self.treat_as == ".txt"):
             return self.filepath.read_text(encoding=self.encoding)
-        elif ((not self.treat_as) and
-              (self.suffix == ".xml") or
-              (self.treat_as == ".xml")):
+        elif (self.treat_as is None and self.suffix == ".xml") or (self.treat_as == ".xml"):
tree = self.parse_xml()
return self.stringify(tree)
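
The rewritten conditions also drop a readability trap: the old chain relied on Python's "and" binding tighter than "or", and on the falsiness of treat_as=None. The new form makes the grouping explicit: with treat_as=None the suffix decides, otherwise treat_as wins. A short sketch of the dispatch, with hypothetical filenames:

    import cophi

    # treat_as=None: the suffix decides how the file is read.
    plain = cophi.model.Textfile("novel.txt").content    # read_text()
    parsed = cophi.model.Textfile("novel.xml").content   # parsed, then stringified

    # treat_as overrides the suffix, e.g. to keep raw XML markup verbatim:
    raw = cophi.model.Textfile("novel.xml", treat_as=".txt").content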

@@ -120,6 +117,7 @@ class Document:
        maximum (int): Stop tokenizing after that many tokens.
tokens (list): Tokenized content of the document.
"""

def __init__(self, text, title=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
lowercase=True, n=None, maximum=None):
self.text = text
@@ -342,6 +340,7 @@ class Corpus:
dtm (pd.DataFrame): Document-term matrix with absolute
word frequencies.
"""

def __init__(self, documents, sparse=False):
if sparse:
raise NotImplementedError("This feature is not yet "
@@ -354,6 +353,7 @@ def __init__(self, documents, sparse=False):
else:
matrix = pd.DataFrame
self.documents = documents

def count_corpus(documents):
corpus = dict()
for document in documents:
@@ -363,7 +363,7 @@ def count_corpus(documents):
counts = count_corpus(self.documents)
logger.info("Constructing document-term matrix...")
self.dtm = matrix(counts)
-        self.dtm = self.dtm.T.fillna(0).astype(int)
+        self.dtm = self.dtm.T
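
Dropping fillna(0).astype(int) keeps types that are absent from a document as NaN in the document-term matrix; the exporters added below rely on this to dropna() per row instead of filtering zero counts. A minimal pandas sketch of the resulting shape, with made-up counts:

    import pandas as pd

    counts = {"doc1": {"a": 3, "b": 2}, "doc2": {"a": 1, "c": 4}}
    dtm = pd.DataFrame(counts).T  # rows are documents, columns are types
    # Absent types stay NaN rather than 0:
    #        a    b    c
    # doc1  3.0  2.0  NaN
    # doc2  1.0  NaN  4.0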

@staticmethod
def map_metadata(data, metadata, uuid="uuid", fields=["title"], sep="_"):
@@ -677,11 +677,48 @@ def orlov_z(self, max_iterations=100, min_tolerance=1):
max_iterations,
min_tolerance)

@staticmethod
def svmlight(dtm, filepath):
"""Export corpus to SVMLight format.
Parameters:
dtm: Document-term matrix.
filepath: Path to output file.
"""
with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
for title, document in dtm.iterrows():
# Drop types with zero frequencies:
document = document.dropna()
features = ["{word}:{freq}".format(word=word, freq=int(
freq)) for word, freq in document.iteritems()]
export = "{title} {title} {features}\n".format(
title=title, features=" ".join(features))
file.write(export)

@staticmethod
def plaintext(dtm, filepath):
"""Export corpus to plain text format.
Parameters:
dtm: Document-term matrix.
filepath: Path to output file.
"""
with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
for title, document in dtm.iterrows():
# Drop types with zero frequencies:
document = document.dropna()
features = [" ".join([word] * int(freq))
for word, freq in document.iteritems()]
export = "{title} {title} {features}\n".format(
title=title, features=" ".join(features))
file.write(export)
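
Both exporters walk the DTM row by row and, thanks to the NaN-preserving matrix above, write only the types that actually occur in each document; per the format string, each line starts with the title twice. A usage sketch with arbitrary output paths, matching the expected strings in the tests below:

    # Given a Corpus instance built as above:
    corpus.svmlight(corpus.dtm, "corpus.svmlight")
    # one line per document: "<title> <title> word:freq word:freq ..."

    corpus.plaintext(corpus.dtm, "corpus.txt")
    # one line per document: "<title> <title> word word word ..."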


class Metadata(pd.DataFrame):
"""Handle corpus metadata.
Feel free to implement some fancy stuff here.
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
16 changes: 16 additions & 0 deletions tests/test_model.py
@@ -1,3 +1,5 @@
import pathlib

import pytest
import lxml
import numpy as np
@@ -284,3 +286,17 @@ def test_herdan_vm(self, corpus):

def test_orlov_z(self, corpus):
assert corpus.orlov_z(max_iterations=1) == 7.461820552205992

def test_svmlight(self, corpus):
output = pathlib.Path("corpus.svmlight")
corpus.svmlight(corpus.dtm, output)
assert output.exists()
with output.open("r", encoding="utf-8") as file:
assert file.read() == "document document a:3 b:2 c:3 d:1 e:1 f:1\n"

def test_plaintext(self, corpus):
output = pathlib.Path("corpus.txt")
corpus.plaintext(corpus.dtm, output)
assert output.exists()
with output.open("r", encoding="utf-8") as file:
assert file.read() == "document document a a a b b c c c d e f\n"
