Skip to content

Commit

Permalink
chore: classmethod again
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Dec 23, 2018
1 parent feca973 commit 45c4930
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 63 deletions.
4 changes: 2 additions & 2 deletions src/cophi/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ def export(dtm, filepath, format="text"):
format: File format.
"""
if format.lower() in {"plaintext", "text"}:
cophi.utils.export_plaintext(dtm, filepath)
cophi.model.Corpus.export_plaintext(dtm, filepath)
elif format.lower() in {"svmlight"}:
cophi.utils.export_svmlight(dtm, filepath)
cophi.model.Corpus.export_svmlight(dtm, filepath)
else:
raise ValueError("'{}' is no supported file format.".format(format))
36 changes: 36 additions & 0 deletions src/cophi/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,42 @@ def orlov_z(self, max_iterations=100, min_tolerance=1):
max_iterations,
min_tolerance)

@classmethod
def svmlight(cls, dtm, filepath):
    """Export corpus to SVMLight format.

    Every row of the document-term matrix is written as one line of the
    form ``<title> <title> <word>:<freq> <word>:<freq> ...``.

    Parameters:
        dtm: Document-term matrix. Row labels are used as titles,
            column labels as words.
        filepath: Path to output file (written as UTF-8).
    """
    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
        for title, document in dtm.iterrows():
            # Drop types with missing or zero frequencies. ``dropna()``
            # alone keeps explicit zeros, which would be written as
            # ``word:0``.
            # ``Series.items()`` replaces ``iteritems()``, which was
            # removed in pandas 2.0.
            counts = ((word, int(freq))
                      for word, freq in document.dropna().items())
            features = ["{word}:{freq}".format(word=word, freq=freq)
                        for word, freq in counts if freq]
            export = "{title} {title} {features}\n".format(
                title=title, features=" ".join(features))
            file.write(export)

@classmethod
def plaintext(cls, dtm, filepath):
    """Export corpus to plain text format.

    Every row of the document-term matrix is written as one line of the
    form ``<title> <title> <words>``, where each word is repeated as
    often as its frequency.

    Parameters:
        dtm: Document-term matrix. Row labels are used as titles,
            column labels as words.
        filepath: Path to output file (written as UTF-8).
    """
    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
        for title, document in dtm.iterrows():
            # Drop types with missing or zero frequencies. A zero count
            # would contribute an empty string and leave stray double
            # spaces in the output line.
            # ``Series.items()`` replaces ``iteritems()``, which was
            # removed in pandas 2.0.
            counts = ((word, int(freq))
                      for word, freq in document.dropna().items())
            features = [" ".join([word] * freq)
                        for word, freq in counts if freq > 0]
            export = "{title} {title} {features}\n".format(
                title=title, features=" ".join(features))
            file.write(export)


class Metadata(pd.DataFrame):
"""Handle corpus metadata.
Expand Down
41 changes: 2 additions & 39 deletions src/cophi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import collections
import itertools
import pathlib

import pandas as pd
import regex as re
Expand All @@ -22,8 +21,8 @@ def construct_ngrams(tokens, n=2, sep=" "):
"""
return (sep.join(ngram)
for ngram in zip(*(itertools.islice(i, token, None)
for token, i in enumerate(itertools.tee(tokens,
n)))))
for token, i in enumerate(itertools.tee(tokens,
n)))))


def find_tokens(document, token_pattern=r"\p{L}+\p{P}?\p{L}+", maximum=None):
Expand Down Expand Up @@ -113,39 +112,3 @@ def _parameter(tokens, measure):
"freq_spectrum": pd.Series(freq_spectrum)}
else:
return {"num_types": len(set(tokens)), "num_tokens": len(tokens)}


def export_svmlight(dtm, filepath):
    """Export document-term matrix to SVMLight format.

    Every row of the matrix is written as one line of the form
    ``<title> <title> <word>:<freq> <word>:<freq> ...``.

    Parameters:
        dtm: Document-term matrix. Row labels are used as titles,
            column labels as words.
        filepath: Path to output file (written as UTF-8).
    """
    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
        for title, document in dtm.iterrows():
            # Drop types with missing or zero frequencies. ``dropna()``
            # alone keeps explicit zeros, which would be written as
            # ``word:0``.
            # ``Series.items()`` replaces ``iteritems()``, which was
            # removed in pandas 2.0.
            counts = ((word, int(freq))
                      for word, freq in document.dropna().items())
            features = ["{word}:{freq}".format(word=word, freq=freq)
                        for word, freq in counts if freq]
            export = "{title} {title} {features}\n".format(
                title=title, features=" ".join(features))
            file.write(export)


def export_plaintext(dtm, filepath):
    """Export document-term matrix to plain text format.

    Every row of the matrix is written as one line of the form
    ``<title> <title> <words>``, where each word is repeated as often as
    its frequency.

    Parameters:
        dtm: Document-term matrix. Row labels are used as titles,
            column labels as words.
        filepath: Path to output file (written as UTF-8).
    """
    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
        # Iterate over the ``dtm`` parameter; the previous code referenced
        # an undefined name ``matrix`` and raised NameError on every call.
        for title, document in dtm.iterrows():
            # Drop types with missing or zero frequencies. A zero count
            # would contribute an empty string and leave stray double
            # spaces in the output line.
            # ``Series.items()`` replaces ``iteritems()``, which was
            # removed in pandas 2.0.
            counts = ((word, int(freq))
                      for word, freq in document.dropna().items())
            features = [" ".join([word] * freq)
                        for word, freq in counts if freq > 0]
            export = "{title} {title} {features}\n".format(
                title=title, features=" ".join(features))
            file.write(export)
14 changes: 14 additions & 0 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,17 @@ def test_herdan_vm(self, corpus):

def test_orlov_z(self, corpus):
    """Check ``orlov_z`` after one iteration against the reference value."""
    expected = 7.461820552205992
    result = corpus.orlov_z(max_iterations=1)
    assert result == expected

def test_svmlight(self, corpus):
    """Export the corpus fixture to SVMLight and compare the file content."""
    expected = "document document a:3 b:2 c:3 d:1 e:1 f:1\n"
    target = pathlib.Path("corpus.svmlight")
    cophi.model.Corpus.svmlight(corpus.dtm, target)
    assert target.exists()
    content = target.read_text(encoding="utf-8")
    assert content == expected

def test_plaintext(self, corpus):
    """Export the corpus fixture to plain text and compare the file content."""
    expected = "document document a a a b b c c c d e f\n"
    target = pathlib.Path("corpus.txt")
    cophi.model.Corpus.plaintext(corpus.dtm, target)
    assert target.exists()
    content = target.read_text(encoding="utf-8")
    assert content == expected
22 changes: 0 additions & 22 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,6 @@
DOCUMENT = PARAGRAPHS[0][0]
TOKENS = DOCUMENT.split(" ")

@pytest.fixture
def document():
    """Provide a ``cophi.model.Document`` built from the module-level DOCUMENT."""
    title = "document"
    pattern = r"\w"
    return cophi.model.Document(DOCUMENT, title, pattern)

@pytest.fixture
def corpus(document):
    """Provide a ``cophi.model.Corpus`` holding only the ``document`` fixture."""
    documents = [document]
    return cophi.model.Corpus(documents)

def test_construct_ngrams():
ngrams = cophi.utils.construct_ngrams(TOKENS)
assert list(ngrams) == ["A B", "B C", "C D", "D E", "E F"]
Expand Down Expand Up @@ -43,17 +35,3 @@ def test_parameter():
assert len(parameter) == 2
parameter = cophi.utils._parameter(TOKENS, "ttr")
assert len(parameter) == 2

def test_svmlight(self, corpus):
    """Export the corpus fixture via ``export_svmlight`` and compare the file."""
    expected = "document document a:1 b:1 c:1 d:1 e:1 f:1\n"
    target = pathlib.Path("corpus.svmlight")
    cophi.utils.export_svmlight(corpus.dtm, target)
    assert target.exists()
    content = target.read_text(encoding="utf-8")
    assert content == expected

def test_plaintext(self, corpus):
    """Export the corpus fixture via ``export_plaintext`` and compare the file."""
    expected = "document document a b c d e f\n"
    target = pathlib.Path("corpus.txt")
    cophi.utils.export_plaintext(corpus.dtm, target)
    assert target.exists()
    content = target.read_text(encoding="utf-8")
    assert content == expected

0 comments on commit 45c4930

Please sign in to comment.