chore: classmethod again

cophi-wue · Dec 23, 2018 · 45c4930 · 45c4930
1 parent feca973
commit 45c4930
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 63 deletions.
diff --git a/src/cophi/api.py b/src/cophi/api.py
@@ -93,8 +93,8 @@ def export(dtm, filepath, format="text"):
         format: File format.
     """
     if format.lower() in {"plaintext", "text"}:
-        cophi.utils.export_plaintext(dtm, filepath)
+        cophi.model.Corpus.plaintext(dtm, filepath)
     elif format.lower() in {"svmlight"}:
-        cophi.utils.export_svmlight(dtm, filepath)
+        cophi.model.Corpus.svmlight(dtm, filepath)
     else:
         raise ValueError("'{}' is no supported file format.".format(format))
diff --git a/src/cophi/model.py b/src/cophi/model.py
@@ -677,6 +677,42 @@ def orlov_z(self, max_iterations=100, min_tolerance=1):
                                         max_iterations,
                                         min_tolerance)
 
+    @classmethod
+    def svmlight(cls, dtm, filepath):
+        """Write one ``title title word:freq ...`` line per row of ``dtm``.
+
+        Parameters:
+            dtm: Document-term matrix; rows are iterated via ``iterrows()``.
+            filepath: Path to the UTF-8 output file (overwritten).
+        """
+        with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
+            for title, document in dtm.iterrows():
+                # dropna() removes NaN cells only, not literal zeros;
+                # Series.items() also works on pandas >= 2.0.
+                features = " ".join(
+                    "{word}:{freq}".format(word=word, freq=int(freq))
+                    for word, freq in document.dropna().items())
+                file.write("{title} {title} {features}\n".format(
+                    title=title, features=features))
+
+    @classmethod
+    def plaintext(cls, dtm, filepath):
+        """Write one ``title title word word ...`` line per row of ``dtm``.
+
+        Parameters:
+            dtm: Document-term matrix; rows are iterated via ``iterrows()``.
+            filepath: Path to the UTF-8 output file (overwritten).
+        """
+        with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
+            for title, document in dtm.iterrows():
+                # dropna() removes NaN cells only, not literal zeros;
+                # each word is repeated int(freq) times in the output.
+                features = " ".join(
+                    " ".join([word] * int(freq))
+                    for word, freq in document.dropna().items())
+                file.write("{title} {title} {features}\n".format(
+                    title=title, features=features))
+
 
 class Metadata(pd.DataFrame):
     """Handle corpus metadata.

diff --git a/src/cophi/utils.py b/src/cophi/utils.py
@@ -7,7 +7,6 @@
 
 import collections
 import itertools
-import pathlib
 
 import pandas as pd
 import regex as re
@@ -22,8 +21,8 @@ def construct_ngrams(tokens, n=2, sep=" "):
     """
     return (sep.join(ngram)
             for ngram in zip(*(itertools.islice(i, token, None)
-                               for token, i in enumerate(itertools.tee(tokens,
-                                                                       n)))))
+                             for token, i in enumerate(itertools.tee(tokens,
+                                                                     n)))))
 
 
 def find_tokens(document, token_pattern=r"\p{L}+\p{P}?\p{L}+", maximum=None):
@@ -113,39 +112,3 @@ def _parameter(tokens, measure):
                 "freq_spectrum": pd.Series(freq_spectrum)}
     else:
         return {"num_types": len(set(tokens)), "num_tokens": len(tokens)}
-
-
-def export_svmlight(dtm, filepath):
-    """Export document-term matrix to SVMLight format.
-
-    Parameters:
-        dtm: Document-term matrix.
-        filepath: Path to output file.
-    """
-    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
-        for title, document in dtm.iterrows():
-            # Drop types with zero frequencies:
-            document = document.dropna()
-            features = ["{word}:{freq}".format(word=word, freq=int(
-                freq)) for word, freq in document.iteritems()]
-            export = "{title} {title} {features}\n".format(
-                title=title, features=" ".join(features))
-            file.write(export)
-
-
-def export_plaintext(dtm, filepath):
-    """Export document-term matrix to plain text format.
-
-    Parameters:
-        dtm: Document-term matrix.
-        filepath: Path to output file.
-    """
-    with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
-        for title, document in matrix.iterrows():
-            # Drop types with zero frequencies:
-            document = document.dropna()
-            features = [" ".join([word] * int(freq))
-                        for word, freq in document.iteritems()]
-            export = "{title} {title} {features}\n".format(
-                title=title, features=" ".join(features))
-            file.write(export)
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -286,3 +286,17 @@ def test_herdan_vm(self, corpus):
 
     def test_orlov_z(self, corpus):
         assert corpus.orlov_z(max_iterations=1) == 7.461820552205992
+
+    def test_svmlight(self, corpus):
+        target = pathlib.Path("corpus.svmlight")
+        cophi.model.Corpus.svmlight(corpus.dtm, target)
+        assert target.exists()  # the export must have created the file
+        expected = "document document a:3 b:2 c:3 d:1 e:1 f:1\n"
+        assert target.read_text(encoding="utf-8") == expected
+
+    def test_plaintext(self, corpus):
+        target = pathlib.Path("corpus.txt")
+        cophi.model.Corpus.plaintext(corpus.dtm, target)
+        assert target.exists()  # the export must have created the file
+        expected = "document document a a a b b c c c d e f\n"
+        assert target.read_text(encoding="utf-8") == expected
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -7,14 +7,6 @@
 DOCUMENT = PARAGRAPHS[0][0]
 TOKENS = DOCUMENT.split(" ")
 
-@pytest.fixture
-def document():
-    return cophi.model.Document(DOCUMENT, "document", r"\w")
-
-@pytest.fixture
-def corpus(document):
-    return cophi.model.Corpus([document])
-
 def test_construct_ngrams():
     ngrams = cophi.utils.construct_ngrams(TOKENS)
     assert list(ngrams) == ["A B", "B C", "C D", "D E", "E F"]
@@ -43,17 +35,3 @@ def test_parameter():
     assert len(parameter) == 2
     parameter = cophi.utils._parameter(TOKENS, "ttr")
     assert len(parameter) == 2
-
-def test_svmlight(self, corpus):
-    output = pathlib.Path("corpus.svmlight")
-    cophi.utils.export_svmlight(corpus.dtm, output)
-    assert output.exists()
-    with output.open("r", encoding="utf-8") as file:
-        assert file.read() == "document document a:1 b:1 c:1 d:1 e:1 f:1\n"
-
-def test_plaintext(self, corpus):
-    output = pathlib.Path("corpus.txt")
-    cophi.utils.export_plaintext(corpus.dtm, output)
-    assert output.exists()
-    with output.open("r", encoding="utf-8") as file:
-        assert file.read() == "document document a b c d e f\n"