Commit ad026a0
release: v1.0.8
severinsimmler committed Dec 23, 2018
2 parents ff5fc06 + 9c12e8b commit ad026a0
Showing 4 changed files with 66 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 0, 7)
+VERSION = (1, 0, 8)

__version__ = ".".join(map(str, VERSION))
7 changes: 4 additions & 3 deletions src/cophi/api.py
@@ -33,7 +33,7 @@ def document(filepath, **kwargs):
return cophi.model.Document(textfile.content, **kwargs)


def corpus(directory, filepath_pattern="*.*", treat_as=None, encoding="utf-8",
def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
lowercase=True, n=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
maximum=None):
"""Pipe a collection of text files and create a Corpus object.
@@ -53,11 +53,12 @@ def corpus(directory, filepath_pattern="*.*", treat_as=None, encoding="utf-8",
"""
if not isinstance(directory, pathlib.Path):
directory = pathlib.Path(directory)
-    filepaths = directory.glob(filepath_pattern)
+    filepaths = directory.rglob(filepath_pattern)

def lazy_reading(filepaths):
for filepath in filepaths:
-            yield cophi.model.Textfile(filepath, treat_as, encoding)
+            if filepath.is_file() and ".git" not in str(filepath):
+                yield cophi.model.Textfile(filepath, treat_as, encoding)

metadata = cophi.model.Metadata()
documents = pd.Series()
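
Taken together, the api.py changes make corpus() walk subdirectories (rglob() with the broader "*" default instead of glob("*.*")) while the new is_file() check keeps directories and anything under .git out of the pipeline. A minimal usage sketch, assuming corpus() is re-exported at package level and returns the Corpus together with its Metadata, as the setup above suggests; the directory path is hypothetical:

    import cophi

    # rglob() now also finds text files nested in subdirectories.
    corpus, metadata = cophi.corpus("data/novels", filepath_pattern="*.txt")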
53 changes: 45 additions & 8 deletions src/cophi/model.py
@@ -38,6 +38,7 @@ class Textfile:
parent (str): Parent path of text file.
        encoding (str): Encoding to use when reading the file.
"""

def __init__(self, filepath, treat_as=None, encoding="utf-8"):
if isinstance(filepath, str):
filepath = pathlib.Path(filepath)
@@ -46,7 +47,7 @@ def __init__(self, filepath, treat_as=None, encoding="utf-8"):
self.suffix = self.filepath.suffix
self.parent = str(self.filepath.parent)
self.encoding = encoding
-        if treat_as and treat_as not in {".txt", ".xml"}:
+        if treat_as is not None and treat_as not in {".txt", ".xml"}:
raise ValueError("The file format '{}' is not supported. "
"Try '.txt', or '.xml'.".format(treat_as))
else:
@@ -83,13 +84,9 @@ def stringify(tree):
def content(self):
"""Content of text file.
"""
-        if (not self.treat_as) and\
-                (self.suffix == ".txt") or\
-                (self.treat_as == ".txt"):
+        if (self.treat_as is None and self.suffix == ".txt") or (self.treat_as == ".txt"):
             return self.filepath.read_text(encoding=self.encoding)
-        elif ((not self.treat_as) and
-              (self.suffix == ".xml") or
-              (self.treat_as == ".xml")):
+        elif (self.treat_as is None and self.suffix == ".xml") or (self.treat_as == ".xml"):
tree = self.parse_xml()
return self.stringify(tree)
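
The rewritten conditions also drop a readability trap: the old chain relied on Python's "and" binding tighter than "or", and on the falsiness of treat_as=None. The new form makes the grouping explicit: with treat_as=None the suffix decides, otherwise treat_as wins. A short sketch of the dispatch, with hypothetical filenames:

    import cophi

    # treat_as=None: the suffix decides how the file is read.
    plain = cophi.model.Textfile("novel.txt").content    # read_text()
    parsed = cophi.model.Textfile("novel.xml").content   # parsed, then stringified

    # treat_as overrides the suffix, e.g. to keep raw XML markup verbatim:
    raw = cophi.model.Textfile("novel.xml", treat_as=".txt").content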

@@ -120,6 +117,7 @@ class Document:
        maximum (int): Stop tokenizing after that many tokens.
tokens (list): Tokenized content of the document.
"""

def __init__(self, text, title=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
lowercase=True, n=None, maximum=None):
self.text = text
@@ -342,6 +340,7 @@ class Corpus:
dtm (pd.DataFrame): Document-term matrix with absolute
word frequencies.
"""

def __init__(self, documents, sparse=False):
if sparse:
raise NotImplementedError("This feature is not yet "
@@ -354,6 +353,7 @@ def __init__(self, documents, sparse=False):
else:
matrix = pd.DataFrame
self.documents = documents

def count_corpus(documents):
corpus = dict()
for document in documents:
@@ -363,7 +363,7 @@ def count_corpus(documents):
counts = count_corpus(self.documents)
logger.info("Constructing document-term matrix...")
self.dtm = matrix(counts)
-        self.dtm = self.dtm.T.fillna(0).astype(int)
+        self.dtm = self.dtm.T
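
Dropping fillna(0).astype(int) keeps types that are absent from a document as NaN in the document-term matrix; the exporters added below rely on this to dropna() per row instead of filtering zero counts. A minimal pandas sketch of the resulting shape, with made-up counts:

    import pandas as pd

    counts = {"doc1": {"a": 3, "b": 2}, "doc2": {"a": 1, "c": 4}}
    dtm = pd.DataFrame(counts).T  # rows are documents, columns are types
    # Absent types stay NaN rather than 0:
    #        a    b    c
    # doc1  3.0  2.0  NaN
    # doc2  1.0  NaN  4.0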

@staticmethod
def map_metadata(data, metadata, uuid="uuid", fields=["title"], sep="_"):
@@ -677,11 +677,48 @@ def orlov_z(self, max_iterations=100, min_tolerance=1):
max_iterations,
min_tolerance)

@staticmethod
def svmlight(dtm, filepath):
"""Export corpus to SVMLight format.
Parameters:
dtm: Document-term matrix.
filepath: Path to output file.
"""
with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
for title, document in dtm.iterrows():
# Drop types with zero frequencies:
document = document.dropna()
features = ["{word}:{freq}".format(word=word, freq=int(
freq)) for word, freq in document.iteritems()]
export = "{title} {title} {features}\n".format(
title=title, features=" ".join(features))
file.write(export)

@staticmethod
def plaintext(dtm, filepath):
"""Export corpus to plain text format.
Parameters:
dtm: Document-term matrix.
filepath: Path to output file.
"""
with pathlib.Path(filepath).open("w", encoding="utf-8") as file:
for title, document in dtm.iterrows():
# Drop types with zero frequencies:
document = document.dropna()
features = [" ".join([word] * int(freq))
for word, freq in document.iteritems()]
export = "{title} {title} {features}\n".format(
title=title, features=" ".join(features))
file.write(export)
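
Both exporters walk the DTM row by row and, thanks to the NaN-preserving matrix above, write only the types that actually occur in each document; per the format string, each line starts with the title twice. A usage sketch with arbitrary output paths, matching the expected strings in the tests below:

    # Given a Corpus instance built as above:
    corpus.svmlight(corpus.dtm, "corpus.svmlight")
    # one line per document: "<title> <title> word:freq word:freq ..."

    corpus.plaintext(corpus.dtm, "corpus.txt")
    # one line per document: "<title> <title> word word word ..."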


class Metadata(pd.DataFrame):
"""Handle corpus metadata.
Feel free to implement some fancy stuff here.
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
16 changes: 16 additions & 0 deletions tests/test_model.py
@@ -1,3 +1,5 @@
import pathlib

import pytest
import lxml
import numpy as np
@@ -284,3 +286,17 @@ def test_herdan_vm(self, corpus):

def test_orlov_z(self, corpus):
assert corpus.orlov_z(max_iterations=1) == 7.461820552205992

def test_svmlight(self, corpus):
output = pathlib.Path("corpus.svmlight")
corpus.svmlight(corpus.dtm, output)
assert output.exists()
with output.open("r", encoding="utf-8") as file:
assert file.read() == "document document a:3 b:2 c:3 d:1 e:1 f:1\n"

def test_plaintext(self, corpus):
output = pathlib.Path("corpus.txt")
corpus.plaintext(corpus.dtm, output)
assert output.exists()
with output.open("r", encoding="utf-8") as file:
assert file.read() == "document document a a a b b c c c d e f\n"
