
Commit

release: v1.3.0
severinsimmler committed Apr 25, 2019
1 parent 40cd9fd commit 7d9289b
Showing 17 changed files with 380 additions and 144 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -21,3 +21,6 @@ _build
htmlcov
tests.xml
.pytest_cache
+notebooks/ddw-0.4.6.jar
+notebooks/ddw.log
+notebooks/configs/*
20 changes: 8 additions & 12 deletions README.md
@@ -1,15 +1,17 @@
# A library for preprocessing
-`cophi` is a Python library for handling, modeling and processing text corpora. You
-can easily pipe a collection of text files using the high-level API:
+`cophi` is a Python library for handling, modeling and processing text corpora. You can easily pipe a collection of text files using the high-level API:

-```python
+```
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
-                                filepath_pattern="*.txt",
+                                pathname_pattern="**/*.txt",
                                encoding="utf-8",
                                lowercase=True,
                                token_pattern=r"\p{L}+\p{P}?\p{L}+")
```

+You can also plug the [DARIAH-DKPro-Wrapper](https://dariah-de.github.io/DARIAH-DKPro-Wrapper/) into this pipeline to lemmatize text, or just keep certain word types. Check out the introductory [Jupyter notebook](https://github.com/cophi-wue/cophi-toolbox/blob/master/notebooks/introducing-cophi.ipynb).


## Getting started
To install the latest **stable** version:
```
@@ -21,16 +23,10 @@ To install the latest **development** version:
$ pip install --upgrade git+https://github.com/cophi-wue/cophi-toolbox.git@testing
```

-Check out the introductory [Jupyter notebook](https://github.com/cophi-wue/cophi-toolbox/blob/master/notebooks/API.ipynb).
-
-## Contents
-- [`api`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/api.py): High-level API.
-- [`model`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/model.py): Low-level model classes.
-- [`complexity`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/complexity.py): Measures that assess the linguistic and stylistic complexity of (literary) texts.
-- [`utils`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/utils.py): Low-level helper functions.
+## Available complexity measures
+There are also plenty of complexity metrics for measuring the lexical richness of (literary) texts.


-## Available complexity measures
Measures that use sample size and vocabulary size:
* Type-Token Ratio TTR
* Guiraud’s R
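The options surfaced by this release combine with the existing pipeline as follows; a minimal sketch, assuming the wrapper JAR has been downloaded into the working directory, and using an illustrative POS tag list and language code:

```python
import cophi

# `lemma`, `pos`, `jar` and `language` are the parameters introduced in
# this release; the tag list and corpus directory are assumptions.
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
                                filepath_pattern="*.txt",
                                lemma=True,
                                pos=["NN", "NE"],
                                jar="ddw-0.4.6.jar",
                                language="en")
```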
File renamed without changes.
7 changes: 3 additions & 4 deletions setup.py
@@ -72,7 +72,7 @@ def run(self):
        self.status("Pushing git tags ...")
        os.system("git tag v{0}".format(about["__version__"]))
        os.system("git push --tags")

        sys.exit()

setup(
@@ -91,14 +91,13 @@ def run(self):
    include_package_data=True,
    license="Apache 2.0",
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: Implementation :: CPython",
-        "Programming Language :: Python :: Implementation :: PyPy"
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: Implementation :: CPython"
    ],
    cmdclass={
        "upload": UploadCommand,
25 changes: 12 additions & 13 deletions src/cophi/__init__.py
@@ -1,19 +1,17 @@
r"""
**cophi** is a Python library for handling, modeling and processing text
"""
This is a Python library for handling, modeling and processing text
corpora. You can easily pipe a collection of text files using the
high-level API:
.. code-block:: python
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
pathname_pattern="**/*.txt",
encoding="utf-8",
lowercase=True,
n=1,
token_pattern=r"\p{L}+\p{P}?\p{L}+")
```
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
pathname_pattern="**/*.txt",
encoding="utf-8",
lowercase=True,
token_pattern=r"\p{L}+\p{P}?\p{L}+")
```
There are also a plenty of complexity metrics for measuring lexical
There are also a plenty of complexity metrics for measuring the lexical
richness of (literary) texts.
Measures that use sample size and vocabulary size:
@@ -46,4 +44,5 @@
:module:`complexity` module.
"""

-from cophi.api import document, corpus, export
+from cophi.api import document, corpus
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 2, 3)
+VERSION = (1, 3, 0)

__version__ = ".".join(map(str, VERSION))
105 changes: 57 additions & 48 deletions src/cophi/api.py
@@ -6,21 +6,28 @@
"""

import logging
-import pathlib
+from pathlib import Path
import uuid

import pandas as pd
-import cophi.model
+
+from cophi import dkpro, text


logger = logging.getLogger(__name__)


-def document(filepath, **kwargs):
+def document(filepath, lemma=False, pos=None, jar="ddw-0.4.6.jar",
+             language="de", **kwargs):
    """Read a text file and create a Document object.
    Parameters:
        filepath (str): Path to the text file.
-        title (str): Describing title for the document. (optional).
+        lemma (bool): If True, lemmatize text (optional).
+        pos (list): If not None, filter POS tags (optional).
+        jar (str): Path to the DARIAH-DKPro-Wrapper JAR file (optional).
+        language (str): Language of the text (optional).
+        title (str): Descriptive title for the document (optional).
        lowercase (bool): If True, converts all letters to lowercase (optional).
        n (int): Number of tokens per ngram (optional).
        token_pattern (str): Regex pattern for one token (optional).
@@ -29,13 +36,17 @@ def document(filepath, **kwargs):
    Returns:
        A Document object.
    """
-    textfile = cophi.model.Textfile(filepath)
-    return cophi.model.Document(textfile.content, **kwargs)
+    if lemma or pos:
+        return dkpro.pipe(filepath, jar, language, lemma, pos, **kwargs)
+    else:
+        textfile = text.model.Textfile(filepath)
+        return text.model.Document(textfile.content, **kwargs)
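As an illustration of the two branches above (the file path is hypothetical; the DKPro branch assumes the default ddw-0.4.6.jar sits in the working directory):

```python
import cophi

# Plain branch: the text file is read directly, no DKPro involved.
plain = cophi.document("corpus/novel.txt", lowercase=True)

# DKPro branch: triggered as soon as `lemma` or `pos` is set.
lemmatized = cophi.document("corpus/novel.txt", lemma=True, language="de")
```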


-def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
+def corpus(directory, filepath_pattern="*.txt", treat_as=None, encoding="utf-8",
           lowercase=True, n=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
-           maximum=None, metadata=True):
+           maximum=None, metadata=True, lemma=False, pos=None,
+           jar="ddw-0.4.6.jar", language="de"):
"""Pipe a collection of text files and create a Corpus object.
Parameters:
@@ -48,58 +59,56 @@ def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
        token_pattern (str): Regex pattern for one token (optional).
        maximum (int): Stop tokenizing after that many tokens (optional).
        metadata (bool): Extract metadata from filenames (optional).
+        lemma (bool): If True, lemmatize text (optional).
+        pos (list): If not None, filter POS tags (optional).
+        jar (str): Path to the DARIAH-DKPro-Wrapper JAR file (optional).
+        language (str): Language of the text (optional).
    Returns:
        A Corpus model object and optionally a Metadata object.
    """
-    if not isinstance(directory, pathlib.Path):
-        directory = pathlib.Path(directory)
-    filepaths = directory.rglob(filepath_pattern)
+    filepaths = Path(directory).rglob(filepath_pattern)

-    def lazy_reading(filepaths):
+    def lazy_processing(filepaths, **kwargs):
        for filepath in filepaths:
-            if filepath.is_file() and ".git" not in str(filepath):
-                yield cophi.model.Textfile(filepath, treat_as, encoding)
+            logger.info(f"Processing '{filepath.stem}' ...")
+            if filepath.is_file():
+                if lemma or pos:
+                    document = dkpro.pipe(filepath,
+                                          jar,
+                                          language,
+                                          lemma,
+                                          pos,
+                                          **kwargs)
+                else:
+                    textfile = text.model.Textfile(filepath, treat_as, encoding)
+                    document = text.model.Document(textfile.content,
+                                                   textfile.title,
+                                                   **kwargs)
+                yield filepath, document

    if metadata:
-        metadata_ = cophi.model.Metadata()
+        metadata_ = text.model.Metadata()

    documents = pd.Series()
-    for textfile in lazy_reading(filepaths):
-        logger.info("Processing '{}' ...".format(textfile.title))
-        title = str(uuid.uuid1()) if metadata else textfile.title
-        text = textfile.content
-        document = cophi.model.Document(text,
-                                        title,
-                                        token_pattern,
-                                        lowercase,
-                                        n,
-                                        maximum)
-        documents[title] = document
+    for filepath, document in lazy_processing(filepaths,
+                                              token_pattern=token_pattern,
+                                              lowercase=lowercase,
+                                              n=n,
+                                              maximum=maximum):
+        title = document.title
        if metadata:
+            title = str(uuid.uuid1())
+            document.title = title
            metadata_ = metadata_.append({"uuid": title,
-                                          "filepath": textfile.filepath,
-                                          "parent": textfile.parent,
-                                          "title": textfile.title,
-                                          "suffix": textfile.filepath.suffix},
+                                          "filepath": str(filepath),
+                                          "parent": filepath.parent,
+                                          "title": filepath.stem,
+                                          "suffix": filepath.suffix},
                                         ignore_index=True)
+        documents[title] = document
    logger.info("Constructing Corpus object ...")
    if metadata:
-        return cophi.model.Corpus(documents), metadata_
-    else:
-        return cophi.model.Corpus(documents)
-
-
-def export(dtm, filepath, format="text"):
-    """Export a document-term matrix.
-    Parameters:
-        dtm: A document-term matrix.
-        filepath: Path to the output file.
-        format: File format. Possible values are `plaintext`/`text`
-            or `svmlight`.
-    """
-    if format.lower() in {"plaintext", "text"}:
-        cophi.model.Corpus.plaintext(dtm, filepath)
-    elif format.lower() in {"svmlight"}:
-        cophi.model.Corpus.svmlight(dtm, filepath)
-    else:
-        raise ValueError("'{}' is not a supported file format.".format(format))
+        return text.model.Corpus(documents), metadata_
+    return text.model.Corpus(documents)
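Assuming Metadata behaves like the pandas DataFrame it is filled as (via `append(..., ignore_index=True)` above), the reworked flow can be sketched like this; the corpus directory is hypothetical:

```python
import cophi

# `filepath_pattern="*.txt"` is the new default introduced here.
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
                                filepath_pattern="*.txt",
                                encoding="utf-8",
                                metadata=True)

# Each document now carries a UUID title; the original filenames are
# preserved in the Metadata object built above:
print(metadata.head())  # columns: uuid, filepath, parent, title, suffix
```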
10 changes: 10 additions & 0 deletions src/cophi/dkpro/__init__.py
@@ -0,0 +1,10 @@
"""
cophi.dkpro
~~~~~~~~~~~
This module allows you to communicate with the DARIAH-DKPro-Wrapper,
a Java command-line tool, from within Python.
"""

from cophi.dkpro.api import process, pipe
from cophi.dkpro import utils, model, core
51 changes: 51 additions & 0 deletions src/cophi/dkpro/api.py
@@ -0,0 +1,51 @@
"""
cophi.dkpro.api
~~~~~~~~~~~~~~~
This module implements the high-level API for the DARIAH-DKPro-Wrapper.
"""

from pathlib import Path
import tempfile

from cophi import dkpro, text


def process(path, jar, language, reader, xms="4g"):
    """Process a text file with the DARIAH-DKPro-Wrapper.
    Parameters:
        path (str): Path to the text file.
        jar (str): Path to the JAR file.
        language (str): Language of the text.
        reader (str): File reader, either `text` or `xml`.
        xms (str): Initial memory allocation pool for the Java Virtual Machine.
    Yields:
        A Document object for each CSV file produced by the wrapper.
    """
    output = Path(tempfile.gettempdir(), "dariah-dkpro-output")
    if not output.exists():
        output.mkdir()

    d = dkpro.model.DKPro(jar=jar,
                          xms=xms)

    d.process(input=path,
              output=output,
              language=language,
              reader=reader)

    for file in output.glob("*.csv"):
        yield dkpro.model.Document(file)


def pipe(filepath, jar, language, lemma, pos, **kwargs):
    """Pipe a file through the DARIAH-DKPro-Wrapper.
    Parameters:
        filepath (str): Path to the text file.
        jar (str): Path to the JAR file.
        language (str): Language of the text.
        lemma (bool): If True, lemmatize text.
        pos (list): If not None, filter POS tags.
    """
    for doc in process(filepath, jar, language, "text"):
        # Grab the document's name before `doc` is re-bound to a table:
        title = doc.name
        if pos:
            doc = doc.filter(pos)
        else:
            doc = doc.raw
        content = " ".join(doc["Lemma" if lemma else "Token"])
        return text.model.Document(content, title=title, **kwargs)
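The same machinery can also be driven below the high-level API. A hypothetical invocation, assuming the JAR and the input file exist:

```python
from cophi import dkpro

# process() yields one dkpro.model.Document per CSV file written to the
# temporary output directory:
for doc in dkpro.process("corpus/novel.txt", "ddw-0.4.6.jar", "de", "text"):
    print(doc.name)

# pipe() collapses the first processed document into a text.model.Document,
# joining lemmas instead of tokens:
document = dkpro.pipe("corpus/novel.txt", "ddw-0.4.6.jar", "de",
                      lemma=True, pos=None)
```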
36 changes: 36 additions & 0 deletions src/cophi/dkpro/core.py
@@ -0,0 +1,36 @@
"""
cophi.dkpro.core
~~~~~~~~~~~~~~~~
This module implements the core functions of the DKPro module.
"""

import csv
from pathlib import Path

import pandas as pd

from cophi import dkpro


def call(jar, xms="4g", **parameters):
    """Call the DARIAH-DKPro-Wrapper.
    Parameters:
        jar (str): Path to the JAR file.
        xms (str): Initial memory allocation pool for the Java Virtual Machine.
        **parameters: Additional parameters for the DARIAH-DKPro-Wrapper.
    """
    # Basic subprocess command:
    args = ["java", "-Xms{}".format(xms), "-jar", jar]

    # Append additional parameters:
    for parameter, value in parameters.items():
        # Support synonyms for the `-input` parameter:
        if parameter in {"filepath", "directory", "path", "corpus"}:
            args.append("-input")
        else:
            args.append("-{}".format(parameter))
        if value:
            args.append(str(value))
    return dkpro.utils.call(args)
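To make the parameter handling concrete: for a hypothetical `call(jar="ddw-0.4.6.jar", path="corpus/", language="de")`, the assembled argument list looks as follows (an inlined sketch with illustrative values, not part of the module):

```python
# Inline reconstruction of the loop above.
xms = "4g"
jar = "ddw-0.4.6.jar"
parameters = {"path": "corpus/", "language": "de"}

args = ["java", "-Xms{}".format(xms), "-jar", jar]
for parameter, value in parameters.items():
    # `path` is one of the synonyms that map to `-input`:
    if parameter in {"filepath", "directory", "path", "corpus"}:
        args.append("-input")
    else:
        args.append("-{}".format(parameter))
    if value:
        args.append(str(value))

print(args)
# ['java', '-Xms4g', '-jar', 'ddw-0.4.6.jar', '-input', 'corpus/', '-language', 'de']
```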
