
Commit

release: v1.3.0
severinsimmler committed Apr 25, 2019
1 parent 40cd9fd commit 7d9289b
Showing 17 changed files with 380 additions and 144 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -21,3 +21,6 @@ _build
htmlcov
tests.xml
.pytest_cache
+notebooks/ddw-0.4.6.jar
+notebooks/ddw.log
+notebooks/configs/*
20 changes: 8 additions & 12 deletions README.md
@@ -1,15 +1,17 @@
# A library for preprocessing
-`cophi` is a Python library for handling, modeling and processing text corpora. You
-can easily pipe a collection of text files using the high-level API:
+`cophi` is a Python library for handling, modeling and processing text corpora. You can easily pipe a collection of text files using the high-level API:

-```python
+```
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
-                                filepath_pattern="*.txt",
+                                pathname_pattern="**/*.txt",
                                encoding="utf-8",
                                lowercase=True,
                                token_pattern=r"\p{L}+\p{P}?\p{L}+")
```

+You can also plug the [DARIAH-DKPro-Wrapper](https://dariah-de.github.io/DARIAH-DKPro-Wrapper/) into this pipeline to lemmatize text, or just keep certain word types. Check out the introductory [Jupyter notebook](https://github.com/cophi-wue/cophi-toolbox/blob/master/notebooks/introducing-cophi.ipynb).


## Getting started
To install the latest **stable** version:
```
@@ -21,16 +23,10 @@ To install the latest **development** version:
$ pip install --upgrade git+https://github.com/cophi-wue/cophi-toolbox.git@testing
```

-Check out the introductory [Jupyter notebook](https://github.com/cophi-wue/cophi-toolbox/blob/master/notebooks/API.ipynb).
-
-## Contents
-- [`api`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/api.py): High-level API.
-- [`model`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/model.py): Low-level model classes.
-- [`complexity`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/complexity.py): Measures that assess the linguistic and stylistic complexity of (literary) texts.
-- [`utils`](https://github.com/cophi-wue/cophi-toolbox/blob/master/src/cophi_toolbox/utils.py): Low-level helper functions.
+## Available complexity measures
+There are also plenty of complexity metrics for measuring the lexical richness of (literary) texts.


-## Available complexity measures
Measures that use sample size and vocabulary size:
* Type-Token Ratio TTR
* Guiraud’s R
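The options surfaced by this release combine with the existing pipeline as follows; a minimal sketch, assuming the wrapper JAR has been downloaded into the working directory, and using an illustrative POS tag list and language code:

```python
import cophi

# `lemma`, `pos`, `jar` and `language` are the parameters introduced in
# this release; the tag list and corpus directory are assumptions.
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
                                filepath_pattern="*.txt",
                                lemma=True,
                                pos=["NN", "NE"],
                                jar="ddw-0.4.6.jar",
                                language="en")
```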
File renamed without changes.
7 changes: 3 additions & 4 deletions setup.py
@@ -72,7 +72,7 @@ def run(self):
        self.status("Pushing git tags ...")
        os.system("git tag v{0}".format(about["__version__"]))
        os.system("git push --tags")

        sys.exit()

setup(
@@ -91,14 +91,13 @@ def run(self):
    include_package_data=True,
    license="Apache 2.0",
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: Implementation :: CPython",
-        "Programming Language :: Python :: Implementation :: PyPy"
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: Implementation :: CPython"
    ],
    cmdclass={
        "upload": UploadCommand,
25 changes: 12 additions & 13 deletions src/cophi/__init__.py
@@ -1,19 +1,17 @@
r"""
**cophi** is a Python library for handling, modeling and processing text
"""
This is a Python library for handling, modeling and processing text
corpora. You can easily pipe a collection of text files using the
high-level API:
.. code-block:: python
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
pathname_pattern="**/*.txt",
encoding="utf-8",
lowercase=True,
n=1,
token_pattern=r"\p{L}+\p{P}?\p{L}+")
```
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
pathname_pattern="**/*.txt",
encoding="utf-8",
lowercase=True,
token_pattern=r"\p{L}+\p{P}?\p{L}+")
```
There are also a plenty of complexity metrics for measuring lexical
There are also a plenty of complexity metrics for measuring the lexical
richness of (literary) texts.
Measures that use sample size and vocabulary size:
@@ -46,4 +44,5 @@
:module:`complexity` module.
"""

-from cophi.api import document, corpus, export
+from cophi.api import document, corpus
2 changes: 1 addition & 1 deletion src/cophi/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (1, 2, 3)
+VERSION = (1, 3, 0)

__version__ = ".".join(map(str, VERSION))
105 changes: 57 additions & 48 deletions src/cophi/api.py
@@ -6,21 +6,28 @@
"""

import logging
-import pathlib
+from pathlib import Path
import uuid

import pandas as pd
-import cophi.model
+
+from cophi import dkpro, text


logger = logging.getLogger(__name__)


-def document(filepath, **kwargs):
+def document(filepath, lemma=False, pos=None, jar="ddw-0.4.6.jar",
+             language="de", **kwargs):
    """Read a text file and create a Document object.
    Parameters:
        filepath (str): Path to the text file.
-        title (str): Describing title for the document. (optional).
+        lemma (bool): If True, lemmatize text (optional).
+        pos (list): If not None, filter POS tags (optional).
+        jar (str): Path to the DARIAH-DKPro-Wrapper JAR file (optional).
+        language (str): Language of the text (optional).
+        title (str): Descriptive title for the document (optional).
        lowercase (bool): If True, converts all letters to lowercase (optional).
        n (int): Number of tokens per ngram (optional).
        token_pattern (str): Regex pattern for one token (optional).
@@ -29,13 +36,17 @@ def document(filepath, **kwargs):
    Returns:
        A Document object.
    """
-    textfile = cophi.model.Textfile(filepath)
-    return cophi.model.Document(textfile.content, **kwargs)
+    if lemma or pos:
+        return dkpro.pipe(filepath, jar, language, lemma, pos, **kwargs)
+    else:
+        textfile = text.model.Textfile(filepath)
+        return text.model.Document(textfile.content, **kwargs)
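As an illustration of the two branches above (the file path is hypothetical; the DKPro branch assumes the default ddw-0.4.6.jar sits in the working directory):

```python
import cophi

# Plain branch: the text file is read directly, no DKPro involved.
plain = cophi.document("corpus/novel.txt", lowercase=True)

# DKPro branch: triggered as soon as `lemma` or `pos` is set.
lemmatized = cophi.document("corpus/novel.txt", lemma=True, language="de")
```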


-def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
+def corpus(directory, filepath_pattern="*.txt", treat_as=None, encoding="utf-8",
           lowercase=True, n=None, token_pattern=r"\p{L}+\p{P}?\p{L}+",
-           maximum=None, metadata=True):
+           maximum=None, metadata=True, lemma=False, pos=None,
+           jar="ddw-0.4.6.jar", language="de"):
"""Pipe a collection of text files and create a Corpus object.
Parameters:
@@ -48,58 +59,56 @@ def corpus(directory, filepath_pattern="*", treat_as=None, encoding="utf-8",
        token_pattern (str): Regex pattern for one token (optional).
        maximum (int): Stop tokenizing after that many tokens (optional).
        metadata (bool): Extract metadata from filenames (optional).
+        lemma (bool): If True, lemmatize text (optional).
+        pos (list): If not None, filter POS tags (optional).
+        jar (str): Path to the DARIAH-DKPro-Wrapper JAR file (optional).
+        language (str): Language of the text (optional).
    Returns:
        A Corpus model object and optionally a Metadata object.
    """
-    if not isinstance(directory, pathlib.Path):
-        directory = pathlib.Path(directory)
-    filepaths = directory.rglob(filepath_pattern)
+    filepaths = Path(directory).rglob(filepath_pattern)

-    def lazy_reading(filepaths):
+    def lazy_processing(filepaths, **kwargs):
        for filepath in filepaths:
-            if filepath.is_file() and ".git" not in str(filepath):
-                yield cophi.model.Textfile(filepath, treat_as, encoding)
+            logger.info(f"Processing '{filepath.stem}' ...")
+            if filepath.is_file():
+                if lemma or pos:
+                    document = dkpro.pipe(filepath,
+                                          jar,
+                                          language,
+                                          lemma,
+                                          pos,
+                                          **kwargs)
+                else:
+                    textfile = text.model.Textfile(filepath, treat_as, encoding)
+                    document = text.model.Document(textfile.content,
+                                                   textfile.title,
+                                                   **kwargs)
+                yield filepath, document

    if metadata:
-        metadata_ = cophi.model.Metadata()
+        metadata_ = text.model.Metadata()

    documents = pd.Series()
-    for textfile in lazy_reading(filepaths):
-        logger.info("Processing '{}' ...".format(textfile.title))
-        title = str(uuid.uuid1()) if metadata else textfile.title
-        text = textfile.content
-        document = cophi.model.Document(text,
-                                        title,
-                                        token_pattern,
-                                        lowercase,
-                                        n,
-                                        maximum)
-        documents[title] = document
+    for filepath, document in lazy_processing(filepaths,
+                                              token_pattern=token_pattern,
+                                              lowercase=lowercase,
+                                              n=n,
+                                              maximum=maximum):
+        title = document.title
        if metadata:
+            title = str(uuid.uuid1())
+            document.title = title
            metadata_ = metadata_.append({"uuid": title,
-                                          "filepath": textfile.filepath,
-                                          "parent": textfile.parent,
-                                          "title": textfile.title,
-                                          "suffix": textfile.filepath.suffix},
+                                          "filepath": str(filepath),
+                                          "parent": filepath.parent,
+                                          "title": filepath.stem,
+                                          "suffix": filepath.suffix},
                                         ignore_index=True)
+        documents[title] = document
    logger.info("Constructing Corpus object ...")
    if metadata:
-        return cophi.model.Corpus(documents), metadata_
-    else:
-        return cophi.model.Corpus(documents)
-
-
-def export(dtm, filepath, format="text"):
-    """Export a document-term matrix.
-    Parameters:
-        dtm: A document-term matrix.
-        filepath: Path to the output file.
-        format: File format. Possible values are `plaintext`/`text`
-            or `svmlight`.
-    """
-    if format.lower() in {"plaintext", "text"}:
-        cophi.model.Corpus.plaintext(dtm, filepath)
-    elif format.lower() in {"svmlight"}:
-        cophi.model.Corpus.svmlight(dtm, filepath)
-    else:
-        raise ValueError("'{}' is not a supported file format.".format(format))
+        return text.model.Corpus(documents), metadata_
+    return text.model.Corpus(documents)
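Assuming Metadata behaves like the pandas DataFrame it is filled as (via `append(..., ignore_index=True)` above), the reworked flow can be sketched like this; the corpus directory is hypothetical:

```python
import cophi

# `filepath_pattern="*.txt"` is the new default introduced here.
corpus, metadata = cophi.corpus(directory="british-fiction-corpus",
                                filepath_pattern="*.txt",
                                encoding="utf-8",
                                metadata=True)

# Each document now carries a UUID title; the original filenames are
# preserved in the Metadata object built above:
print(metadata.head())  # columns: uuid, filepath, parent, title, suffix
```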
10 changes: 10 additions & 0 deletions src/cophi/dkpro/__init__.py
@@ -0,0 +1,10 @@
"""
cophi.dkpro
~~~~~~~~~~~
This module allows you to communicate with the DARIAH-DKPro-Wrapper,
a Java command-line tool, from within Python.
"""

from cophi.dkpro.api import process, pipe
from cophi.dkpro import utils, model, core
51 changes: 51 additions & 0 deletions src/cophi/dkpro/api.py
@@ -0,0 +1,51 @@
"""
cophi.dkpro.api
~~~~~~~~~~~~~~~
This module implements the high-level API for the DARIAH-DKPro-Wrapper.
"""

from pathlib import Path
import tempfile

from cophi import dkpro, text


def process(path, jar, language, reader, xms="4g"):
    """Process a text file with the DARIAH-DKPro-Wrapper.
    Parameters:
        path (str): Path to the text file.
        jar (str): Path to the JAR file.
        language (str): Language of the text.
        reader (str): File reader, either `text` or `xml`.
        xms (str): Initial memory allocation pool for the Java Virtual Machine.
    Yields:
        A Document object for each CSV file produced by the wrapper.
    """
    output = Path(tempfile.gettempdir(), "dariah-dkpro-output")
    if not output.exists():
        output.mkdir()

    d = dkpro.model.DKPro(jar=jar,
                          xms=xms)

    d.process(input=path,
              output=output,
              language=language,
              reader=reader)

    for file in output.glob("*.csv"):
        yield dkpro.model.Document(file)


def pipe(filepath, jar, language, lemma, pos, **kwargs):
    """Pipe a file through the DARIAH-DKPro-Wrapper.
    Parameters:
        filepath (str): Path to the text file.
        jar (str): Path to the JAR file.
        language (str): Language of the text.
        lemma (bool): If True, lemmatize text.
        pos (list): If not None, filter POS tags.
    """
    for doc in process(filepath, jar, language, "text"):
        # Grab the document's name before `doc` is re-bound to a table:
        title = doc.name
        if pos:
            doc = doc.filter(pos)
        else:
            doc = doc.raw
        content = " ".join(doc["Lemma" if lemma else "Token"])
        return text.model.Document(content, title=title, **kwargs)
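The same machinery can also be driven below the high-level API. A hypothetical invocation, assuming the JAR and the input file exist:

```python
from cophi import dkpro

# process() yields one dkpro.model.Document per CSV file written to the
# temporary output directory:
for doc in dkpro.process("corpus/novel.txt", "ddw-0.4.6.jar", "de", "text"):
    print(doc.name)

# pipe() collapses the first processed document into a text.model.Document,
# joining lemmas instead of tokens:
document = dkpro.pipe("corpus/novel.txt", "ddw-0.4.6.jar", "de",
                      lemma=True, pos=None)
```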
36 changes: 36 additions & 0 deletions src/cophi/dkpro/core.py
@@ -0,0 +1,36 @@
"""
cophi.dkpro.core
~~~~~~~~~~~~~~~~
This module implements the core functions of the DKPro module.
"""

import csv
from pathlib import Path

import pandas as pd

from cophi import dkpro


def call(jar, xms="4g", **parameters):
    """Call the DARIAH-DKPro-Wrapper.
    Parameters:
        jar (str): Path to the JAR file.
        xms (str): Initial memory allocation pool for the Java Virtual Machine.
        **parameters: Additional parameters for the DARIAH-DKPro-Wrapper.
    """
    # Basic subprocess command:
    args = ["java", "-Xms{}".format(xms), "-jar", jar]

    # Append additional parameters:
    for parameter, value in parameters.items():
        # Support synonyms for the `-input` parameter:
        if parameter in {"filepath", "directory", "path", "corpus"}:
            args.append("-input")
        else:
            args.append("-{}".format(parameter))
        if value:
            args.append(str(value))
    return dkpro.utils.call(args)
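To make the parameter handling concrete: for a hypothetical `call(jar="ddw-0.4.6.jar", path="corpus/", language="de")`, the assembled argument list looks as follows (an inlined sketch with illustrative values, not part of the module):

```python
# Inline reconstruction of the loop above.
xms = "4g"
jar = "ddw-0.4.6.jar"
parameters = {"path": "corpus/", "language": "de"}

args = ["java", "-Xms{}".format(xms), "-jar", jar]
for parameter, value in parameters.items():
    # `path` is one of the synonyms that map to `-input`:
    if parameter in {"filepath", "directory", "path", "corpus"}:
        args.append("-input")
    else:
        args.append("-{}".format(parameter))
    if value:
        args.append(str(value))

print(args)
# ['java', '-Xms4g', '-jar', 'ddw-0.4.6.jar', '-input', 'corpus/', '-language', 'de']
```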
