Skip to content

Commit

Permalink
Allow to call Corpus with FeatureGenerator args.
Browse files Browse the repository at this point in the history
This allows for a quick call like

    c = Corpus('data', lower_case=True, glob='*')
  • Loading branch information
thvitt committed Jun 13, 2021
1 parent 4f21ac5 commit e9da8ce
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
18 changes: 15 additions & 3 deletions delta/corpus.py
Expand Up @@ -10,6 +10,7 @@ class which represents the feature matrix. Also contained are default
import os
import glob
from fnmatch import fnmatch
from inspect import signature
from typing import Type

import regex as re
Expand Down Expand Up @@ -254,7 +255,7 @@ class Corpus(pd.DataFrame):
_metadata = ['metadata']

def __init__(self, subdir=None, file=None, corpus=None,
feature_generator=FeatureGenerator(),
feature_generator=None,
document_describer=DefaultDocumentDescriber(),
metadata=None, **kwargs):
"""
Expand All @@ -264,9 +265,11 @@ def __init__(self, subdir=None, file=None, corpus=None,
subdir (str): Path to a subdirectory containing the (unprocessed) corpus data.
file (str): Path to a CSV file containing the feature vectors.
corpus (pandas.DataFrame): A dataframe or :class:`Corpus` from which to create a new corpus, as a copy.
feature_generator (FeatureGenerator): A customizeable helper class that will process a `subdir` to a feature matrix, if the `subdir` argument is also given.
feature_generator (FeatureGenerator): A customizable helper class that will process a `subdir` into a feature matrix, if the `subdir` argument is also given. If None, a default feature generator will be used.
metadata (dict): A dictionary with metadata to copy into the new corpus.
**kwargs: Additional keyword arguments will be set in the metadata record of the new corpus.
**kwargs: If feature_generator is None and subdir is not None, you can pass FeatureGenerator
arguments here, and they will be used when instantiating the default feature generator.
Any remaining keyword arguments will be set in the metadata record of the new corpus.
"""
logger = logging.getLogger(__name__)

Expand All @@ -290,6 +293,15 @@ def __init__(self, subdir=None, file=None, corpus=None,

# initialize data
if subdir is not None:
if feature_generator is None:
fg_sig_arguments = signature(FeatureGenerator).parameters
fg_actual_args = {}
for key, value in kwargs.copy().items():
if key in fg_sig_arguments:
fg_actual_args[key] = value
del kwargs[key]
feature_generator = FeatureGenerator(**fg_actual_args)

logger.info(
"Creating corpus by reading %s using %s",
subdir,
Expand Down
5 changes: 5 additions & 0 deletions test/corpus_test.py
Expand Up @@ -69,3 +69,8 @@ def test_table_describer(testdir):
corpus = d.Corpus(testdir,
document_describer=d.util.TableDocumentDescriber(testdir + '.csv', 'Author', 'Title'))
assert corpus.document_describer.group_name(corpus.index[-1]) in {'Raabe', 'Marlitt', 'Fontane'}


def test_featuredescriber_args(testdir):
    """Check that FeatureGenerator keyword arguments can be passed straight to Corpus."""
    lowercased = d.Corpus(testdir, lower_case=True)
    # With lower_case=True the capitalised token 'Sie' must not show up as a feature column.
    assert 'Sie' not in lowercased.columns

0 comments on commit e9da8ce

Please sign in to comment.