diff --git a/delta/corpus.py b/delta/corpus.py index 5f1dcce..5bc00fb 100644 --- a/delta/corpus.py +++ b/delta/corpus.py @@ -10,6 +10,7 @@ class which represents the feature matrix. Also contained are default import os import glob from fnmatch import fnmatch +from inspect import signature from typing import Type import regex as re @@ -254,7 +255,7 @@ class Corpus(pd.DataFrame): _metadata = ['metadata'] def __init__(self, subdir=None, file=None, corpus=None, - feature_generator=FeatureGenerator(), + feature_generator=None, document_describer=DefaultDocumentDescriber(), metadata=None, **kwargs): """ @@ -264,9 +265,11 @@ def __init__(self, subdir=None, file=None, corpus=None, subdir (str): Path to a subdirectory containing the (unprocessed) corpus data. file (str): Path to a CSV file containing the feature vectors. corpus (pandas.DataFrame): A dataframe or :class:`Corpus` from which to create a new corpus, as a copy. - feature_generator (FeatureGenerator): A customizeable helper class that will process a `subdir` to a feature matrix, if the `subdir` argument is also given. + feature_generator (FeatureGenerator): A customizable helper class that will process a `subdir` to a feature matrix, if the `subdir` argument is also given. If None, a default feature generator will be used. metadata (dict): A dictionary with metadata to copy into the new corpus. - **kwargs: Additional keyword arguments will be set in the metadata record of the new corpus. + **kwargs: If feature_generator is None and subdir is not None, you can pass FeatureGenerator + arguments and they will be used when instantiating the feature generator. + Any remaining keyword arguments will be set in the metadata record of the new corpus. 
""" logger = logging.getLogger(__name__) @@ -290,6 +293,15 @@ def __init__(self, subdir=None, file=None, corpus=None, # initialize data if subdir is not None: + if feature_generator is None: + fg_sig_arguments = signature(FeatureGenerator).parameters + fg_actual_args = {} + for key, value in kwargs.copy().items(): + if key in fg_sig_arguments: + fg_actual_args[key] = value + del kwargs[key] + feature_generator = FeatureGenerator(**fg_actual_args) + logger.info( "Creating corpus by reading %s using %s", subdir, diff --git a/test/corpus_test.py b/test/corpus_test.py index d28ec2b..50a6e6c 100644 --- a/test/corpus_test.py +++ b/test/corpus_test.py @@ -69,3 +69,8 @@ def test_table_describer(testdir): corpus = d.Corpus(testdir, document_describer=d.util.TableDocumentDescriber(testdir + '.csv', 'Author', 'Title')) assert corpus.document_describer.group_name(corpus.index[-1]) in {'Raabe', 'Marlitt', 'Fontane'} + + +def test_featuredescriber_args(testdir): + corpus = d.Corpus(testdir, lower_case=True) + assert 'Sie' not in corpus.columns \ No newline at end of file