Skip to content

Commit

Permalink
fix(corpus): register corpus attributes.
Browse files Browse the repository at this point in the history
When a Corpus contained a token identical to one of its attribute
names (e.g., 'logger'), pandas stored the value assigned to that attribute
in all rows of the matching column of the underlying DataFrame.
Registering the attribute names in `_metadata` prevents this.
  • Loading branch information
thvitt committed Oct 1, 2021
1 parent 3a23d14 commit 35b0614
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
2 changes: 1 addition & 1 deletion delta/corpus.py
Expand Up @@ -311,7 +311,7 @@ def __init__(self, operation):


class Corpus(pd.DataFrame):
_metadata = ['metadata']
_metadata = ['metadata', 'logger', 'document_describer', 'feature_generator']

def __init__(self, source=None, *, subdir=None, file=None, corpus=None,
feature_generator=None,
Expand Down
13 changes: 12 additions & 1 deletion test/corpus_test.py
@@ -1,3 +1,5 @@
import numpy as np
import pandas as pd
from pytest import approx

import delta as d
Expand Down Expand Up @@ -78,4 +80,13 @@ def test_featuredescriber_args(testdir):

# Build a Corpus from testdir with parallel=True and require that every cell
# equals the corresponding cell of the `corpus` argument.
# NOTE(review): `corpus` is presumably a fixture built from the same testdir
# without parallel loading — confirm against the fixture definition.
def test_parallel_corpus(testdir, corpus):
parallel_corpus = d.Corpus(testdir, parallel=True)
# The element-wise comparison is reduced over both axes to a single bool.
# NOTE(review): the assertion appears twice in this view — likely the
# removed/added pair of the diff rather than an intentional repeat; confirm.
assert (parallel_corpus == corpus).all().all()
assert (parallel_corpus == corpus).all().all()


# Regression test: a DataFrame column whose name matches a name used on
# Corpus objects must keep its own data after being wrapped in d.Corpus.
# Each parametrized name is used as the second column label in turn.
# NOTE(review): 'save' is presumably a Corpus method rather than a registered
# attribute — confirm against delta/corpus.py.
@pytest.mark.parametrize("attr_name", ["logger", "metadata", "feature_generator", "document_describer", "save"])
def test_attribute_names(attr_name):
# Two documents, two columns; the second column carries the values 4 and 42.
df = pd.DataFrame([[17, 4], [23, 42]],
columns=['foo', attr_name],
index=['doc1', 'doc2'])
corpus = d.Corpus(df)
# Item access must return the original column values, not an attribute value.
assert list(corpus[attr_name]) == [4, 42]

0 comments on commit 35b0614

Please sign in to comment.