Skip to content

Commit

Permalink
Experimental support for sparse dataframes
Browse files: browse the repository at this point in the history
  • Loading branch information
thvitt committed Jul 1, 2021
1 parent 3eed8fc commit 07a7612
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions delta/corpus.py
Expand Up @@ -29,7 +29,7 @@ class which represents the feature matrix. Also contained are default
LETTERS_PATTERN = re.compile(r'\p{L}+')
WORD_PATTERN = re.compile(r"\b(\p{L}[\p{L}'’]*?|[\p{L}'’]*?\p{L})\b", re.WORD)

class FeatureGenerator(object):
class FeatureGenerator:

"""
A **feature generator** is responsible for converting a subdirectory of
Expand Down Expand Up @@ -67,7 +67,8 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
max_tokens=None,
ngrams=None,
parallel=False,
sort='documents'):
sort='documents',
sparse=False):
"""
Creates a customized default feature generator.
Expand All @@ -94,6 +95,7 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
- ``features``, ``columns``: sort by feature labels (ie words)
- ``both``: sort along both axes
- None or the empty string: Do not sort
sparse (bool): build a sparse dataframe. Requires Pandas >=1.0
"""
self.lower_case = lower_case
self.encoding = encoding
Expand All @@ -105,6 +107,7 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
self.logger = logging.getLogger(__name__)
self.parallel = parallel
self.sort = sort
self.sparse = sparse

def __repr__(self):
return type(self).__name__ + '(' + \
Expand Down Expand Up @@ -257,7 +260,12 @@ def __call__(self, directory):
given directory and returns a simple pd.DataFrame for that. The resulting
dataframe will be sorted according to the `sort` attribute.
"""
df = pd.DataFrame(self.process_directory(directory)).T
data = self.process_directory(directory)
if self.sparse:
dtype = pd.SparseDtype(pd.Int64Dtype, pd.NA)
else:
dtype = pd.Int64Dtype
df = pd.DataFrame.from_dict(data, orient='index', dtype=dtype)
if self.sort:
if self.sort.lower() in {'documents', 'index', 'both'}:
df = df.sort_index(axis=0)
Expand Down

0 comments on commit 07a7612

Please sign in to comment.