Experimental support for sparse dataframes

cophi-wue · Jul 1, 2021 · 07a7612 · 07a7612
1 parent 3eed8fc
commit 07a7612
Showing 1 changed file with 11 additions and 3 deletions.
diff --git a/delta/corpus.py b/delta/corpus.py
@@ -29,7 +29,7 @@ class which represents the feature matrix. Also contained are default
 LETTERS_PATTERN = re.compile(r'\p{L}+')
 WORD_PATTERN = re.compile(r"\b(\p{L}[\p{L}'’]*?|[\p{L}'’]*?\p{L})\b", re.WORD)
 
-class FeatureGenerator(object):
+class FeatureGenerator:
 
     """
     A **feature generator** is responsible for converting a subdirectory of
@@ -67,7 +67,8 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
                  max_tokens=None,
                  ngrams=None,
                  parallel=False,
-                 sort='documents'):
+                 sort='documents',
+                 sparse=False):
         """
         Creates a customized default feature generator.
 
@@ -94,6 +95,7 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
                 - ``features``, ``columns``: sort by feature labels (ie words)
                 - ``both``: sort along both axes
                 - None or the empty string: Do not sort
+            sparse (bool): if True, build a sparse dataframe (experimental). Requires pandas >= 1.0
         """
         self.lower_case = lower_case
         self.encoding = encoding
@@ -105,6 +107,7 @@ def __init__(self, lower_case=False, encoding="utf-8", glob='*.txt',
         self.logger = logging.getLogger(__name__)
         self.parallel = parallel
         self.sort = sort
+        self.sparse = sparse
 
     def __repr__(self):
         return type(self).__name__ + '(' + \
@@ -257,7 +260,12 @@ def __call__(self, directory):
         given directory and returns a simple pd.DataFrame for that. The resulting
         dataframe will be sorted according to the `sort` attribute.
         """
-        df = pd.DataFrame(self.process_directory(directory)).T
+        data = self.process_directory(directory)
+        if self.sparse:
+            dtype = pd.SparseDtype(pd.Int64Dtype, pd.NA)
+        else:
+            dtype = pd.Int64Dtype
+        df = pd.DataFrame.from_dict(data, orient='index', dtype=dtype)
         if self.sort:
             if self.sort.lower() in {'documents', 'index', 'both'}:
                 df = df.sort_index(axis=0)