deeppavlov · dimakarp1996 · Apr 6, 2023 · Apr 6, 2023 · May 3, 2023
diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py
@@ -13,10 +13,13 @@
 # limitations under the License.
 
 
+
 from logging import getLogger
 from pathlib import Path
+from collections import defaultdict
 
 import pandas as pd
+from tqdm import tqdm
 
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.data.dataset_reader import DatasetReader
@@ -31,15 +34,15 @@ class BasicClassificationDatasetReader(DatasetReader):
     Class provides reading dataset in .csv format
     """
 
+    @overrides
     def read(self, data_path: str, url: str = None,
              format: str = "csv", class_sep: str = None,
-             *args, **kwargs) -> dict:
+             label_type: str = "str", *args, **kwargs) -> dict:
         """
         Read dataset from data_path directory.
         Reading files are all data_types + extension
         (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
         data_path will be read)
-
         Args:
             data_path: directory with files
             url: download data files if data_path not exists or empty
@@ -50,11 +53,17 @@ def read(self, data_path: str, url: str = None,
             names (array): list of column names to use
             orient (str): indication of expected JSON string format
             lines (boolean): read the file as a json object per line. Default: ``False``
-
+            label_type(str): expected type of labels. Default: ``"str"``
         Returns:
             dictionary with types from data_types.
             Each field of dictionary is a list of tuples (x_i, y_i)
         """
+        def row_list_process(row, y):
+            if pd.isna(row[y]):
+                return []
+            else:
+                return [label_type(label) for label in str(row[y]).split(class_sep)]
+
         data_types = ["train", "valid", "test"]
 
         train_file = kwargs.get('train', 'train.csv')
@@ -70,6 +79,13 @@ def read(self, data_path: str, url: str = None,
         data = {"train": [],
                 "valid": [],
                 "test": []}
+
+        supported_label_types = ['int','str','float']
+        error_msg = f'Wrong label type {label_type} given! Needs to be one of the built-in Python types'
+        if label_type not in supported_label_types:
+            raise Exception(error_msg)
+        label_type = eval(label_type)
+        data=defaultdict(list)
         for data_type in data_types:
             file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
             if file_name is None:
@@ -80,6 +96,7 @@ def read(self, data_path: str, url: str = None,
                 if format == 'csv':
                     keys = ('sep', 'header', 'names')
                     options = {k: kwargs[k] for k in keys if k in kwargs}
+                    print(file)
                     df = pd.read_csv(file, **options)
                 elif format == 'json':
                     keys = ('orient', 'lines')
@@ -90,22 +107,21 @@ def read(self, data_path: str, url: str = None,
 
                 x = kwargs.get("x", "text")
                 y = kwargs.get('y', 'labels')
-                if isinstance(x, list):
-                    if class_sep is None:
-                        # each sample is a tuple ("text", "label")
-                        data[data_type] = [([row[x_] for x_ in x], str(row[y]))
-                                           for _, row in df.iterrows()]
-                    else:
-                        # each sample is a tuple ("text", ["label", "label", ...])
-                        data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep))
-                                           for _, row in df.iterrows()]
-                else:
-                    if class_sep is None:
-                        # each sample is a tuple ("text", "label")
-                        data[data_type] = [(row[x], str(row[y])) for _, row in df.iterrows()]
-                    else:
-                        # each sample is a tuple ("text", ["label", "label", ...])
-                        data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()]
+
+                for _, row in tqdm(df.iterrows()):
+                    try:
+                        if isinstance(x, list):
+                            x_text = [row[x_] for x_ in x]
+                        else:
+                            x_text = row[x]
+                        if class_sep is None:
+                            y_label = label_type(row[y])
+                        else:
+                            y_label = row_list_process(row, y)
+                        data[data_type].append((x_text, y_label))
+                    except Exception as e:
+                        print(f'Error processing {row}: {e}')
+                        raise e
             else:
                 log.warning("Cannot find {} file".format(file))