
Refactor the sklearn kit and add a function for loading data sets
jfischer committed Jun 17, 2019
1 parent 295d142 commit e77098a
Showing 6 changed files with 184 additions and 48 deletions.
11 changes: 5 additions & 6 deletions README.rst
@@ -70,15 +70,14 @@ Now, type the following Python code in the first cell::
import numpy as np
from os.path import join
from sklearn.svm import SVC
from dataworkspaces.kits.scikit_learn import train_and_predict_with_cv
from dataworkspaces.kits.scikit_learn import load_dataset_from_resource,\
train_and_predict_with_cv
DATA_DIR='../sklearn-digits-dataset'
RESULTS_DIR='../results'
data = np.loadtxt(join(DATA_DIR, 'data.csv'), delimiter=',')
target = np.loadtxt(join(DATA_DIR, 'target.csv'), delimiter=',')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, data, target,
DATA_DIR, RESULTS_DIR, random_state=42)
dataset = load_dataset_from_resource('sklearn-digits-dataset')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, dataset,
RESULTS_DIR, random_state=42)
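
For reference, ``load_dataset_from_resource`` returns an sklearn-style ``Bunch``
with ``data``, ``target``, and ``resource`` members. A purely illustrative way to
inspect it in a separate cell (after running the one above)::

print(dataset.data.shape)    # (n_samples, n_features)
print(dataset.target.shape)  # (n_samples,)
print(dataset.resource)      # ResourceRef naming the originating resource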

Now, run the cell. It will take a few seconds to train and test the
model. You should then see::
184 changes: 162 additions & 22 deletions dataworkspaces/kits/scikit_learn.py
@@ -14,13 +14,160 @@
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.externals import joblib
from sklearn.utils import Bunch
import sys
import numpy as np
from os.path import join, abspath, expanduser
import os
from os.path import join, abspath, expanduser, exists
import json
import glob

from dataworkspaces.errors import ConfigurationError
from dataworkspaces.utils.workspace_utils import get_workspace
from dataworkspaces.resources.resource import CurrentResources
from dataworkspaces.lineage import LineageBuilder
from dataworkspaces.utils.lineage_utils import ResourceRef
from .jupyter import is_notebook, get_step_name_for_notebook, get_notebook_directory

def _load_dataset_file(dataset_path, filename):
filepath = join(dataset_path, filename)
if filename.endswith('.txt') or filename.endswith('.rst'):
with open(filepath, 'r') as f:
return f.read()
elif filename.endswith('.csv') or filename.endswith('.csv.gz') or \
filename.endswith('.csv.bz2'):
try:
return np.loadtxt(filepath, delimiter=',')
except ValueError:
# try with pandas
import pandas
df = pandas.read_csv(filepath)
if len(df.values.shape)==2 and df.values.shape[1]==1: # this is just a list
return df.values.reshape(df.values.shape[0])
else:
return df.values
elif filename.endswith('.npy'):
return np.load(filepath)


def load_dataset_from_resource(resource_name:str, subpath:Optional[str]=None,
workspace_dir:Optional[str]=None)\
-> Bunch:
"""
Load a dataset (data and targets) from the specified resource and return an
sklearn-style Bunch (a dictionary-like object). The bunch will include at least
three attributes:
* ``data`` - a NumPy array of shape number_samples * number_features
* ``target`` - a NumPy array of length number_samples
* ``resource`` - a :class:`~ResourceRef` that provides the resource name and
subpath (if any) for the data
Other attributes may also be present, depending on the data set:
* ``DESCR`` - text containing a full description of the data set (for humans)
* ``feature_names`` - an array of length number_features containing the name
of each feature.
* ``target_names`` - an array containing the name of each target class
Data sets may define their own attributes as well (see below).
The ``data`` and ``target`` attributes can be used directly (e.g. passed to
``train_test_split()``), or the entire bunch can be passed as a parameter to
:func:`~train_and_predict_with_cv`.
**Parameters**
resource_name
The name of the resource containing the dataset.
subpath
Optional subpath within the resource where this specific dataset is located.
If not specified, the root of the resource is used.
workspace_dir
The root directory of your workspace in the local file system. Usually,
this can be left unspecified and inferred by DWS, which will search up
from the current working directory.
**Creating a Dataset**
To create a dataset in your resource that is suitable for importing by this function,
you simply need to create a file for each attribute you want in the bunch and place
all these files in the same directory within your resource.
The names of the files should be ``ATTRIBUTE.extn`` where ``ATTRIBUTE`` is the
attribute name (e.g. ``data`` or ``DESCR``) and ``.extn`` is a file extension
indicating the format. Supported file extensions are:
* ``.txt`` or ``.rst`` - text files
* ``.csv`` - csv files. These are read in using ``numpy.loadtxt()``. If this
fails because the csv does not contain all numeric data, pandas is used to read
in the file. It is then converted back to a numpy array.
* ``.csv.gz`` or ``.csv.bz2`` - these are compressed csv files which are treated
the same way as csv files (numpy and pandas will automatically uncompress before parsing).
* ``.npy`` - this is a file containing a serialized NumPy array saved via ``numpy.save()``.
It is loaded using ``numpy.load()``.
"""

workspace_dir = get_workspace(workspace_dir)
resources = CurrentResources.read_current_resources(workspace_dir, batch=True,
verbose=False)
resources.validate_resource_name(resource_name, subpath)
dataset_name = 'Resource ' + resource_name + ' subpath ' + subpath \
if subpath is not None \
else 'Resource ' + resource_name
r = resources.by_name[resource_name]
local_path = r.get_local_path_if_any()
if local_path is None:
# TODO: Support a data access api
raise ConfigurationError("Unable to instantiate a data set for resource '%s': currently not supported for non-local resources"%
resource_name)
dataset_path = join(local_path, subpath) if subpath is not None else local_path
result = {} # this will be the args to the result Bunch
# First load data and target files, which are required
data_file = join(dataset_path, 'data.csv')
if exists(data_file):
pass
elif exists(data_file+'.gz'):
data_file += '.gz'
elif exists(data_file+'.bz2'):
data_file += '.bz2'
else:
raise ConfigurationError("Did not find data file for %s at '%s'"%
(dataset_name, data_file))
result['data'] = np.loadtxt(data_file, delimiter=',')
target_file = join(dataset_path, 'target.csv')
if exists(target_file):
pass
elif exists(target_file+'.gz'):
target_file += '.gz'
elif exists(target_file+'.bz2'):
target_file += '.bz2'
else:
raise ConfigurationError("Did not find target file for %s at '%s'"%
(dataset_name, target_file))
result['target'] = np.loadtxt(target_file, delimiter=',')
if result['data'].shape[0]!=result['target'].shape[0]:
raise ConfigurationError("Data matrix at '%s' has %d rows, but target at '%s' has %d rows"%
(data_file, result['data'].shape[0],
target_file, result['target'].shape[0]))
result['resource'] = ResourceRef(resource_name, subpath)
# check for and load any other attributes
for fname in os.listdir(dataset_path):
if fname.endswith('.txt'):
result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
elif fname.endswith('.rst'):
result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
elif fname.endswith('.csv'):
result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
elif fname.endswith('.csv.gz'):
result[fname[:-7]] = _load_dataset_file(dataset_path, fname)
elif fname.endswith('.csv.bz2'):
result[fname[:-8]] = _load_dataset_file(dataset_path, fname)
elif fname.endswith('.npy'):
result[fname[:-4]] = _load_dataset_file(dataset_path, fname)
return Bunch(**result)
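
# Illustrative sketch (not part of this module): creating a dataset layout that
# load_dataset_from_resource() can read. Per the docstring above, each Bunch
# attribute is stored as its own file named ATTRIBUTE.extn, all in one directory
# within the resource. The directory name 'my-dataset-resource' is hypothetical.
#
#     import numpy as np
#     from sklearn.datasets import load_digits
#     digits = load_digits()
#     np.savetxt('my-dataset-resource/data.csv', digits.data, delimiter=',')
#     np.savetxt('my-dataset-resource/target.csv', digits.target, delimiter=',')
#     with open('my-dataset-resource/DESCR.txt', 'w') as f:
#         f.write(digits.DESCR)
#     # after adding the directory to the workspace as a resource:
#     # dataset = load_dataset_from_resource('my-dataset-resource')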


class Metrics:
"""Metrics and its subclasses are convenience classes
@@ -97,8 +244,8 @@ def print_metrics(self, file=sys.stdout):
def train_and_predict_with_cv(classifier_class:ClassifierMixin,
param_grid:Union[Dict[str,List[Any]],
List[Dict[str,List[Any]]]],
data:np.ndarray, target:np.ndarray,
input_dir:str, results_dir:str,
dataset:Bunch,
results_dir:str,
test_size:float=0.2, folds:int=5,
cv_scoring:str='accuracy',
model_name:Optional[str]=None,
@@ -129,15 +276,9 @@ def train_and_predict_with_cv(classifier_class:ClassifierMixin,
try as values, or a list of such dictionaries. The various combinations
of these parameters will be searched to find the best classification results
on the training data.
data
A 2-d NumPy array where each column is a feature and each row is a
collection of features comprising a sample.
target
A 1-d NumPy array where each value represents the class number of the
corresponding sample row in the data array.
input_dir
The directory on the local filesystem from which the data and target arrays
were read. This is used for lineage.
dataset
An sklearn Bunch object with members for data, target, and resource. This can
be loaded by calling :func:`~load_dataset_from_resource`.
results_dir
The directory on the local filesystem to which the results should be
written.
@@ -169,22 +310,21 @@ def train_and_predict_with_cv(classifier_class:ClassifierMixin,
import numpy as np
from os.path import join
from sklearn.svm import SVC
from dataworkspaces.kits.scikit_learn import train_and_predict_with_cv
from dataworkspaces.kits.scikit_learn import load_dataset_from_resource,\
train_and_predict_with_cv
DATA_DIR='../sklearn-digits-dataset'
RESULTS_DIR='../results'
data = np.loadtxt(join(DATA_DIR, 'data.csv'), delimiter=',')
target = np.loadtxt(join(DATA_DIR, 'target.csv'), delimiter=',')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, data, target,
DATA_DIR, RESULTS_DIR, random_state=42)
dataset = load_dataset_from_resource('sklearn-digits-dataset')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, dataset,
RESULTS_DIR, random_state=42)
This trains a Support Vector Classifier with three different values of gamma
(0.01, 0.001, and 0.0001) and then evaluates the trained classifier on the
holdout data. The results are written to ``../results/results.json``.
"""
X_train, X_test, y_train, y_test = \
train_test_split(data, target, test_size=test_size,
train_test_split(dataset.data, dataset.target, test_size=test_size,
random_state=random_state)
# find the best combination of hyperparameters
search = GridSearchCV(classifier_class(), param_grid=param_grid, scoring=cv_scoring,
@@ -205,7 +345,7 @@ def train_and_predict_with_cv(classifier_class:ClassifierMixin,

lb = LineageBuilder().with_parameters(lineage_params)\
.as_results_step(results_dir, run_description)\
.with_input_path(input_dir)
.with_input_ref(dataset.resource)
lb = lb.with_step_name(get_step_name_for_notebook())\
.with_code_path(get_notebook_directory()) \
if is_notebook() \
@@ -219,14 +359,14 @@ def train_and_predict_with_cv(classifier_class:ClassifierMixin,
# Now predict the value of the digit on the test set
predicted = classifier.predict(X_test)
m = MulticlassClassificationMetrics(y_test, predicted) \
if len(np.unique(target))>2 \
if len(np.unique(dataset.target))>2 \
else BinaryClassificationMetrics(y_test, predicted) # type: Metrics
m.print_metrics()
lineage.write_results(m.to_dict())

if model_name is not None:
classifier = classifier_class(**best_params)
classifier.fit(data, target)
classifier.fit(dataset.data, dataset.target)
model_file = join(abspath(expanduser(results_dir)), model_name+'.pkl')
joblib.dump(classifier, model_file)
print("Wrote trained model to %s"% model_file)
6 changes: 3 additions & 3 deletions dataworkspaces/resources/resource.py
@@ -103,9 +103,9 @@ def validate_subpath_exists(self, subpath):
lp = self.get_local_path_if_any()
if lp is not None:
path = join(lp, subpath)
if not isdir(path):
raise ConfigurationError("Subpath %s does not exist for resource %s"%
(subpath, self.name))
if not exists(path): # use exists() instead of isdir() as subpath could be a file
raise ConfigurationError("Subpath %s does not exist for resource %s, expecting it at '%s'"%
(subpath, self.name, path))

def results_move_current_files(self, rel_dest_root, exclude_files,
exclude_dirs_re):
13 changes: 6 additions & 7 deletions docs/intro.rst
@@ -60,15 +60,14 @@ Now, type the following Python code in the first cell::
import numpy as np
from os.path import join
from sklearn.svm import SVC
from dataworkspaces.kits.scikit_learn import train_and_predict_with_cv
from dataworkspaces.kits.scikit_learn import load_dataset_from_resource,\
train_and_predict_with_cv
DATA_DIR='../sklearn-digits-dataset'
RESULTS_DIR='../results'
data = np.loadtxt(join(DATA_DIR, 'data.csv'), delimiter=',')
target = np.loadtxt(join(DATA_DIR, 'target.csv'), delimiter=',')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, data, target,
DATA_DIR, RESULTS_DIR, random_state=42)

dataset = load_dataset_from_resource('sklearn-digits-dataset')
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, dataset,
RESULTS_DIR, random_state=42)

Now, run the cell. It will take a few seconds to train and test the
model. You should then see::
2 changes: 1 addition & 1 deletion docs/kits.rst
@@ -16,7 +16,7 @@ Scikit-learn

.. automodule:: dataworkspaces.kits.scikit_learn
:no-undoc-members:
:members: train_and_predict_with_cv,Metrics,BinaryClassificationMetrics,MulticlassClassificationMetrics
:members: load_dataset_from_resource,train_and_predict_with_cv,Metrics,BinaryClassificationMetrics,MulticlassClassificationMetrics



16 changes: 7 additions & 9 deletions docs/tutorial.rst
@@ -39,18 +39,16 @@ code into a notebook cell::
import numpy as np
from os.path import join
from sklearn.linear_model import LogisticRegression
from dataworkspaces.kits.scikit_learn import train_and_predict_with_cv
from dataworkspaces.kits.scikit_learn import load_dataset_from_resource,\
train_and_predict_with_cv
DATA_DIR='../sklearn-digits-dataset'
RESULTS_DIR='../results'
data = np.loadtxt(join(DATA_DIR, 'data.csv'), delimiter=',')
target = np.loadtxt(join(DATA_DIR, 'target.csv'), delimiter=',')

dataset = load_dataset_from_resource('sklearn-digits-dataset')
train_and_predict_with_cv(LogisticRegression,
{'C':[1e-3, 1e-2, 1e-1, 1, 1e2], 'solver':['lbfgs'],
'multi_class':['multinomial']},
data, target,
DATA_DIR, RESULTS_DIR, random_state=42)
dataset, RESULTS_DIR, random_state=42)

Note the only differences in our call to ``train_and_predict_with_cv`` are that
we pass a different classifier (``LogisticRegression``) and a ``param_grid``
@@ -99,8 +97,8 @@ a model with the full data set and save it to our results directory.
``model_name`` parameter. Start the ``digits-svc`` notebook and add
``model_name='svc-best'`` to the call as follows::

train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, data, target,
DATA_DIR, RESULTS_DIR, random_state=42,
train_and_predict_with_cv(SVC, {'gamma':[0.01, 0.001, 0.0001]}, dataset,
RESULTS_DIR, random_state=42,
model_name='svc-best')
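
With ``model_name`` set, ``train_and_predict_with_cv`` refits the best classifier
on the full dataset and saves it via ``joblib`` to ``../results/svc-best.pkl``
(``RESULTS_DIR`` plus ``model_name`` with a ``.pkl`` extension). As an illustrative
sketch, not part of this tutorial, the saved model could later be reloaded like so::

from sklearn.externals import joblib
model = joblib.load('../results/svc-best.pkl')
print(model.predict(dataset.data[:10]))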

Now, run the cell. It should print the metrics as before and then the message:
