added JSON serialization and blitzdb back-end
cmry committed Apr 19, 2016
1 parent 0a1e78d commit 4a4db03
Showing 7 changed files with 116 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
shed/utils
omesa/db
omesa/__pycache__
2 changes: 1 addition & 1 deletion examples/n_gram.py
@@ -20,7 +20,7 @@
        "features": [Ngrams(level='char', n_list=[3])],
        "text_column": 1,
        "label_column": 0,
        "save": ("log", "model")
        "save": ("log", "model", "db")
    }
}

13 changes: 12 additions & 1 deletion omesa/database.py
@@ -15,4 +15,15 @@ class Database(object):

    def __init__(self):
        """Load backend."""
        self.db = FileBackend("./db")
        # TODO: I'm sure the path here can be done neater
        self.db = FileBackend(__file__.split('/database.py')[0] + "/db")

    def save(self, doc):
        """Save doc to the database and commit immediately."""
        self.db.save(doc)
        self.db.commit()

    def fetch(self, doc, q):
        """Return the first doc matching query q, if any."""
        try:
            return self.db.filter(doc, q)[0]
        except IndexError:
            print("File does not exist.")
17 changes: 1 addition & 16 deletions omesa/experiment.py
@@ -91,22 +91,7 @@ class Experiment(object):
        classifiers that can be set to output probabilities.
        ---
        "setting": "grid",  # optional
            Currently this can only be set to grid. If it is, the classifier
            of choice will be an SVM, for which the experiment will try to
            optimize the parameter settings through sklearn's GridSearchCV.
            At some point, it might be useful to be able to provide the
            parameters yourself (for other classifiers, for example).
        ---
        "components": 200,  # n for SVD - optional
            Simply the n_components for TruncatedSVD. If not included, SVD
            will not run for your experiment.
        ---
        "save": ("log", "model")  # include whichever
        "save": ("log", "model", "db", "man", "json", "pickle")  # any combination
            Save the output of the log, or dump the entire model with its
            classification method and pipeline wrapper for new data instances.
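
The expanded "save" tuple is the substantive change in this hunk; a configuration could now request any combination of targets, e.g. (a sketch with placeholder values, not settings from this repository):

conf = {
    "name": "gram",
    "save": ("log", "model", "db", "json"),  # log + model dump + blitzdb + jsonpickle
}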
1 change: 1 addition & 0 deletions omesa/featurizer.py
@@ -92,6 +92,7 @@ def transform(self, stream):
        X : numpy array of shape [n_samples, n_features]
            Training data returned when applying the transform function.
        """
        # FIXME: move this outside of transform scope so it can deal with
        # a single instance
        for instance in stream:
            # pad missing trailing fields (parse, meta) with None
            label, raw, parse, meta = instance + (None,) * (4 - len(instance))
            v = {}
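
The padding idiom above fills absent trailing fields with None, so two-field and four-field instances unpack alike — illustratively:

instance = ('pos', 'some text')
label, raw, parse, meta = instance + (None,) * (4 - len(instance))
# label='pos', raw='some text', parse=None, meta=None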
95 changes: 67 additions & 28 deletions omesa/io.py
@@ -1,56 +1,95 @@
"""Data handling functions."""

import csv
import sys
import json
import numpy as np
import pickle
import sys
from types import GeneratorType
from inspect import isclass, isgenerator

import numpy as np
from .tools import serialize_sk as sr

# pylint: disable=R0903,R0913,W0141
# pylint: disable=R0903,R0913,W0141,C0103


class Pipeline(object):
    """Shell for experiment pipeline storing and handling.

    Parameters
    ----------
    vec : class
        Instance of Vectorizer with fitted pipes.
    exp : class, optional, default None
        Instance of Experiment with fitted pipes. If not supplied, name and
        source should be set.
    name : str, optional, default None
        Name that the pipeline should be saved/loaded under/from.
    clf : class
        Classifier that adheres to the sklearn type (with a predict function).
    source : tuple, optional, default None
        Tuple with storage options; can be "man" (manual json serialization),
        "json" (for jsonpickle, requires this package), or "db" (for database
        storage, requires blitzdb).
    """

    def __init__(self, exp):
    def __init__(self, exp=None, name=None, source=None):
        """Set the pipeline for transformation and clf for classification."""
        self.vec = exp.vec
        self.clf = exp.clf
        if not exp:
            assert name
        self.vec = exp.vec if exp else None
        self.clf = exp.clf if exp else None
        self.hook = self.vec.conf['name'] if not name else name
        self.serialize = None
        self.storage = self.vec.conf['save'] if not source else source
        if 'db' in self.storage:
            from .database import Database, Experiment
            self.db = Database()
            self.data = Experiment
        if 'json' in self.storage:
            import jsonpickle
            self.serialize = jsonpickle
        # FIXME: jsonpickle should be preferred, doesn't currently work though
        elif 'man' in self.storage or 'db' in self.storage:
            self.serialize = sr
            # self.hook += '_man'
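
Both serialization back-ends expose the same encode/decode surface, which is why the module objects themselves can be stored in self.serialize: jsonpickle ships these as module-level functions, and this commit renames serialize_sk's pair to match. A quick illustration of the shared call shape:

import jsonpickle

s = jsonpickle.encode({'a': 1})   # same call shape as serialize_sk.encode
assert jsonpickle.decode(s) == {'a': 1}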

    def save(self):
        """bla."""
        """Save experiment and classifier in format specified."""
        print(" Saving experiment...")
        top = self.vec.__dict__
        print(top)
        ser = sr.data_to_json(top)
        print(" done!")
        if 'db' in top['conf']['save']:
            pass
        else:
            json.dump(ser, open(top['conf']['name'] + '.json', 'w'))
        top = {'name': self.hook, 'vec': self.vec, 'clf': self.clf}

        fl = self.hook
        if self.serialize:
            serialized = self.serialize.encode(top)

        if any([x in self.storage for x in ('man', 'json')]) and serialized:
            json.dump(serialized, open(self.hook + '.json', 'w'))
        if 'pickle' in self.storage:
            for t in ('train', 'test'):
                # generators cannot be pickled; blank them out first
                key = '{0}_data'.format(t)
                c = self.vec.conf[key]
                self.vec.conf[key] = '' if isinstance(c, GeneratorType) else c
            pickle.dump(top, open(fl + '.pickle', 'wb'))
        if 'db' in self.storage:
            doc = self.data(json.loads(serialized))
            self.db.save(doc)

    def load(self):
        pass
        """Load experiment and classifier from source specified."""
        if any([x in self.storage for x in ('man', 'json')]):
            mod = self.serialize.decode(json.load(open(self.hook + '.json')))
        if 'pickle' in self.storage:
            mod = pickle.load(open(self.hook + '.pickle', 'rb'))
        if 'db' in self.storage:
            mod = self.db.fetch(self.data, {'name': self.hook})
            mod = self.serialize.decode(json.dumps(dict(mod)))
        self.clf = mod['clf']
        self.vec = mod['vec']

    def classify(self, data):
        """Given a data iterator, return a (label, probability) tuple."""
        self.pipeline.conf['label_column'] = 0
        self.pipeline.conf['text_column'] = 1
        # self.pipeline.loader.handle.labs = None
        v, _ = self.pipeline.test(data)
        # FIXME: this is like a java call
        enc = dict(map(reversed, self.pipeline.featurizer.labels.items()))
        return [enc[l] for l in self.clf.predict(v)], self.clf.predict_proba(v)
        """Given a data point, return a (label, probability) tuple."""
        X, y = self.vec.transform(data)
        X = X.todense().reshape((1, -1))
        # LinearSVC has no predict_proba, so only the label is returned
        return self.clf.predict(X)  # , self.clf.predict_proba(X)
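
To make the new interface concrete, a usage sketch — exp stands for an already-fitted omesa Experiment, and the saved name 'gram' is hypothetical; the calls mirror the methods above:

from omesa.io import Pipeline

pipe = Pipeline(exp)                          # store under every target in conf['save']
pipe.save()

pipe = Pipeline(name='gram', source=('db',))  # later: restore by name from blitzdb
pipe.load()
label = pipe.classify("some new document")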


class CSV:
57 changes: 32 additions & 25 deletions omesa/tools/serialize_sk.py
@@ -29,14 +29,18 @@
import json
import sys

class Dummy:
    """Empty shell used by restore to rebuild objects without __init__."""

    def __init__(self):
        pass


def isnamedtuple(obj):
    """Heuristic check if an object is a namedtuple."""
    return isinstance(obj, tuple) \
        and hasattr(obj, "_fields") \
        and hasattr(obj, "_asdict") \
        and callable(obj._asdict)
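
A quick sanity check of the heuristic (illustrative only):

from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
assert isnamedtuple(Point(1, 2))   # has _fields and a callable _asdict
assert not isnamedtuple((1, 2))    # plain tuples do not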


def serialize(data):
    if data is None or isinstance(data, (bool, int, float, str)):
        return data
@@ -51,18 +55,6 @@ def serialize(data):
"fields": list(data._fields),
"values": [serialize(getattr(data, f)) for f in data._fields]}}
# --- custom ---
try:
_ = data.__next__ # generator as string
return {'py/generator': str(data)}
except AttributeError:
pass
try:
if not isinstance(data, type): # not numpy type
return {'py/class': {'name': data.__class__.__name__,
'mod': data.__module__,
'attr': data_to_json(data.__dict__)}}
except AttributeError as e:
pass
if isinstance(data, type):
return {"py/numpy.type": data.__name__}
if isinstance(data, np.integer):
@@ -82,21 +74,25 @@ def serialize(data):
return {"py/numpy.ndarray": {
"values": data.tolist(),
"dtype": str(data.dtype)}}
# --- custom ---
try:
_ = data.__next__ # generator as string
return {'py/generator': str(data)}
except AttributeError:
pass
try:
if not isinstance(data, type): # not numpy type
return {'py/class': {'name': data.__class__.__name__,
'mod': data.__module__,
'attr': serialize(data.__dict__)}}
except AttributeError as e:
print(e)
raise TypeError("Type %s not data-serializable" % type(data))


def restore(dct):
    # --- custom ---
    print(dct)
    if "py/generator" in dct:
        return []
    if "py/class" in dct:
        obj = dct["py/class"]
        cls_ = getattr(sys.modules[obj['mod']], obj['name'])
        class_init = cls_()
        for k, v in restore(obj['attr']):
            setattr(class_init, k, v)
        return class_init
    # print(dct)
    if "py/numpy.type" in dct:
        return np.dtype(dct["py/numpy.type"]).type
    if "py/numpy.int" in dct:
@@ -118,10 +114,21 @@ def restore(dct):
return np.array(data["values"], dtype=data["dtype"])
if "py/collections.OrderedDict" in dct:
return OrderedDict(dct["py/collections.OrderedDict"])
# --- custom ---
if "py/generator" in dct:
return []
if "py/class" in dct:
obj = dct["py/class"]
cls_ = getattr(sys.modules[obj['mod']], obj['name'])
class_init = Dummy()
class_init.__class__ = cls_
for k, v in restore(obj['attr']).items():
setattr(class_init, k, v)
return class_init
return dct

def data_to_json(data):
def encode(data):
    return json.dumps(serialize(data))

def json_to_data(s):
def decode(s):
    return json.loads(s, object_hook=restore)
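
A roundtrip sketch of the renamed pair — the Config class below is hypothetical and only exists to exercise the py/class path (rebuilt through the Dummy shell, without re-running __init__):

import numpy as np


class Config:
    def __init__(self):
        self.weights = np.array([1.0, 2.0])
        self.name = 'demo'


s = encode(Config())   # JSON string tagged with py/class and py/numpy.ndarray
obj = decode(s)        # attributes restored onto a rebranded Dummy shell
assert obj.name == 'demo' and isinstance(obj, Config)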
