added JSON serialization and blitzdb back-end
cmry committed Apr 19, 2016
1 parent 0a1e78d commit 4a4db03
Showing 7 changed files with 116 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
shed/utils
omesa/db
omesa/__pycache__
2 changes: 1 addition & 1 deletion examples/n_gram.py
@@ -20,7 +20,7 @@
        "features": [Ngrams(level='char', n_list=[3])],
        "text_column": 1,
        "label_column": 0,
        "save": ("log", "model")
        "save": ("log", "model", "db")
    }
}

13 changes: 12 additions & 1 deletion omesa/database.py
@@ -15,4 +15,15 @@ class Database(object):

    def __init__(self):
        """Load backend."""
        self.db = FileBackend("./db")
        # TODO: I'm sure the path here can be done neater
        self.db = FileBackend(__file__.split('/database.py')[0] + "/db")

    def save(self, doc):
        """Save doc to the database and commit immediately."""
        self.db.save(doc)
        self.db.commit()

    def fetch(self, doc, q):
        """Return the first doc matching query q, if any."""
        try:
            return self.db.filter(doc, q)[0]
        except IndexError:
            print("File does not exist.")
17 changes: 1 addition & 16 deletions omesa/experiment.py
@@ -91,22 +91,7 @@ class Experiment(object):
        classifiers that can be set to output probabilities.
        ---
        "setting": "grid",  # optional
            Currently this can only be set to grid. If it is, the classifier
            of choice will be an SVM, for which the experiment will try to
            optimize the parameter settings through sklearn's GridSearchCV.
            At some point, it might be useful to be able to provide the
            parameters yourself (for other classifiers, for example).
        ---
        "components": 200,  # n for SVD - optional
            Simply the n_components for TruncatedSVD. If not included, SVD
            will not run for your experiment.
        ---
        "save": ("log", "model")  # include whichever
        "save": ("log", "model", "db", "man", "json", "pickle")  # any combination
            Save the output of the log, or dump the entire model with its
            classification method and pipeline wrapper for new data instances.
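
The expanded "save" tuple is the substantive change in this hunk; a configuration could now request any combination of targets, e.g. (a sketch with placeholder values, not settings from this repository):

conf = {
    "name": "gram",
    "save": ("log", "model", "db", "json"),  # log + model dump + blitzdb + jsonpickle
}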
1 change: 1 addition & 0 deletions omesa/featurizer.py
@@ -92,6 +92,7 @@ def transform(self, stream):
        X : numpy array of shape [n_samples, n_features]
            Training data returned when applying the transform function.
        """
        # FIXME: move this outside of transform scope so it can deal with
        # a single instance
        for instance in stream:
            # pad missing trailing fields (parse, meta) with None
            label, raw, parse, meta = instance + (None,) * (4 - len(instance))
            v = {}
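
The padding idiom above fills absent trailing fields with None, so two-field and four-field instances unpack alike — illustratively:

instance = ('pos', 'some text')
label, raw, parse, meta = instance + (None,) * (4 - len(instance))
# label='pos', raw='some text', parse=None, meta=None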
95 changes: 67 additions & 28 deletions omesa/io.py
@@ -1,56 +1,95 @@
"""Data handling functions."""

import csv
import sys
import json
import numpy as np
import pickle
import sys
from types import GeneratorType
from inspect import isclass, isgenerator

import numpy as np
from .tools import serialize_sk as sr

# pylint: disable=R0903,R0913,W0141
# pylint: disable=R0903,R0913,W0141,C0103


class Pipeline(object):
    """Shell for experiment pipeline storing and handling.

    Parameters
    ----------
    vec : class
        Instance of Vectorizer with fitted pipes.
    exp : class, optional, default None
        Instance of Experiment with fitted pipes. If not supplied, name and
        source should be set.
    name : str, optional, default None
        Name that the pipeline should be saved/loaded under/from.
    clf : class
        Classifier that adheres to the sklearn type (with a predict function).
    source : tuple, optional, default None
        Tuple with storage options; can be "man" (manual json serialization),
        "json" (for jsonpickle, requires this package), or "db" (for database
        storage, requires blitzdb).
    """

    def __init__(self, exp):
    def __init__(self, exp=None, name=None, source=None):
        """Set the pipeline for transformation and clf for classification."""
        self.vec = exp.vec
        self.clf = exp.clf
        if not exp:
            assert name
        self.vec = exp.vec if exp else None
        self.clf = exp.clf if exp else None
        self.hook = self.vec.conf['name'] if not name else name
        self.serialize = None
        self.storage = self.vec.conf['save'] if not source else source
        if 'db' in self.storage:
            from .database import Database, Experiment
            self.db = Database()
            self.data = Experiment
        if 'json' in self.storage:
            import jsonpickle
            self.serialize = jsonpickle
        # FIXME: jsonpickle should be preferred, doesn't currently work though
        elif 'man' in self.storage or 'db' in self.storage:
            self.serialize = sr
            # self.hook += '_man'
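
Both serialization back-ends expose the same encode/decode surface, which is why the module objects themselves can be stored in self.serialize: jsonpickle ships these as module-level functions, and this commit renames serialize_sk's pair to match. A quick illustration of the shared call shape:

import jsonpickle

s = jsonpickle.encode({'a': 1})   # same call shape as serialize_sk.encode
assert jsonpickle.decode(s) == {'a': 1}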

    def save(self):
        """bla."""
        """Save experiment and classifier in format specified."""
        print(" Saving experiment...")
        top = self.vec.__dict__
        print(top)
        ser = sr.data_to_json(top)
        print(" done!")
        if 'db' in top['conf']['save']:
            pass
        else:
            json.dump(ser, open(top['conf']['name'] + '.json', 'w'))
        top = {'name': self.hook, 'vec': self.vec, 'clf': self.clf}

        fl = self.hook
        if self.serialize:
            serialized = self.serialize.encode(top)

        if any([x in self.storage for x in ('man', 'json')]) and serialized:
            json.dump(serialized, open(self.hook + '.json', 'w'))
        if 'pickle' in self.storage:
            for t in ('train', 'test'):
                # generators cannot be pickled; blank them out first
                key = '{0}_data'.format(t)
                c = self.vec.conf[key]
                self.vec.conf[key] = '' if isinstance(c, GeneratorType) else c
            pickle.dump(top, open(fl + '.pickle', 'wb'))
        if 'db' in self.storage:
            doc = self.data(json.loads(serialized))
            self.db.save(doc)

    def load(self):
        pass
        """Load experiment and classifier from source specified."""
        if any([x in self.storage for x in ('man', 'json')]):
            mod = self.serialize.decode(json.load(open(self.hook + '.json')))
        if 'pickle' in self.storage:
            mod = pickle.load(open(self.hook + '.pickle', 'rb'))
        if 'db' in self.storage:
            mod = self.db.fetch(self.data, {'name': self.hook})
            mod = self.serialize.decode(json.dumps(dict(mod)))
        self.clf = mod['clf']
        self.vec = mod['vec']

    def classify(self, data):
        """Given a data iterator, return a (label, probability) tuple."""
        self.pipeline.conf['label_column'] = 0
        self.pipeline.conf['text_column'] = 1
        # self.pipeline.loader.handle.labs = None
        v, _ = self.pipeline.test(data)
        # FIXME: this is like a java call
        enc = dict(map(reversed, self.pipeline.featurizer.labels.items()))
        return [enc[l] for l in self.clf.predict(v)], self.clf.predict_proba(v)
        """Given a data point, return a (label, probability) tuple."""
        X, y = self.vec.transform(data)
        X = X.todense().reshape((1, -1))
        # LinearSVC has no predict_proba, so only the label is returned
        return self.clf.predict(X)  # , self.clf.predict_proba(X)
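
To make the new interface concrete, a usage sketch — exp stands for an already-fitted omesa Experiment, and the saved name 'gram' is hypothetical; the calls mirror the methods above:

from omesa.io import Pipeline

pipe = Pipeline(exp)                          # store under every target in conf['save']
pipe.save()

pipe = Pipeline(name='gram', source=('db',))  # later: restore by name from blitzdb
pipe.load()
label = pipe.classify("some new document")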


class CSV:
57 changes: 32 additions & 25 deletions omesa/tools/serialize_sk.py
@@ -29,14 +29,18 @@
import json
import sys

class Dummy:
    """Empty shell used by restore to rebuild objects without __init__."""

    def __init__(self):
        pass


def isnamedtuple(obj):
    """Heuristic check if an object is a namedtuple."""
    return isinstance(obj, tuple) \
        and hasattr(obj, "_fields") \
        and hasattr(obj, "_asdict") \
        and callable(obj._asdict)
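
A quick sanity check of the heuristic (illustrative only):

from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
assert isnamedtuple(Point(1, 2))   # has _fields and a callable _asdict
assert not isnamedtuple((1, 2))    # plain tuples do not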


def serialize(data):
    if data is None or isinstance(data, (bool, int, float, str)):
        return data
@@ -51,18 +55,6 @@ def serialize(data):
"fields": list(data._fields),
"values": [serialize(getattr(data, f)) for f in data._fields]}}
# --- custom ---
try:
_ = data.__next__ # generator as string
return {'py/generator': str(data)}
except AttributeError:
pass
try:
if not isinstance(data, type): # not numpy type
return {'py/class': {'name': data.__class__.__name__,
'mod': data.__module__,
'attr': data_to_json(data.__dict__)}}
except AttributeError as e:
pass
if isinstance(data, type):
return {"py/numpy.type": data.__name__}
if isinstance(data, np.integer):
@@ -82,21 +74,25 @@ def serialize(data):
return {"py/numpy.ndarray": {
"values": data.tolist(),
"dtype": str(data.dtype)}}
# --- custom ---
try:
_ = data.__next__ # generator as string
return {'py/generator': str(data)}
except AttributeError:
pass
try:
if not isinstance(data, type): # not numpy type
return {'py/class': {'name': data.__class__.__name__,
'mod': data.__module__,
'attr': serialize(data.__dict__)}}
except AttributeError as e:
print(e)
raise TypeError("Type %s not data-serializable" % type(data))


def restore(dct):
    # --- custom ---
    print(dct)
    if "py/generator" in dct:
        return []
    if "py/class" in dct:
        obj = dct["py/class"]
        cls_ = getattr(sys.modules[obj['mod']], obj['name'])
        class_init = cls_()
        for k, v in restore(obj['attr']):
            setattr(class_init, k, v)
        return class_init
    # print(dct)
    if "py/numpy.type" in dct:
        return np.dtype(dct["py/numpy.type"]).type
    if "py/numpy.int" in dct:
@@ -118,10 +114,21 @@ def restore(dct):
return np.array(data["values"], dtype=data["dtype"])
if "py/collections.OrderedDict" in dct:
return OrderedDict(dct["py/collections.OrderedDict"])
# --- custom ---
if "py/generator" in dct:
return []
if "py/class" in dct:
obj = dct["py/class"]
cls_ = getattr(sys.modules[obj['mod']], obj['name'])
class_init = Dummy()
class_init.__class__ = cls_
for k, v in restore(obj['attr']).items():
setattr(class_init, k, v)
return class_init
return dct

def data_to_json(data):
def encode(data):
    return json.dumps(serialize(data))

def json_to_data(s):
def decode(s):
    return json.loads(s, object_hook=restore)
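
A roundtrip sketch of the renamed pair — the Config class below is hypothetical and only exists to exercise the py/class path (rebuilt through the Dummy shell, without re-running __init__):

import numpy as np


class Config:
    def __init__(self):
        self.weights = np.array([1.0, 2.0])
        self.name = 'demo'


s = encode(Config())   # JSON string tagged with py/class and py/numpy.ndarray
obj = decode(s)        # attributes restored onto a rebranded Dummy shell
assert obj.name == 'demo' and isinstance(obj, Config)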
