In [None]:
# default_exp demo.version_1

# Demo

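> A dummy classification pipeline (data generation, scaling, training, prediction) used to check dependency tracking.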

This is just a dummy set of models that lets me check dependency tracking. The data and classifier settings are adapted from the classifier comparison example in the [sklearn documentation](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html?highlight=predict_proba).

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from stack.imports import *

In [None]:
#export

def load_if_present(path, mode='rb'):
    """Return the object stored at `path`, or `None` when there is nothing to load.

    Parameters
    ----------
    path : str, os.PathLike or None
        Location of the serialized object. `None` means no file is configured.
    mode : str, default 'rb'
        Mode passed to `open`.

    Returns
    -------
    Whatever `load` returns for the file, or `None` if `path` is `None` or
    does not exist.

    Notes
    -----
    NOTE(review): `load` comes from `stack.imports`; if it is pickle-based,
    only point this at trusted files, since unpickling can run arbitrary code.
    """
    if path is None: return None
    path = Path(path)  # accept plain strings as well as Path objects
    if not path.exists(): return None
    # The context manager closes the handle even if `load` raises.
    with open(path, mode) as f:
        return load(f)

def dump_if_path(o, path=None, mode='wb'):
    """Serialize `o` to `path` when a path is given, and return `o` either way.

    Parameters
    ----------
    o : object
        The object to serialize with `dump`.
    path : str, os.PathLike or None, default None
        Destination file. When `None`, nothing is written.
    mode : str, default 'wb'
        Mode passed to `open`.

    Returns
    -------
    `o`, unchanged, so the call can be used inline in an expression.
    """
    if path is not None:
        # Closing the handle deterministically guarantees the data is flushed
        # to disk before anyone tries to read the file back.
        with open(path, mode) as f:
            dump(o, f)
    return o

def get_scaler(X=None, fn=StandardScaler, path=None):
    """Return a scaler, either freshly fitted on `X` or loaded from `path`.

    Parameters
    ----------
    X : array-like or None, default None
        Data to fit on. When given, a new `fn()` is always fitted and any
        scaler already stored at `path` is overwritten.
    fn : callable, default StandardScaler
        Zero-argument factory returning an object with a `fit` method.
    path : str, os.PathLike or None, default None
        Where the fitted scaler is saved (when `X` is given) or loaded from
        (when `X` is `None`).

    Returns
    -------
    The scaler, or `None` when `X` is `None` and nothing could be loaded
    from `path`.
    """
    if X is None:
        # Nothing to fit on: fall back to a previously saved scaler. Nothing
        # is written back here, so a missing file is never replaced by a
        # pickled `None`.
        return load_if_present(path)
    scaler = fn()
    scaler.fit(X)
    if path is not None:
        with open(path, 'wb') as f:
            dump(scaler, f)
    return scaler
        
def generate_data(path=None):
    """Build the demo dataset from sklearn's `make_circles` and scale it.

    The scaler is fitted on the generated features via `get_scaler`, which
    also saves it to `path` when a path is given.

    Parameters
    ----------
    path : str, os.PathLike or None, default None
        Passed through to `get_scaler` as the location for the fitted scaler.

    Returns
    -------
    (X, y, scaler) : the scaled features, the labels, and the fitted scaler.
    """
    # Fixed random_state keeps the dataset identical between runs.
    X, y = make_circles(noise=0.2, factor=0.5, random_state=1)

    scaler = get_scaler(X=X, path=path)
    X = scaler.transform(X)

    return X, y, scaler

def split_data(X, y, test_size=.4, **kwargs):
    """Split `X` and `y` with `train_test_split`, holding out `test_size` by default.

    Any extra keyword arguments (e.g. `random_state`) are forwarded to
    `train_test_split` unchanged, and its return value is passed straight back.
    """
    options = dict(kwargs, test_size=test_size)
    return train_test_split(X, y, **options)

def train_model(X_train, y_train, path=None):
    """Return the classifier stored at `path`, training and saving one if absent.

    When `load_if_present(path)` yields a model it is returned as-is and the
    training data is not used. Otherwise an `SVC` with probability estimates
    enabled is fitted on `X_train` / `y_train` and saved via `dump_if_path`
    (a no-op when `path` is `None`).
    """
    cached = load_if_present(path)
    if cached is not None:
        return cached

    model = SVC(gamma=2, C=1, probability=True)
    model.fit(X_train, y_train)
    dump_if_path(model, path=path)
    return model

def predict(params, root='../tmp'):
    """Classify a single sample using the saved scaler and classifier.

    Parameters
    ----------
    params : array-like
        Feature values for one sample; reshaped to a single row before scaling.
    root : str or os.PathLike, default '../tmp'
        Directory holding `scaler.pkl` and `clf.pkl`.

    Returns
    -------
    (choice, probabilities) : the first (only) entry of `clf.predict` and the
    first row of `clf.predict_proba` for the scaled sample.

    Raises
    ------
    FileNotFoundError
        If the scaler or the classifier could not be loaded from `root`.
    """
    root = Path(root)
    scaler_path = root/'scaler.pkl'
    clf_path = root/'clf.pkl'
    clf = load_if_present(clf_path)
    scaler = load_if_present(scaler_path)

    # Fail with a clear message instead of an AttributeError on `None` below.
    missing = [str(p) for p, o in ((scaler_path, scaler), (clf_path, clf)) if o is None]
    if missing:
        raise FileNotFoundError(
            f"Could not load model artifact(s): {', '.join(missing)}. "
            "Generate the data and train the model first.")

    params = np.asarray(params).reshape(1, -1)
    X = scaler.transform(params)
    choice = clf.predict(X)[0]
    probabilities = clf.predict_proba(X)[0]
    return choice, probabilities

In [None]:
def rm_if_exists(path):
    """Delete the file at `path` if it exists; otherwise do nothing.

    Accepts a string or a `Path`. Always returns `None`.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    if path.exists():
        path.unlink()

# Round-trip check for dump_if_path / load_if_present using a scratch file.
root = Path('../tmp')
root.mkdir(parents=True, exist_ok=True)  # a fresh checkout may not have the scratch directory yet
test_file = root/'test_file'
rm_if_exists(test_file)

# There is no test file currently
assert not test_file.exists()
o = dict(a=1)
dump_if_path(o, path=test_file)
assert test_file.exists()

# Just wrote the test file, so it should load back with the same contents
o1 = load_if_present(test_file)
assert o1['a'] == 1
rm_if_exists(test_file)

In [None]:
root = Path('../tmp')
root.mkdir(parents=True, exist_ok=True)  # the scaler and model files below are written here
scaler_path = root/'scaler.pkl'
model_path = root/'clf.pkl'

# Fits a scaler on the generated data and saves it to `scaler_path`
X, y, scaler = generate_data(path=scaler_path)

X_train, X_test, y_train, y_test = split_data(X, y, random_state=42)
# NOTE: train_model returns the model already stored at `model_path` if there
# is one; delete that file to force retraining.
clf = train_model(X_train, y_train, path=model_path)
score = clf.score(X_test, y_test)
print(score)
assert score > 0.8

0.875


In [None]:
# Smoke test: classify a single two-feature sample with the saved scaler and model
a = [-1.2, -1]
cls, probabilities = predict(a)
i = 1 - cls  # the other class label (assumes the labels are 0 and 1 — TODO confirm)
# The predicted class should also be the one with the higher probability
assert probabilities[cls] > probabilities[i]

### Making Sense

I think I'm OK with this for now:

* It has a few functions that will eventually get moved into a utility module.
* It can make a prediction from a single set of parameters.
* It scales the parameters (using just local files for now).

### Now What?

I now have a trained model and data, but since I don't have the Docker setup, MinIO, or DVC yet, I'm not sure where I want to store them. For now, I'm going to export the building and predicting functions into a library and go from there.