In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter
import xmltodict
import pyodbc
import os
import itertools
import time
import json
import requests
import boto3
import pickle
import glob
import math
from datetime import datetime
from pandas.io.json import json_normalize
from collections import OrderedDict, Counter
import json, ast
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import json_tricks
import sklearn
import copy
from collections import Counter, defaultdict
import xgboost

In [None]:
from sklearn.datasets import load_boston

### Configuration and Constructors

In [None]:
# Credentials and deployment settings are read from environment variables so
# that no secrets are stored in the notebook itself. os.getenv returns None
# for any variable that is not set.
accountname = os.getenv('USER')
accountkey = os.getenv('PASSWORD')
aws_access_key = os.getenv('AWS_ACCESS_KEY')
aws_secret_key = os.getenv('AWS_SECRET_KEY')
s3_bucket = os.getenv('S3_BUCKET')

# MODE may be unset; fall back to an empty string so that .lower() cannot
# raise AttributeError on None.
env = (os.getenv('MODE') or '').lower()
# 'api' is treated as an alias for 'prod'.
if env == 'api':
    env = 'prod'

In [None]:
# S3 client authenticated with the access/secret key pair read from the
# environment.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

In [None]:
# Raise the display limits so wide and long frames are shown without truncation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

In [None]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
srcdata = load_boston()

In [None]:
# Build the feature frame in one step, labelling the columns with the
# dataset's feature names.
df = pd.DataFrame(srcdata['data'], columns=srcdata['feature_names'])

In [None]:
# Store the dataset target, scaled by 1000, as the 'objective' column.
df['objective'] = 1000 * srcdata['target']

In [None]:
# Preview the first rows of the frame (features plus the 'objective' column).
df.head()

# Feature Engineering

In [None]:
# AGE against the objective column, which holds the price at this point in
# the notebook.
fig, ax = plt.subplots()
ax.scatter(df['AGE'], df['objective'])
ax.set_xlabel('AGE')
ax.set_ylabel('PRICE')
plt.show()

In [None]:
# Binarize the target: True where the price is strictly above the sample mean.
# Note that this overwrites the continuous 'objective' column in place.
mean_price = df['objective'].mean()
print(mean_price)
df['objective'] = df['objective'].gt(mean_price)

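Binarizing the price into above/below the mean is not necessarily useful in practice, but it turns the task into a simple classification problem, which is good for demonstration purposes.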

In [None]:
# Same scatter as before, now against the binarized objective column.
fig, ax = plt.subplots()
ax.scatter(df['AGE'], df['objective'])
ax.set_xlabel('AGE')
ax.set_ylabel('PRICE > mean(PRICE)')
plt.show()

# Modeling

In [None]:
# Separate the target column from the feature matrix.
y = df['objective']
X = df.drop(columns=['objective'])

In [None]:
# Logistic-regression classifier constructed with no arguments, i.e. with the
# library's default hyperparameters.
clf = LogisticRegression()

In [None]:
# 4-fold cross-validation splitter.
# NOTE(review): no shuffle/random_state is passed, so the folds presumably
# follow the row order of the data — confirm this is intended.
kf = KFold(n_splits=4)

In [None]:
models = []
for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = clf.fit(X_train, y_train)
    models.append(model)

In [None]:
model = models[0]
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of y_test against y_pred, converted to nested Python lists.
cm = confusion_matrix(y_test, y_pred).tolist()

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          img_fname=None):
    """
    Print a confusion matrix and plot it as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, each row is divided by its sum before printing and plotting.
        Rows that sum to zero are left as zeros instead of becoming NaN.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    img_fname : str or None, default None
        If given, the figure is also saved to this path before it is shown.
        (The argument was previously accepted but ignored.)
    """
    cm = np.array(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 for classes without any true samples: dividing the
        # all-zero row by 1 keeps it at zero.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # 'd' can only format integers; any non-integer matrix (normalized here or
    # passed in already as floats) is shown with two decimals.
    fmt = 'd' if cm.dtype.kind in 'iu' else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out once, after the axis labels exist, so they are accounted for.
    plt.tight_layout()
    if img_fname is not None:
        # Save before show(): the figure may be released once it is displayed.
        plt.savefig(img_fname)
    plt.show()

In [None]:
# Tick labels for the two classes.
# NOTE(review): assumes the matrix rows/columns are ordered False, True
# (i.e. not-above-mean first) — confirm against confusion_matrix's ordering.
classes = ['Below average', 'Above average']
plot_confusion_matrix(cm, classes=classes)

# Testing

* Run `python model_service.py` locally first, so that the prediction endpoint used below is listening on port 9988.

In [None]:
# Use the first row as a sample request payload, without the label column.
d = df.iloc[0].to_dict()
d.pop('objective')
d

In [None]:
# POST the sample features to the locally running model service.
# A timeout is set because requests otherwise waits indefinitely when the
# service is not reachable, which would hang the notebook.
r = requests.post('http://0.0.0.0:9988/api/predict/latest', data=d, timeout=10)

In [None]:
# Decode the raw response body as UTF-8 and parse it as JSON; the parsed
# object is the cell's displayed output.
json.loads(r.content.decode('utf-8'))

# Deployment

In [None]:
# Local file name of the pickled model: '<model_name>.pickle'.
model_name = 'latest'
fname = '{}.pickle'.format(model_name)

In [None]:
# Serialize the classifier to disk. The context manager guarantees the file
# is flushed and closed, instead of leaving an open handle behind.
with open(fname, 'wb') as f:
    pickle.dump(clf, f)

In [None]:
# Read the pickled model back as raw bytes for upload. Using a context
# manager closes the file even if read() raises.
with open(fname, 'rb') as f:
    body = f.read()

In [None]:
# Object key: the file name placed under the 'models/' prefix.
key = '/'.join(('models', fname))

In [None]:
# Upload the pickled bytes to the configured bucket under `key`.
s3_client.put_object(
    Bucket=s3_bucket,
    Key=key,
    Body=body,
)

# Let's save a copy that is timestamped

In [None]:
# Upload the same bytes a second time under
# models/<year>/<month>/<epoch-milliseconds>.pickle.
# Note: this rebinds `fname` and `key`, which were set by the cells above.
epoch_time = str(int(time.time() * 1000))
fname = '{}.pickle'.format(epoch_time)
n = datetime.now()
key = 'models/{yyyy}/{mm}/{fn}'.format(fn=fname, yyyy=n.year, mm=n.month)
s3_client.put_object(
    Bucket=s3_bucket,
    Key=key,
    Body=body,
)

# TODO: update model tracking database