In [1]:
import re, os, sys, csv, time, joblib, uuid
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from termcolor import cprint
import matplotlib.pyplot as plt
from datetime import date
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
%matplotlib inline

MODEL_VERSION = "0.1"
SMALL_SIZE = 10
MEDIUM_SIZE = 11
LARGE_SIZE = 12

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=LARGE_SIZE)   # fontsize of the figure title

def slide_print(text, color='white'):
    cprint(text, color, 'on_grey')

In [2]:
def train_model(X, y, saved_model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
    params = {'C': 1.0, 'kernel': 'linear', 'gamma': 0.5}
    
    clf = svm.SVC(**params, probability = True)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(classification_report(y_test, y_pred))
    
    clf.fit(X, y)
    joblib.dump(clf, saved_model)

In [3]:
def predict(query, saved_model, verbose = True):
    if verbose:
        print('...predicting')
    time_start = time.time()
    model = joblib.load(saved_model)
    
    if isinstance(query, list):
        query = np.array([query])
    if len(query.shape) == 1:
        query = query.reshape(1, -1)
    
    y_pred = model.predict(query)
    y_proba = None
    if 'predict_proba' in dir(model) and model.probability == True:
        y_proba = model.predict_proba(query)
        
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = '%03d:%02d:%02d' % (h, m, s)
    _update_predict_log(y_pred, y_proba, query, runtime)
    
    return y_pred

In [7]:
def _update_predict_log(y_pred, y_proba, query, runtime):
    today = date.today()
    logfile = 'iris-svm-{}-{}.log'.format(today.year, today.month)
    
    header = [
        'unique_id',
        'timestamp',
        'y_pred',
        'y_proba',
        'x.shape',
        'model_version',
        'runtime'
    ]
    write_header = False
    
    if not os.path.exists(logfile):
        write_header = True
    with open(logfile, 'a') as csv_file:
        writer = csv.writer(csv_file, delimiter = ',', quotechar = '|')
        if write_header:
            writer.writerow(header)
        to_write = map(str, [
            uuid.uuid4(),
            time.time(),
            y_pred,
            y_proba,
            query.shape,
            MODEL_VERSION,
            runtime
        ])
        writer.writerow(to_write)

In [8]:
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

today = date.today()
logfile = 'iris-svm-{}-{}.log'.format(today.year, today.month)
if os.path.exists(logfile):
    os.remove(logfile)

saved_model = 'iris-svm-{}.joblib'.format(re.sub('\.', '_', str(MODEL_VERSION)))
train_model(X, y, saved_model)

queries = [
    [6.1, 2.8],
    [7.7, 2.5],
    [5.8, 3.8]
]
y_pred = [predict(query, saved_model)[0] for query in queries]
print(f'predicted: {y_pred}')
print(f1_score([1, 2, 0], y_pred, average = 'weighted'))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.67      0.53      0.59        15
           2       0.63      0.75      0.69        16

    accuracy                           0.78        50
   macro avg       0.77      0.76      0.76        50
weighted avg       0.78      0.78      0.78        50

...predicting
...predicting
...predicting
predicted: [1, 2, 0]
1.0


In [9]:
Counter(y).items()

dict_items([(0, 50), (1, 50), (2, 50)])

In [10]:
def simulate_samples(nsamples, X, y, weights):
    totals = np.round(np.array(weights) * nsamples).astype(int)
    indices = np.arange(y.size)
    new_indices = []
    for i, c in enumerate(np.unique(y)):
        new_indices.extend(np.random.choice(indices[y == c], totals[i], replace = True))
    y_new = y[new_indices]
    X_new = X[new_indices, :]
    return X_new, y_new

n = 150
weights = np.array([0.25, 0.25, 0.50])
X_new, y_new = simulate_samples(n, X, y, weights)
Counter(y_new).items()

dict_items([(0, 38), (1, 38), (2, 75)])

In [12]:
drifting_weights = [np.array([(100 - p) / 2.0, (100 - p) / 2.0 ,p]) / 100.0 for p in np.arange(33, 95, 5)]
n = 100
for weights in drifting_weights:
    X_new, y_new = simulate_samples(n, X, y, weights)
    percent_class_3 = np.round(y_new[y_new == 2].size / y_new.size, 2)
    
    y_pred = [predict(X_new[row, :], saved_model, verbose = False)[0] for row in range(y_new.shape[0])]
    f1 = np.round(f1_score(y_new, y_pred, average = 'weighted'), 2)
    print(f'percent class 3: {percent_class_3}, f1_score: {f1}')

percent class 3: 0.33, f1_score: 0.88
percent class 3: 0.38, f1_score: 0.86
percent class 3: 0.43, f1_score: 0.82
percent class 3: 0.48, f1_score: 0.83
percent class 3: 0.52, f1_score: 0.81
percent class 3: 0.58, f1_score: 0.77
percent class 3: 0.64, f1_score: 0.8
percent class 3: 0.68, f1_score: 0.74
percent class 3: 0.72, f1_score: 0.79
percent class 3: 0.78, f1_score: 0.77
percent class 3: 0.84, f1_score: 0.76
percent class 3: 0.88, f1_score: 0.81
percent class 3: 0.92, f1_score: 0.77


In [14]:
from sklearn.covariance import EllipticEnvelope
from scipy.stats import wasserstein_distance

clf_y = EllipticEnvelope(random_state = 0, contamination = 0.01)
clf_X = EllipticEnvelope(random_state = 0, contamination = 0.01)

clf_X.fit(X)
clf_y.fit(y.reshape(y.size, 1))

results = defaultdict(list)

for weights in drifting_weights:
    X_new, y_new = simulate_samples(n, X, y, weights)
    results['class_3_percent'].append(np.round(y_new[y_new == 2].size, 2))
    results['wasserstein_X'].append(np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
    results['wasserstein_y'].append(np.round(wasserstein_distance(y, y_new), 2))
    test1 = clf_X.predict(X_new)
    test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
    results['outlier_percent_X'].append(np.round(1.0 - (test1[test1 == 1].size / test1.size), 2))
    results['outlier_percent_y'].append(np.round(1.0 - (test2[test2 == 1].size / test2.size), 2))

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,class_3_percent,wasserstein_X,wasserstein_y,outlier_percent_X,outlier_percent_y
0,33,0.06,0.01,0.02,0.0
1,38,0.07,0.07,0.0,0.0
2,43,0.08,0.15,0.01,0.0
3,48,0.17,0.22,0.01,0.0
4,53,0.15,0.29,0.04,0.0
5,58,0.2,0.37,0.01,0.0
6,63,0.17,0.45,0.01,0.0
7,68,0.25,0.52,0.0,0.0
8,73,0.25,0.58,0.01,0.0
9,78,0.33,0.67,0.0,0.0


In [15]:
(
results_df
    .style
    .hide_index()
     .bar(color='lightblue', vmin=0, subset=['wasserstein_X'], align='zero')
     .bar(color='lightblue', vmin=0, subset=['wasserstein_y'], align='zero')
     .bar(color='red', vmin=0, vmax=0.1,subset=['outlier_percent_X'], align='zero')
     .bar(color='red', vmin=0, vmax=0.1,subset=['outlier_percent_y'], align='zero')
     .set_caption('Performance Monitoring')
)

class_3_percent,wasserstein_X,wasserstein_y,outlier_percent_X,outlier_percent_y
33,0.06,0.01,0.02,0.0
38,0.07,0.07,0.0,0.0
43,0.08,0.15,0.01,0.0
48,0.17,0.22,0.01,0.0
53,0.15,0.29,0.04,0.0
58,0.2,0.37,0.01,0.0
63,0.17,0.45,0.01,0.0
68,0.25,0.52,0.0,0.0
73,0.25,0.58,0.01,0.0
78,0.33,0.67,0.0,0.0
