# Straggler experiments (figure 9)

Use bagging to train an ensemble of models for Clipper and demonstrate how stragglers (ensemble members whose predictions are missing) affect prediction accuracy.

In [1]:
import pandas as pd
import numpy as np
import sklearn.linear_model
%matplotlib inline
import matplotlib.pyplot as plt
import os
import sys
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
import warnings
# Hide warnings whose message starts with this text so they do not clutter the cell output.
warnings.filterwarnings('ignore', message='Changing the shape of non-C contiguous array')


def load_digits(digits_location, digits_filename = "train-mnist-dense-with-labels.data"):
    """Load a comma-separated, header-less digits file into features and labels.

    Every row of the file is one example: the first column is the label and
    the remaining columns are the feature values.

    Args:
        digits_location: Directory that contains the data file.
        digits_filename: Name of the file inside ``digits_location``.

    Returns:
        A tuple ``(X, y)``; ``X`` holds every column except the first and
        ``y`` holds the first column.
    """
    digits_path = os.path.join(digits_location, digits_filename)
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("Source file: %s" % digits_path)
    data = pd.read_csv(digits_path, sep=",", header=None).values
    print("Number of image files: %d" % len(data))
    y = data[:, 0]
    X = data[:, 1:]
    return (X, y)

def to_image(x):
    """Reshape a flat sequence of 784 values into a 28x28 array."""
    image_shape = [28, 28]
    return np.reshape(x, image_shape)

def display_digit(x):
    """Draw one flattened digit on the current matplotlib axes.

    The vector is reshaped with ``to_image`` and shown without pixel
    interpolation.
    """
    pixels = to_image(x)
    plt.imshow(pixels, interpolation='none')

def display_random_digits(X, y):
    """Show up to 16 randomly chosen rows of ``X`` as digits in a 4x4 grid.

    Args:
        X: Array with one flattened 28x28 digit per row.
        y: Labels for ``X``. Not used; kept so existing callers keep working.
    """
    ind = np.random.permutation(len(X))
    plt.figure()
    # Cap the panel count at the number of rows so small inputs do not
    # raise IndexError.
    for i in range(min(16, len(X))):
        plt.subplot(4, 4, i + 1)
        display_digit(X[ind[i], :])
    # Redraw once, after all panels have been added, instead of per panel.
    plt.draw()

        
def normalize_digits(X):
    """Standardize each column of ``X`` to zero mean and unit variance.

    Columns whose variance is not positive (constant columns) are only
    mean-centered: they are divided by 1 so the result stays finite.

    Args:
        X: Array with one example per row and one feature per column.

    Returns:
        Array of the same shape as ``X`` holding the standardized values.
    """
    mu = np.mean(X, 0)
    variance = np.var(X, 0)
    # Vectorized replacement for the per-column loop: sqrt(variance) where
    # the variance is positive, 1 everywhere else.
    scale = np.where(variance > 0, np.sqrt(variance), 1.)
    return (X - mu) / scale

def fourier_project(X, nfeatures = 4096, scale = 1.e-4):
    """Project ``X`` onto random cosine features.

    A Gaussian weight matrix (standard deviation ``scale``) and one uniform
    phase in [0, 2*pi) per output feature are drawn from ``np.random``; the
    result is ``cos(X.dot(W) + phase)``.

    Args:
        X: 2-D array of shape (n, d).
        nfeatures: Number of output features.
        scale: Standard deviation of the Gaussian weights.

    Returns:
        Array of shape (n, nfeatures).
    """
    _, input_dim = X.shape
    # Draw order (weights first, then phases) matters for seeded runs.
    weights = np.random.normal(scale = scale, size = [input_dim, nfeatures])
    phase_shift = np.random.uniform(size = [1, nfeatures]) * 2.0 * np.pi
    return np.cos(X.dot(weights) + phase_shift)

def filter_two_class(X, y, digitA = 3, digitB = 9):
    """Reduce the data to two digits and relabel them as a binary problem.

    Labels are matched against ``digit + 1``, i.e. digit ``d`` is selected by
    the label value ``d + 1``.

    Args:
        X: Array of examples, indexable by a boolean mask over rows.
        y: Array of labels aligned with ``X``.
        digitA: Digit that becomes the positive class (label 1.0).
        digitB: Digit that becomes the negative class (label 0.0).

    Returns:
        A tuple ``(X_subset, binary_labels)`` restricted to the two digits.
    """
    label_a = digitA + 1
    label_b = digitB + 1
    is_a = (y == label_a)
    keep = is_a | (y == label_b)
    positive = is_a * 1.
    return (X[keep], positive[keep])
    
#     return (yInd, yBinary[yInd])


def train_test_split(y, propTrain = 0.75):
    """Randomly partition the indices of ``y`` into train and test sets.

    Args:
        y: Array of labels; only its length and the fraction of entries
            equal to 1.0 are used.
        propTrain: Fraction of the examples assigned to the training set.

    Returns:
        A tuple ``(train_ind, test_ind)`` of disjoint index arrays that
        together cover ``range(len(y))``.
    """
    ind = np.random.permutation(len(y))
    # The split point is a *position* in the shuffled order. The earlier code
    # looked up ind[0.75 * len(y)], which used a float index, ignored
    # propTrain, and took a random element of the permutation as the cut.
    split_ind = int(propTrain * len(y))
    train_ind = ind[:split_ind]
    test_ind = ind[split_ind:]
    # Single-argument print(...) keeps the output identical on Python 2 and 3.
    print("Train size:  %d" % len(train_ind))
    print("Train true:  %s" % np.mean(y[train_ind] == 1.0))
    print("Test size:   %d" % len(test_ind))
    print("Test true:   %s" % np.mean(y[test_ind] == 1.0))
    return (train_ind, test_ind)


In [2]:
# # Load data
mnist_dir = os.path.expanduser("/Users/crankshaw/model-serving/data/mnist_data")

# Training split: relies on load_digits' default file name.
train_x, train_y = load_digits(mnist_dir)
train_x = normalize_digits(train_x)

# Held-out split.
# NOTE(review): the test features are standardized with their own column
# statistics rather than the training split's -- confirm this is intended.
test_x, test_y = load_digits(mnist_dir, "test-mnist-dense-with-labels.data")
test_x = normalize_digits(test_x)



Source file: /Users/crankshaw/model-serving/data/mnist_data/train-mnist-dense-with-labels.data
Number of image files: 60000
Source file: /Users/crankshaw/model-serving/data/mnist_data/test-mnist-dense-with-labels.data
Number of image files: 10000


In [3]:


def run_trial(ensemble_size=16):
    """Train one bagged ensemble and measure accuracy as members are added.

    Reads the notebook-level ``train_x``, ``train_y``, ``test_x`` and
    ``test_y``. Prints the full-ensemble score, each member's individual
    score, and the majority-vote score of the first k members for
    k = 1..ensemble_size.

    Args:
        ensemble_size: Number of bagged members to train.

    Returns:
        1-D array of length ``ensemble_size``. Entry ``i`` is the
        majority-vote test accuracy of the first ``ensemble_size - i``
        members, so index 0 is the full ensemble and the last entry is a
        single member.
    """
    # Each bagged member is a small random forest fit on 5% of the training set.
    models = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=5), n_estimators=ensemble_size, max_samples=0.05, n_jobs=-1)
    models.fit(train_x, train_y)

    # BaggingClassifier fits its members on label *indices* into
    # models.classes_, so a member's predictions are positions in classes_,
    # not original labels. Predict once per member and reuse the result.
    member_preds = np.array([m.predict(test_x) for m in models.estimators_]).astype(int)
    num_test = len(test_y)

    # Look at baseline score
    print("Ensemble score: %f" % models.score(test_x, test_y))
    print("individual scores")
    for preds in member_preds:
        # Map positions back to labels before comparing with test_y.
        print(float(np.sum(models.classes_[preds] == test_y)) / float(num_test))

    # Run straggler experiment: accumulate one vote per member and re-score
    # after each addition. Ties go to the lowest class position (np.argmax),
    # exactly as a per-example vote count would resolve them.
    rows = np.arange(num_test)
    votes = np.zeros((num_test, len(models.classes_)))

    print("cumulative score")
    scores = []
    for preds in member_preds:
        # Row indices are unique, so the fancy-indexed += adds exactly one
        # vote per test example.
        votes[rows, preds] += 1
        y_hat = models.classes_[np.argmax(votes, axis=1)]
        score = float(np.sum(y_hat == test_y)) / float(num_test)
        print(score)
        scores.append(score)
    # Callers expect the largest ensemble first.
    scores.reverse()
    return np.array(scores)

In [5]:
# Repeat the experiment so the per-size accuracies can be averaged later.
NUM_TRIALS = 6
trials = []
for t in range(NUM_TRIALS):
    print("TRIAL %d" % t)
    trial_scores = run_trial()
    trials.append(trial_scores)


TRIAL 0
Ensemble score: 0.929000
individual scores
0.8173
0.7989
0.8031
0.78
0.6382
0.7697
0.7838
0.7868
0.7224
0.8026
0.7941
0.796
0.7967
0.7756
0.7815
0.8114
cumulative score
0.8173
0.7788
0.8467
0.8656
0.8708
0.8787
0.8863
0.8965
0.8976
0.9014
0.9051
0.9082
0.9094
0.9113
0.9117
0.9146
TRIAL 1
Ensemble score: 0.931000
individual scores
0.8005
0.81
0.8144
0.7858
0.7992
0.8069
0.8082
0.7802
0.8063
0.7971
0.7598
0.7898
0.8079
0.7941
0.8109
0.7676
cumulative score
0.8005
0.7805
0.8451
0.8656
0.881
0.8877
0.8916
0.8992
0.9028
0.9062
0.9067
0.9084
0.9109
0.9128
0.9159
0.9165
TRIAL 2
Ensemble score: 0.930400
individual scores
0.7984
0.7868
0.76
0.7972
0.8115
0.7904
0.7802
0.7991
0.808
0.8067
0.8033
0.7909
0.7142
0.7996
0.7816
0.767
cumulative score
0.7984
0.7524
0.8255
0.856
0.8748
0.8843
0.8904
0.8961
0.9008
0.9043
0.907
0.9124
0.913
0.9134
0.9121
0.9133
TRIAL 3
Ensemble score: 0.927800
individual scores
0.7838
0.8003
0.7596
0.8087
0.807
0.7924
0.7943
0.6913
0.6982
0.7679
0.7765
0.8006
0.8

In [10]:
# Column-wise mean and standard deviation across trials, followed by the raw
# per-trial results.
mean_by_position = np.mean(trials, axis=0)
print(mean_by_position)
std_by_position = np.std(trials, axis=0)
print(std_by_position)
print(trials)

[ 0.91386667  0.91245     0.91101667  0.90915     0.90736667  0.90405
  0.90168333  0.8976      0.89371667  0.88836667  0.88068333  0.87293333
  0.85843333  0.83255     0.75778333  0.78701667]
[ 0.00150185  0.00166908  0.00186317  0.00239078  0.00302251  0.00264496
  0.00268912  0.00324088  0.00390957  0.00300592  0.0052123   0.00497583
  0.00672227  0.01015263  0.01744892  0.02088456]
[[0.9146, 0.9117, 0.9113, 0.9094, 0.9082, 0.9051, 0.9014, 0.8976, 0.8965, 0.8863, 0.8787, 0.8708, 0.8656, 0.8467, 0.7788, 0.8173], [0.9165, 0.9159, 0.9128, 0.9109, 0.9084, 0.9067, 0.9062, 0.9028, 0.8992, 0.8916, 0.8877, 0.881, 0.8656, 0.8451, 0.7805, 0.8005], [0.9133, 0.9121, 0.9134, 0.913, 0.9124, 0.907, 0.9043, 0.9008, 0.8961, 0.8904, 0.8843, 0.8748, 0.856, 0.8255, 0.7524, 0.7984], [0.9117, 0.9119, 0.9108, 0.9075, 0.9046, 0.9032, 0.899, 0.8947, 0.8922, 0.8916, 0.8847, 0.8755, 0.8588, 0.8274, 0.7595, 0.7838], [0.9142, 0.9126, 0.9101, 0.9086, 0.9077, 0.9031, 0.8993, 0.8959, 0.8906, 0.8866, 0.8748, 0.8705

In [None]:
# run_trial's `scores` list is local to that function, so rebuild the curve
# from `trials` here. Each trial is ordered largest ensemble first; averaging
# over trials and flipping gives mean accuracy by ascending ensemble size.
# Building a new list (instead of reversing in place) keeps the cell
# idempotent when re-run.
scores = list(np.mean(trials, axis=0)[::-1])
print(scores)
# Ensemble sizes start at 1, not 0.
ensemble_sizes = np.arange(1, len(scores) + 1)
fig, ax = plt.subplots()
ax.plot(ensemble_sizes, scores)
ax.set_xlabel("Size of ensemble")
ax.set_ylabel("Accuracy")
ax.set_ylim((ax.get_ylim()[0],1.0))
plt.show()


In [None]:
# Newer scikit-learn releases no longer ship sklearn.externals.joblib; fall
# back to the standalone joblib package that scikit-learn itself depends on.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
base_path = "/Users/crankshaw/clipper-rust/model_wrappers/python/sklearn_models"
# NOTE(review): requires a notebook-level `models`; the one created inside
# run_trial is local to that function and is not visible here.
for (i,m) in enumerate(models.estimators_):
    dname = os.path.join(base_path, "sklearn_linsvm_10class_%d" % (i+1))
    # Re-running the cell must not fail because the directory already exists.
    if not os.path.isdir(dname):
        os.makedirs(dname)
    fname = os.path.join(dname, "sklearn_linsvm_10class_%d.pkl" % (i+1))
    joblib.dump(m, fname)

In [None]:
# Predictions of the first ensemble member on the first 100 test rows.
# NOTE(review): requires a notebook-level `models`; the one created inside
# run_trial is local to that function and is not visible here.
first_member = models.estimators_[0]
preds = first_member.predict(test_x[:100])

In [None]:
# NOTE(review): requires a notebook-level `scores`; run_trial's `scores` is
# local to that function and is not visible here.
print(scores)

In [None]:
# NOTE(review): duplicate of the previous cell; requires a notebook-level
# `scores`, which run_trial does not provide (its `scores` is local).
print(scores)