## Train and Test Classifiers

In [13]:
import os
import urllib
from os import path

# Public S3 prefix hosting the two pickle files this notebook loads.
DATA_URL_PREFIX = 'https://s3-us-west-2.amazonaws.com/vegnonveg/'


def download_if_missing(filename):
    """Fetch `filename` from DATA_URL_PREFIX with wget unless it already exists locally.

    Args:
        filename: name of the file, both remotely and in the working directory.

    Raises:
        RuntimeError: if the wget command exits with a non-zero status.
    """
    if path.exists(filename):
        return
    # os.system needs the `os` module itself; `from os import path` alone does
    # not bind the name `os`, which made this cell fail on a fresh kernel.
    status = os.system('wget ' + DATA_URL_PREFIX + filename)
    if status != 0:
        # Fail here rather than later with a confusing error while unpickling.
        raise RuntimeError('wget failed with status %s while fetching %s' % (status, filename))


for pickle_name in ('bottlenecks_with_labels.pkl', 'labels_bigdl_classifier.pkl'):
    download_if_missing(pickle_name)

In [14]:
import pickle
import numpy as np

#Import all the required packages
from bigdl.nn.layer import *
from optparse import OptionParser
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.nn.initialization_method import *
from transformer import *
from imagenet import *
from transformer import Resize
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [15]:
def load_pickle(filename):
    """Return the object unpickled from `filename`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code while loading, so it is
    # only safe on trusted files; these two are downloaded over the network by
    # the first cell of this notebook.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


# `data` is indexed with 'bottleneck_values' and 'labels' further down.
data = load_pickle("./bottlenecks_with_labels.pkl")
# `label_nums` maps each label name to its 0-based integer id.
label_nums = load_pickle("./labels_bigdl_classifier.pkl")
# A single-argument print(...) produces the same output on Python 2 and Python 3.
print(label_nums)

{'Chicken eggs, caged hen, large size': 0, 'Fresh bananas, standard': 1, 'Fresh onions': 2, 'Fresh cucumber': 3, 'Fresh apple, red delicious': 4, 'Fresh potatoes, brown': 5, 'Fresh carrots': 6, 'Fresh oranges': 7, 'Fresh apples, typical local variety': 8}


## Use Stratified Train/Test Split
A stratified split ensures that the distribution of samples across labels is the same in both the train and test sets.

In [16]:

# Hold out 20% of the samples for testing. Stratifying on the labels keeps the
# per-label proportions the same in both parts, and the fixed random_state
# makes the split repeatable.
x_train, x_test, train_labels, test_labels = train_test_split(
    data['bottleneck_values'],
    data['labels'],
    stratify=data['labels'],
    random_state=101,
    test_size=0.2,
)
# Show the sizes of the four pieces as the cell output.
len(x_train), len(train_labels), len(x_test), len(test_labels)

(1541, 1541, 386, 386)

In [17]:
# Fraction of the training samples that carry each label, in the sorted label
# order returned by np.unique.
# Indexing [1] selects the counts without assigning to `_`, which IPython uses
# for the last cell output.
train_counts = np.unique(np.array(train_labels), return_counts=True)[1]
# The builtin `float` is what the `np.float` alias stood for; the alias itself
# has been removed from recent NumPy releases.
train_counts = train_counts.astype(float) / len(train_labels)

In [18]:
# Fraction of the test samples that carry each label, in the sorted label order
# returned by np.unique. Indexing [1] avoids overwriting IPython's `_`.
test_counts = np.unique(np.array(test_labels), return_counts=True)[1]
# The builtin `float` replaces the `np.float` alias removed from recent NumPy.
test_counts = test_counts.astype(float) / len(test_labels)
# Relative difference in label fractions between train and test, in percent.
# The element-wise arithmetic assumes both splits contain the same set of labels.
(train_counts - test_counts) / train_counts * 100

array([ 2.28812638,  0.19430052, -1.21141356,  1.36848522,  0.19430052,
       -0.32824241, -0.27871228, -0.77468686, -1.03786861])

## Classifier #1: BigDL Logistic Regression

In [19]:
# get rdd
def get_rdd_sample(images, labels):
    """Build an RDD of BigDL Samples from feature arrays and their label names.

    Args:
        images: sequence of feature arrays, one per sample.
        labels: sequence of label names (keys of `label_nums`), aligned with `images`.

    Returns:
        An RDD whose elements are `Sample.from_ndarray(features, np.array(class_id))`,
        where `class_id` is the label's `label_nums` id plus one.

    Raises:
        KeyError: if a label name is missing from `label_nums`.
    """
    # label_nums ids start at 0; add 1 to get the 1-based ids used for training
    # (map_groundtruth_label subtracts it again when reading labels back).
    # A list, not a lazy map(), so parallelize gets a concrete sequence on Python 3 too.
    class_ids = [label_nums[name] + 1 for name in labels]
    # NOTE(review): `sc` is not defined in this notebook section; presumably the
    # SparkContext supplied by the environment.
    feature_rdd = sc.parallelize(images)
    label_rdd = sc.parallelize(class_ids)
    # Index into the (features, class_id) pair instead of using a tuple-parameter
    # lambda, which is a syntax error on Python 3.
    return feature_rdd.zip(label_rdd).map(
        lambda pair: Sample.from_ndarray(pair[0], np.array(pair[1])))



In [20]:
# Turn both splits into Sample RDDs with the same helper: training split first,
# then the held-out test split.
train_rdd, test_rdd = (
    get_rdd_sample(features, names)
    for features, names in ((x_train, train_labels), (x_test, test_labels))
)

## Define Model

In [23]:
# Initialise the BigDL engine before any BigDL layer or optimizer is created below.
# NOTE(review): init_engine is not defined in this notebook; presumably it comes
# from one of the star-imported BigDL modules - confirm.
init_engine()

In [24]:
# Training hyper-parameters, consumed by the Optimizer configured in a later cell.
# Step size handed to SGD(learningrate=...).
learning_rate = 0.2
# Training ends after this many epochs (MaxEpoch end trigger).
training_epochs = 40
# Mini-batch size, used for both training and the per-epoch validation.
batch_size = 60

# Network Parameters
n_input = 1024 # input width of the Linear layer; presumably the bottleneck vector length - TODO confirm
n_classes = len(label_nums) # one output per item_name category in label_nums

def fc_layer(n_input, n_classes):
    """Return a Sequential model made of one Linear layer followed by LogSoftMax.

    Args:
        n_input: input size passed to the Linear layer.
        n_classes: output size passed to the Linear layer.

    Returns:
        The assembled Sequential container.
    """
    net = Sequential()
    # The layers are constructed after the container and appended in order.
    for layer in (Linear(n_input, n_classes), LogSoftMax()):
        net.add(layer)
    return net

# Build the classifier: a Linear layer from n_input features to n_classes
# outputs, followed by LogSoftMax (see fc_layer).
model = fc_layer(n_input, n_classes)

creating: createSequential
creating: createLinear
creating: createLogSoftMax


In [25]:
# Configure training: plain SGD with the learning rate set above, stopping
# after `training_epochs` epochs. ClassNLLCriterion is paired with the model's
# final LogSoftMax layer.
optimizer = Optimizer(
    model=model,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(),
    optim_method=SGD(learningrate=learning_rate),
    end_trigger=MaxEpoch(training_epochs),
    batch_size=batch_size)
# Validate with Top-1 accuracy on every EveryEpoch trigger.
# NOTE(review): the validation set is test_rdd, the same RDD that is used for
# the final accuracy measurement later in the notebook.
optimizer.set_validation(
    batch_size=batch_size,
    val_rdd=test_rdd,
    trigger=EveryEpoch(),
    val_method=[Top1Accuracy()]
)

# Training and validation summaries are both written under /tmp/bigdl_summaries
# with this app name. The timestamp suffix is commented out, so every run uses
# the same name.
app_name= 'vegnonveg' # + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir='/tmp/bigdl_summaries',
                                     app_name=app_name)
# Presumably records the "Parameters" summary once every 50 iterations - confirm
# against the BigDL documentation.
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir='/tmp/bigdl_summaries',
                                        app_name=app_name)
# Attach both summaries to the optimizer.
optimizer.set_train_summary(train_summary)
optimizer.set_val_summary(val_summary)
# Python 2 print statement: the comma inserts a second space before app_name.
print "saving logs to ",app_name

creating: createClassNLLCriterion
creating: createDefault
creating: createSGD
creating: createMaxEpoch
creating: createOptimizer
creating: createEveryEpoch
creating: createTop1Accuracy
creating: createTrainSummary
creating: createSeveralIteration
creating: createValidationSummary
saving logs to  vegnonveg


In [26]:
# Run the training loop configured on `optimizer` and keep the model it returns.
trained_model = optimizer.optimize()
print("Optimization Done.")

Optimization Done.


In [27]:
def map_predict_label(l):
    """Return the 0-based position of the largest value in the prediction `l`."""
    scores = np.asarray(l)
    return np.argmax(scores)
def map_groundtruth_label(l):
    """Return the 0-based class id stored in the first element of label `l`.

    Labels are shifted by +1 when the Samples are built (see get_rdd_sample);
    this undoes that shift.
    """
    one_based_id = l[0]
    return one_based_id - 1
def map_to_label(l):
    """Return the label name whose id in `label_nums` equals `l`.

    Args:
        l: 0-based class id. It is compared with ==, so a float such as 2.0
            matches the integer id 2.

    Returns:
        The first label name in `label_nums` mapped to that id.

    Raises:
        ValueError: if no label has that id.
    """
    # Scan items() rather than indexing keys()/values(): on Python 3 those are
    # views that support neither subscripting nor .index().
    for name, class_id in label_nums.items():
        if class_id == l:
            return name
    raise ValueError('%r is not a known label id' % (l,))

In [28]:
# Look at some predictions and their accuracy
predictions = trained_model.predict(test_rdd)

num_preds = 8
truth = test_rdd.take(num_preds)
preds = predictions.take(num_preds)

# zip() stops at the shorter list, so this also works when take() returns fewer
# than num_preds rows (indexing by range(num_preds) would raise IndexError).
for idx, (sample, pred) in enumerate(zip(truth, preds)):
    # Stored labels are 1-based; predictions are scored per 0-based class index.
    true_label = str(map_to_label(map_groundtruth_label(sample.label)))
    pred_label = str(map_to_label(map_predict_label(pred)))
    # Single-argument print(...) with explicit formatting yields the same text
    # on Python 2 and Python 3, including the double space after the colon.
    print("%d ) Ground Truth label:  %s" % (idx + 1, true_label))
    print("%d ) Predicted label:  %s" % (idx + 1, pred_label))
    print("correct" if true_label == pred_label else "wrong")

1 ) Ground Truth label:  Fresh potatoes, brown
1 ) Predicted label:  Fresh onions
wrong
2 ) Ground Truth label:  Fresh apples, typical local variety
2 ) Predicted label:  Fresh apples, typical local variety
correct
3 ) Ground Truth label:  Fresh cucumber
3 ) Predicted label:  Fresh oranges
wrong
4 ) Ground Truth label:  Fresh cucumber
4 ) Predicted label:  Fresh cucumber
correct
5 ) Ground Truth label:  Fresh apples, typical local variety
5 ) Predicted label:  Fresh apples, typical local variety
correct
6 ) Ground Truth label:  Fresh cucumber
6 ) Predicted label:  Fresh cucumber
correct
7 ) Ground Truth label:  Fresh carrots
7 ) Predicted label:  Fresh carrots
correct
8 ) Ground Truth label:  Fresh potatoes, brown
8 ) Predicted label:  Fresh potatoes, brown
correct


In [29]:
# Measure Test Accuracy w/Test Set
# The second argument is len(x_test); presumably the batch size, which would
# score the whole test set as one batch - confirm against the BigDL API.
results = trained_model.test(
    test_rdd,
    len(x_test),
    [Top1Accuracy()],
)
print(results[0])

creating: createTop1Accuracy
Test result: 0.712435245514, total_num: 386, method: Top1Accuracy


## Classifier #2: Neural Net

In [30]:
# Multi-layer perceptron with a single hidden layer of 512 units.
# random_state is fixed (same seed as the train/test split) so the
# cross-validation scores below are reproducible from run to run.
clf = MLPClassifier(hidden_layer_sizes=(512,), random_state=101)

In [31]:
%%time 
cross_val_score(clf, x_train, train_labels, cv=StratifiedKFold(n_splits=3), scoring='accuracy')

CPU times: user 1min 8s, sys: 12.9 s, total: 1min 21s
Wall time: 1min 13s


array([ 0.70599613,  0.68677043,  0.68235294])

## Classifier #3: Logistic Regression

In [32]:
%%time
cross_val_score(LogisticRegression(), x_train, train_labels, cv=StratifiedKFold(n_splits=3), scoring='accuracy')

CPU times: user 10.4 s, sys: 137 ms, total: 10.6 s
Wall time: 16.4 s


array([ 0.67117988,  0.6614786 ,  0.66666667])