In [1]:
%pylab inline
import pandas as pd
import numpy as np
import datetime as dt

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist

# Start the BigDL engine once, up front, before any of the cells below
# create layers, criteria or optimizers.
init_engine()

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Generate sorted labels from csv.
# Rows are ordered by obs_uid; presumably this is the order in which the
# bottleneck vectors were produced, so labels and features line up by
# position -- TODO confirm against the bottleneck extraction step.
df = pd.read_csv("~/Downloads/vegnonveg-samples_labels.csv")
df = df.sort_values(by='obs_uid', axis = 0)

# Labels are stored here: every distinct bh_name gets a 0-based integer id.
# The id range is derived from the number of categories actually present, so
# an unexpected extra category can no longer be dropped silently by zip()
# (which would have produced NaN labels further down).
categ = df.bh_name.unique()
nums = range(len(categ))
label_dict = dict(zip(categ, nums))

print(label_dict)

# First 2176 rows are the training split, the remainder is the test split.
# The same cut-off is applied to the bottleneck array, so keep both in sync.
# `.values` replaces the deprecated `Series.get_values()`.
train_labels = df['bh_name'][0:2176].map(label_dict).values
test_labels = df['bh_name'][2176:].map(label_dict).values
print(train_labels.shape)
print(test_labels.shape)

{'Poultry': 5, 'Pork': 4, 'Fresh, chilled or frozen fish and seafood': 3, 'Fresh or chilled potatoes': 2, 'Lamb, mutton and goat': 8, 'Egg and egg-based products': 7, 'Fresh or chilled fruit': 1, 'Fresh or chilled vegetables other than potatoes': 0, 'Beef and veal': 6}
(2176,)
(435,)


In [3]:
import pickle
import numpy as np

# List of bottleneck feature vectors, one numpy array per image; each vector
# has 1024 entries (see the shapes printed by the next cell).
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open("pickle_bottleneck.dat", "rb") as bottleneck_file:
    bottlenecks = pickle.load(bottleneck_file)

In [4]:
# Stack the per-image bottleneck vectors into a single array and cut it at
# row 2176: everything before the cut is training data, the rest is test data.
bottlenecks_adj = np.array(bottlenecks)
train_images, test_images = bottlenecks_adj[:2176], bottlenecks_adj[2176:]

# Show the resulting (rows, features) shape of each split.
for split in (train_images, test_images):
    print(split.shape)

(2176, 1024)
(435, 1024)


In [5]:
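# Disabled variant of get_rdd, kept for reference only: it standardises the
# features with the training-set mean and standard deviation before wrapping them.
# def get_rdd(train_images, test_images, train_labels, test_labels):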
#     training_mean = np.mean(train_images)
#     training_std = np.std(train_images)
#     rdd_train_images = sc.parallelize(train_images)
#     rdd_train_labels = sc.parallelize(train_labels)
#     rdd_test_images = sc.parallelize(test_images)
#     rdd_test_labels = sc.parallelize(test_labels)

#     rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(lambda (features, label):Sample.from_ndarray(
#                                         (features - training_mean) / training_std,
#                                         label + 1))
#     rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(lambda (features, label):Sample.from_ndarray(
#                                         (features - training_mean) / training_std,
#                                         label + 1))
#     return (rdd_train_sample, rdd_test_sample)

In [6]:
def get_rdd(train_images, test_images, train_labels, test_labels):
    """Turn feature/label sequences into RDDs of BigDL ``Sample`` objects.

    Each feature sequence is paired position-by-position with its label
    sequence. Labels are shifted by +1 before being stored in the Sample
    (``map_groundtruth_label`` undoes this shift when reading them back);
    presumably this is because the BigDL criterion expects 1-based class
    indices -- TODO confirm.

    Relies on the global SparkContext ``sc``.

    Parameters:
        train_images: sequence of training feature arrays.
        test_images: sequence of test feature arrays.
        train_labels: sequence of 0-based training labels.
        test_labels: sequence of 0-based test labels.

    Returns:
        Tuple ``(rdd_train_sample, rdd_test_sample)``.
    """
    def to_sample_rdd(images, labels):
        # One helper for both splits so train and test can never drift apart.
        paired = sc.parallelize(images).zip(sc.parallelize(labels))
        # Index into the (features, label) pair instead of using a
        # tuple-parameter lambda, which is a syntax error on Python 3.
        return paired.map(
            lambda pair: Sample.from_ndarray(pair[0], pair[1] + 1))

    rdd_train_sample = to_sample_rdd(train_images, train_labels)
    rdd_test_sample = to_sample_rdd(test_images, test_labels)
    return (rdd_train_sample, rdd_test_sample)

In [7]:
# Wrap both splits as RDDs of BigDL samples.
train_data, test_data = get_rdd(
    train_images, test_images, train_labels, test_labels)

In [8]:
# Sanity check: number of samples held by each RDD (train first, then test).
for sample_rdd in (train_data, test_data):
    print(sample_rdd.count())

2176
435


In [9]:
# --- Optimisation settings ---
batch_size = 60        # samples per mini-batch (training and validation)
training_epochs = 40   # stop training after this many passes over the data
learning_rate = 0.2    # step size handed to SGD

# --- Model dimensions ---
n_classes = 9          # number of distinct bh_name categories
n_input = 1024         # length of one bottleneck feature vector

In [10]:
def fc_layer(n_input, n_classes):
    """Assemble a one-layer classifier: Linear followed by LogSoftMax.

    n_input   -- first argument given to Linear (input size)
    n_classes -- second argument given to Linear (output size)

    Returns the Sequential container holding both layers.
    """
    net = Sequential()
    linear = Linear(n_input, n_classes)
    net.add(linear)
    log_probs = LogSoftMax()
    net.add(log_probs)
    return net

# Instantiate the classifier with the dimensions configured earlier.
model = fc_layer(n_input=n_input, n_classes=n_classes)

creating: createSequential
creating: createLinear
creating: createLogSoftMax


In [11]:
optimizer = Optimizer(
    model=model,
    training_rdd=train_data,
    criterion=ClassNLLCriterion(),
    optim_method=SGD(learningrate=learning_rate),
    end_trigger=MaxEpoch(training_epochs),
    batch_size=batch_size)
# Set the validation logic
optimizer.set_validation(
    batch_size=batch_size,
    val_rdd=test_data,
    trigger=EveryEpoch(),
    val_method=[Top1Accuracy()]
)

app_name= 'linear-' # + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir='/tmp/bigdl_summaries',
                                     app_name=app_name)
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir='/tmp/bigdl_summaries',
                                        app_name=app_name)
optimizer.set_train_summary(train_summary)
optimizer.set_val_summary(val_summary)
print "saving logs to ",app_name

creating: createClassNLLCriterion
creating: createDefault
creating: createSGD
creating: createMaxEpoch
creating: createOptimizer
creating: createEveryEpoch
creating: createTop1Accuracy
creating: createTrainSummary
creating: createSeveralIteration
creating: createValidationSummary
saving logs to  linear-


In [12]:
# Run training to completion; the returned model is what later cells use for
# prediction and evaluation.
trained_model = optimizer.optimize()
print("Optimization Done.")

Optimization Done.


In [27]:
def map_predict_label(l):
    """Return the position of the largest entry in the score vector ``l``."""
    scores = np.array(l)
    return scores.argmax()
def map_groundtruth_label(l):
    """Return the first element of ``l`` minus one.

    This reverses the ``label + 1`` shift applied when the samples were built.
    """
    stored_label = l[0]
    return stored_label - 1
def map_to_label(l):
    """Translate a 0-based class id back into its category name.

    Looks ``l`` up among the values of the global ``label_dict`` and returns
    the key of the first match.

    Raises:
        ValueError: if no category carries the id ``l`` (same exception type
            the previous list-based lookup raised).
    """
    # Iterate over the items instead of indexing keys()/values(): those are
    # only lists on Python 2, while this form works on Python 2 and 3.
    for name, class_id in label_dict.items():
        if class_id == l:
            return name
    raise ValueError("%r is not in list" % (l,))

In [35]:
# Score the test set, then show the category names of the first 8 samples:
# ground truth first, model output second.
predictions = trained_model.predict(test_data)

print('Ground Truth labels:')
truth_names = (str(map_to_label(map_groundtruth_label(sample.label)))
               for sample in test_data.take(8))
print('\n '.join(truth_names))
print("\n")
print('Predicted labels:')
predicted_names = (str(map_to_label(map_predict_label(scores)))
                   for scores in predictions.take(8))
print('\n '.join(predicted_names))

Ground Truth labels:
Fresh or chilled potatoes
 Fresh or chilled potatoes
 Fresh or chilled fruit
 Fresh or chilled fruit
 Fresh, chilled or frozen fish and seafood
 Fresh or chilled fruit
 Fresh or chilled potatoes
 Egg and egg-based products


Predicted labels:
Fresh or chilled potatoes
 Fresh or chilled potatoes
 Fresh or chilled fruit
 Fresh or chilled vegetables other than potatoes
 Fresh, chilled or frozen fish and seafood
 Fresh or chilled fruit
 Fresh or chilled fruit
 Fresh or chilled potatoes


In [36]:
# Measure Test Accuracy: evaluate top-1 accuracy on the test RDD using a
# batch size of 200 and print the first (only) result.
results = trained_model.test(test_data, 200, [Top1Accuracy()])
print(results[0])

creating: createTop1Accuracy
Test result: 0.698850572109, total_num: 435, method: Top1Accuracy
