In [None]:
from numpy import array
import numpy as np
import random
import json
import time

In [None]:
#load dataset
dataset = np.load('dataset.npy')

In [None]:
#front-end json inputs
inputs = {
    "tt_split": "0.5",
    "calories": True,
    "has_carb": True,
    "has_cheese": True,
    "has_marinara_sauce": True,
    "has_meat": True,
    "is_round": True,
    "cook_method": True,
    "food_origin": True,
}

In [None]:
front_end = json.dumps(inputs)

In [None]:
payload = json.loads(front_end)

In [6]:
def selection(inputs):
    S = []
    if inputs['calories'] == True:
        S.append(0)
    if inputs['has_carb'] == True:
        S.append(1)
    if inputs['has_cheese'] == True:
        S.append(2)
    if inputs['has_marinara_sauce'] == True:
        S.append(3)
    if inputs['has_meat'] == True:
        S.append(4)
    if inputs['is_round'] == True:
        S.append(5)
    if inputs['cook_method'] == True:
        S = S + [6,7,8,9,10,11,12,13]
    if inputs['food_origin'] == True:
        S = S + [14,15,16,17,18,19]
    return S

def unselectColumn(L, L1):
    for l in L:
        yield [x for i, x in enumerate(l) if i not in L1]

In [7]:
F = range(20)
S = selection(payload)
L1 = [x for x in F if x not in S]
subset = [e for e in unselectColumn(dataset, L1)]

In [8]:
cols = [u'calories',
 u'has_carb',
 u'has_cheese',
 u'has_marinara_sauce',
 u'has_meat',
 u'is_round',
 u'Bake in oven',
 u'Cook in a skillet',
 u'Cook in pan',
 u'Deep fried',
 u'Made by hand',
 u'Microwaved',
 u'Saute in a skillet',
 u'Wrapped by hand',
 u'America',
 u'China',
 u'Italy',
 u'Japan',
 u'Mexico',
 u'Taiwan',
 u'is_pizza']

In [9]:
mapping = {}
for i, c in enumerate(cols):
    mapping[i] = c

In [10]:
feat_map = {}
for s in range(len(S)):
    feat_map['X'+str(s+1)] = mapping[S[s]]

In [11]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def train_test_split(tt_split, dataset):
    ds = len(dataset)
    train_set = dataset[:int(ds*tt_split)]
    test_set = dataset[int(ds*tt_split):]
    return train_set, test_set

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(algorithm, train_set, test_set, *args):
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in test_set]
    accuracy = accuracy_metric(actual, predicted)
    return accuracy
 
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
 
# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):
    gini = 0.0
    for class_value in class_values:
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini
 
# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}
 
# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)
 
# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
 
# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
 
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    return(predictions)

def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[%s < %.3f]' % ((depth*'   ', feat_map['X'+str((node['index']+1))], node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*'   '+' ', node)))

def check_key(td, n):
    if n not in td:
        return n
    else:
        return check_key(td, n+1)

def save_tree(node, td, n, feat_map, depth=0):
    if isinstance(node, dict):
        n = check_key(td, n)
        td[n] = '%s[%s < %s]' % ((depth*'---', feat_map['X'+str((node['index']+1))], node['value']))
        save_tree(node['left'], td, n, feat_map, depth+1)
        save_tree(node['right'], td, n, feat_map, depth+1)
    else:
        n = check_key(td, n)
        td[n] = '%s[%s]' % ((depth*'---', node))
    return td

def get_tree(train_set, test_set, feat_map, max_depth, min_size):
    td = {}
    n=0
    tree = build_tree(train_set, max_depth, min_size)
    outputs = save_tree(tree, td, n, feat_map)    
    return outputs

def train(split, dataset, feat_map):
    tt_split = split
    max_depth = 3
    min_size = 2
    td = {}
    n=0
    train_set, test_set = train_test_split(split, dataset)
    train_scores = evaluate_algorithm(dataset, decision_tree, train_set, train_set, max_depth, min_size)
    test_scores = evaluate_algorithm(dataset, decision_tree, train_set, test_set, max_depth, min_size)
    tree = build_tree(train_set, max_depth, min_size)
    outputs = save_tree(tree, td, n, feat_map)    
    return str(train_scores), str(test_scores), outputs

In [12]:
tt_split = float(payload['tt_split'])
max_depth = 3
min_size = 2

In [13]:
train_set, test_set = train_test_split(tt_split, subset)

In [14]:
outputs = get_tree(train_set, test_set, feat_map, max_depth, min_size)
outputs

{0: u'[has_meat < 1]',
 1: u'---[calories < 2998]',
 2: u'------[calories < 2967]',
 3: '---------[1]',
 4: '---------[1]',
 5: u'------[calories < 2998]',
 6: '---------[1]',
 7: '---------[1]',
 8: u'---[is_round < 1]',
 9: u'------[calories < 101]',
 10: '---------[0]',
 11: '---------[0]',
 12: u'------[calories < 3170]',
 13: '---------[1]',
 14: '---------[1]'}

In [15]:
train_scores = evaluate_algorithm(decision_tree, train_set, train_set, max_depth, min_size)
train_scores

90.8

In [16]:
test_scores = evaluate_algorithm(decision_tree, train_set, test_set, max_depth, min_size)
test_scores

96.39999999999999