In [1]:
import DecisionTree_entropy
import csv
from pyspark.sql.types import StringType
import random
from math import log, sqrt
import logging
from sklearn import cross_validation
from concurrent.futures import ProcessPoolExecutor



In [2]:
# Register the local DecisionTree_entropy.py file with the Spark context.
# NOTE(review): `sc` is not created in any visible cell; presumably the PySpark
# kernel/shell provides it -- confirm before running on a fresh kernel.
sc.addPyFile("DecisionTree_entropy.py")

In [3]:
def csv_to_rdd(filepath):
    """Load a comma-separated file into an RDD of typed, whitespace-trimmed rows.

    Each record becomes a 14-element tuple built from the first 14 CSV fields.
    Fields 0, 3, 9, 10 and 11 are converted to ``int``; every other field is
    kept as a string with surrounding whitespace removed.

    The stripping is applied per field. Testing ``isinstance(x, str)`` on the
    whole row never matches, because the row is a tuple, and that leaves
    values such as ``' <=50K'`` with their leading space.

    Args:
        filepath: Path handed to ``sc.textFile``.

    Returns:
        An RDD of 14-element tuples.

    Raises (when the RDD is evaluated):
        IndexError: a non-empty row has fewer than 14 fields.
        ValueError: an integer column does not hold an integer.
    """
    int_columns = (0, 3, 9, 10, 11)
    column_count = 14

    def parse_row(row):
        # Index explicitly so that a short row still fails loudly.
        return tuple(
            int(row[i]) if i in int_columns else row[i].strip()
            for i in range(column_count)
        )

    # csv.reader consumes the partition's iterator of text lines.
    parsed = sc.textFile(filepath).mapPartitions(
        lambda lines: csv.reader(lines, delimiter=','))
    # A blank input line is parsed as an empty list; skip it instead of crashing.
    return parsed.filter(lambda row: len(row) > 0).map(parse_row)

In [4]:
def test_tree():
    """Smoke test: fit the library classifier on the income data and print one prediction.

    Collects every row of ``../data/income.csv`` to the driver, fits a
    ``DecisionTreeClassifier`` created with ``random_features=True``, and
    prints the prediction for a single hand-written sample of 13 feature values.
    """
    training_rows = csv_to_rdd("../data/income.csv").collect()

    classifier = DecisionTree_entropy.DecisionTreeClassifier(random_features=True)
    classifier.fit(training_rows)

    # One sample without the trailing label column.
    sample = [
        39, 'State-gov', 'Bachelors', 13, 'Never-married',
        'Adm-clerical', 'Not-in-family', 'White', 'Male',
        2174, 0, 40, 'United-States',
    ]
    print(classifier.predict(sample))


# Run the smoke test only when this code executes as the main module.
if __name__ == '__main__':
    test_tree()

 <=50K


In [36]:
def dataset_tree(partition):
    """Materialize a partition and return an iterator over its rows as lists.

    Args:
        partition: Iterable of row sequences.

    Returns:
        An iterator over a fully built list in which every row has been
        copied into a new ``list``.
    """
    materialized = [list(record) for record in partition]
    return iter(materialized)

In [37]:
# Load the income data and redistribute it across 60 partitions.
data = csv_to_rdd("../data/income.csv").repartition(60)

In [38]:
# Display the partition count of `data` as the cell output.
data.getNumPartitions()

60

In [39]:
def _pair_partition_with_index(index, part):
    """Yield a single ``(partition index, row iterator)`` pair for one partition."""
    yield index, dataset_tree(part)


# One record per partition: the key is the partition index and the value is an
# iterator over that partition's rows.
rdd_trees = data.mapPartitionsWithIndex(_pair_partition_with_index)

In [40]:
# Display the first (partition index, row iterator) record as the cell output.
rdd_trees.first()

(0, <list_iterator at 0x1a0e4b0a90>)

In [63]:
class DecisionNode:
    """Plain container for the five values describing one decision-tree node.

    Attributes (meanings are presumed from the names; this class is not used
    by the visible code, so confirm against the tree implementation):
        col: Column index tested at this node; defaults to -1.
        value: Value the column is compared with.
        results: Outcome stored at the node.
        tb: Child node followed when the test holds.
        fb: Child node followed when the test fails.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split description.
        self.col, self.value = col, value
        # Stored outcome.
        self.results = results
        # Child links.
        self.tb, self.fb = tb, fb

In [43]:
def choose_random_features(row):
    """Pick a random subset of feature positions for a row.

    The last element of ``row`` is excluded from the candidates. The number of
    positions drawn is the square root of the candidate count, truncated to an
    integer.

    Args:
        row: A sequence whose last element is not a candidate feature.

    Returns:
        A list of distinct indexes drawn without replacement.
    """
    candidates = range(len(row) - 1)
    how_many = int(sqrt(len(candidates)))
    return random.sample(candidates, how_many)

In [44]:
def get_features_subset(row, features_indexes):
    """Return the values of ``row`` at the given positions, in the given order.

    Args:
        row: Indexable sequence of values.
        features_indexes: Iterable of positions to read from ``row``.

    Returns:
        A new list holding ``row[i]`` for each ``i`` in ``features_indexes``.
    """
    selected = []
    for position in features_indexes:
        selected.append(row[position])
    return selected

In [45]:
def unique_counts(rows):
    """Count how many rows end with each distinct last-column value.

    Args:
        rows: Iterable of non-empty sequences.

    Returns:
        A dict mapping each last-element value to its number of occurrences,
        with keys in first-seen order.
    """
    tally = {}
    for record in rows:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally

In [46]:
def entropy(rows):
    """Compute the base-2 entropy of the last-column values of ``rows``.

    For every distinct value reported by ``unique_counts`` the proportion
    ``p = count / len(rows)`` is taken, and ``p * log2(p)`` is subtracted from
    a total that starts at 0.0.

    Args:
        rows: Sized collection of rows.

    Returns:
        The entropy as a float; 0.0 when ``rows`` is empty.
    """
    counts = unique_counts(rows)
    total = len(rows)
    ln_two = log(2)

    ent = 0.0
    for count in counts.values():
        p = float(count) / total
        ent -= p * (log(p) / ln_two)
    return ent

In [47]:
def divide_set(rows, column, value):
    """Split ``rows`` in two according to a test on one column.

    When ``value`` is an ``int`` or ``float`` a row passes if
    ``row[column] >= value``; otherwise it passes if ``row[column] == value``.

    Args:
        rows: Collection of indexable rows; it is iterated twice.
        column: Index of the column to test.
        value: Threshold (numeric) or exact value (anything else).

    Returns:
        A tuple ``(passing_rows, failing_rows)`` of two lists, each in the
        original row order.
    """
    if isinstance(value, (int, float)):
        def passes(row):
            return row[column] >= value
    else:
        def passes(row):
            return row[column] == value

    matching = [candidate for candidate in rows if passes(candidate)]
    remaining = [candidate for candidate in rows if not passes(candidate)]
    return matching, remaining

In [75]:
def build_tree(rows,depth):
    """Search for the best entropy-reducing split and report a status code.

    This variant does not return a tree. It evaluates every (column, value)
    split, recurses into both halves of the best one, discards the results of
    those recursive calls, and returns an integer describing what happened at
    this level.

    Args:
        rows: List of rows whose last element is the label.
        depth: Remaining depth; recursion stops when it reaches exactly 0, so
            a negative value never triggers the depth limit.

    Returns:
        0 if ``rows`` is empty, 1 if ``depth`` is 0, 2 if a split with
        positive information gain was found, 3 otherwise.
    """
    if len(rows) == 0:
        return 0
    if depth == 0:
        return 1

    total = len(rows)
    base_entropy = entropy(rows)
    best_gain = 0.0
    best_sets = None

    for col in range(len(rows[0]) - 1):
        # dict.fromkeys keeps the distinct values in first-seen order.
        for value in dict.fromkeys(row[col] for row in rows):
            matching, remaining = divide_set(rows, col, value)

            p = float(len(matching)) / total
            gain = base_entropy - p * entropy(matching) - (1 - p) * entropy(remaining)
            # Only accept splits that leave rows on both sides.
            if len(matching) > 0 and len(remaining) > 0 and gain > best_gain:
                best_gain = gain
                best_sets = (matching, remaining)

    if best_gain > 0:
        # Recurse into the passing half first, then the failing half; the
        # returned codes are intentionally not used.
        for subset in best_sets:
            build_tree(subset, depth - 1)
        return 2
    return 3

In [76]:
def teste_fit(rows):
    if len(rows) < 1:
        raise ValueError("Nao ha amostras suficientes no dataset de entrada.")

    features_indexes = choose_random_features(rows[0])
    rows = [get_features_subset(row, features_indexes) + [row[-1]] for row in rows]

    return build_tree(rows,-1)

In [77]:
# Classifier from the library module; only the library-based variant noted at
# the end of this cell uses it.
tree = DecisionTree_entropy.DecisionTreeClassifier(max_depth=-1)

# rdd_trees holds exactly one record per partition index, so there is nothing
# to merge per key. A reduceByKey here would only shuffle the data, and its
# combiner would fail if it ever ran, because the values are iterators and do
# not support `+`. Materialize each partition's rows as a list instead.
trees = rdd_trees.mapValues(list)

# Run the diagnostic fit on every partition and collect
# (partition index, status code) pairs on the driver.
trees.map(lambda x: (x[0], teste_fit(x[1]))).collect()
# Library-based variant: trees.map(lambda x: (x[0], tree.fit(x[1]))).collect()

[(0, 2),
 (1, 2),
 (2, 2),
 (3, 2),
 (4, 2),
 (5, 2),
 (6, 2),
 (7, 2),
 (8, 2),
 (9, 2),
 (10, 2),
 (11, 2),
 (12, 2),
 (13, 2),
 (14, 2),
 (15, 2),
 (16, 2),
 (17, 2),
 (18, 2),
 (19, 2),
 (20, 2),
 (21, 2),
 (22, 2),
 (23, 2),
 (24, 2),
 (25, 2),
 (26, 2),
 (27, 2),
 (28, 2),
 (29, 2),
 (30, 2),
 (31, 2),
 (32, 2),
 (33, 2),
 (34, 2),
 (35, 2),
 (36, 2),
 (37, 2),
 (38, 2),
 (39, 2),
 (40, 2),
 (41, 2),
 (42, 2),
 (43, 2),
 (44, 2),
 (45, 2),
 (46, 2),
 (47, 2),
 (48, 2),
 (49, 2),
 (50, 2),
 (51, 2),
 (52, 2),
 (53, 2),
 (54, 2),
 (55, 2),
 (56, 2),
 (57, 2),
 (58, 2),
 (59, 2)]