In [1]:
import numpy as np
import pandas as pd

In [2]:
class decision_node:
    def __init__(self,col_name=None,value=None,results=None,true_branch=None,false_branch=None):
        self.col_name=col_name # column name of criteria being tested
        self.value=value # vlaue necessary to get a true result
        self.results=results # dict of results for a branch, None for everything except endpoints
        self.true_branch = true_branch # true decision nodes 
        self.false_branch = false_branch # false decision nodes
        
    def classify_row(self,row):
        if self.results != None:
            return self.results.keys()[0]
        else:
            decision_col = self.col_name
            decision_value = self.value
            if isinstance(decision_value,np.int64):
                if row[decision_col] > decision_value: 
                    return self.true_branch.classify_row(row)
                else: 
                    return self.false_branch.classify_row(row)
            else:
                if row[decision_col] == decision_value:
                    return self.true_branch.classify_row(row)
                else: 
                    return self.false_branch.classify_row(row)
    
    def classify(self,df):
        predicted_list = []
        for index, row in df.iterrows(): 
            predicted_result = self.classify_row(row)
            predicted_list.append(predicted_result)
        new_df = df.copy()
        new_df['Prediction'] = predicted_list
        return new_df
    
    def print_tree(self,indent=''):
        # Is this a leaf node?
        if self.results!=None:
            print str(self.results.keys()[0])
        else:
            # Print the criteria
            print 'Variable ' + str(self.col_name)+' : >='+str(self.value)+' ? ' if isinstance(self.value, np.int64) \
                else 'Variable ' + str(self.col_name)+' : is '+str(self.value)+' ? '
            # Print the branches
            print indent+'True->',
            self.true_branch.print_tree(indent+'  ')
            print indent+'False->',
            self.false_branch.print_tree(indent+'  ')

In [3]:
# Divides a set on a specific column. Can handle numeric or nominal values

def divide_set(df,column,value): # rows is a df, column is string (col name)
    # Make a function that tells us if a row is in the first group 
    # (true) or the second group (false)
    split_function=None
    # for numerical values
    if isinstance(value,np.int64) or isinstance(value,np.float64):
        split_function=lambda x : x >= value
    # for nominal values
    else:
        split_function=lambda x : x == value
   
   # Divide the rows into two sets and return them
    df_true = df.loc[df[column].apply(split_function)]
    df_false = df.loc[~df[column].apply(split_function)]
    #set1=[row for row in rows if split_function(row)] # if split_function(row) 
    #set2=[row for row in rows if not split_function(row)]
    return df_true, df_false

In [4]:
from collections import defaultdict
def unique_counts(df):
    results = defaultdict(lambda: 0)
    for idnex, row in df.iterrows():
        r = row[df.columns[-1]]
        results[r]+=1
    return dict(results) 

In [5]:
# Entropy is the sum of p(x)log(p(x)) across all the different possible results
def entropy(df):
    from math import log
    log2 = lambda x:log(x)/log(2)  
    results = unique_counts(df)
    # Now calculate the entropy
    ent=0.0
    for r in results.keys():
        # current probability of class
        p=float(results[r])/df.shape[0]
        ent=ent-p*log2(p)
    return ent

In [6]:
def build_tree(df, score_function=entropy):
    if df.shape[0] == 0: return decision_node()
    current_score = score_function(df)

    best_gain = 0.0
    best_criteria = None
    best_sets = None

    for col_name in df.columns[:-1]: #last col is result
        # find different values in this column
        col_values = set(df[col_name])

        # for each possible value, try to divide on that value
        for value in col_values:
            df_true, df_false = divide_set(df, col_name, value)

            # Information gain
            p = float(df_true.shape[0])/df.shape[0]
            gain = current_score - p*score_function(df_true) - (1-p)*score_function(df_false)
            if gain > best_gain and df_true.shape[0] > 0 and df_false.shape[0] > 0:
                best_gain = gain
                best_criteria = (col_name, value)
                best_sets = (df_true, df_false)

    if best_gain > 0:
        trueBranch = build_tree(best_sets[0])
        falseBranch = build_tree(best_sets[1])
        return decision_node(col_name=best_criteria[0], value=best_criteria[1],
                true_branch=trueBranch, false_branch=falseBranch)
    else:
        return decision_node(results=unique_counts(df))

Create data frame

In [7]:
df = pd.DataFrame()

In [8]:
Type = ['courte','courte','longue','longue','courte','longue','longue','longue','courte','courte']

In [9]:
Prop = ['chimique','solaire','chimique','solaire','nucleaire','chimique','chimique','solaire','nucleaire','chimique']

In [10]:
Astro = [6,6,3,4,6,6,4,6,3,3]

In [11]:
MOI = ['freinage','freinage','aerocapt.','freinage','freinage','freinage', 'aerocapt.','freinage','aerocapt.','aerocapt.']

In [12]:
ERV = ['terrestre','terrestre','martienne','terrestre','terrestre','terrestre','terrestre','martienne','terrestre','martienne']

In [13]:
EDL = ['gonflable','gonflable','rigide','gonflable','gonflable','gonflable','gonflable','gonflable','rigide','gonflable']

In [14]:
Eval = ['TC','TC','A','A','TC','TC','A','TC','TC','TC']

In [15]:
df['Type'] = Type
df['Prop'] = Prop
df['Astro'] = Astro
df['MOI'] = MOI
df['ERV'] = ERV
df['EDL'] = EDL
df['Eval'] = Eval

In [27]:
df

Unnamed: 0,Type,Prop,Astro,MOI,ERV,EDL,Eval
0,courte,chimique,6,freinage,terrestre,gonflable,TC
1,courte,solaire,6,freinage,terrestre,gonflable,TC
2,longue,chimique,3,aerocapt.,martienne,rigide,A
3,longue,solaire,4,freinage,terrestre,gonflable,A
4,courte,nucleaire,6,freinage,terrestre,gonflable,TC
5,longue,chimique,6,freinage,terrestre,gonflable,TC
6,longue,chimique,4,aerocapt.,terrestre,gonflable,A
7,longue,solaire,6,freinage,martienne,gonflable,TC
8,courte,nucleaire,3,aerocapt.,terrestre,rigide,TC
9,courte,chimique,3,aerocapt.,martienne,gonflable,TC


In [17]:
training_set = df.loc[[0,2,4,5,6,8],:].reset_index(drop=True)
testing_set = df.loc[[1,3,7,9],:].reset_index(drop=True)

In [18]:
training_set

Unnamed: 0,Type,Prop,Astro,MOI,ERV,EDL,Eval
0,courte,chimique,6,freinage,terrestre,gonflable,TC
1,longue,chimique,3,aerocapt.,martienne,rigide,A
2,courte,nucleaire,6,freinage,terrestre,gonflable,TC
3,longue,chimique,6,freinage,terrestre,gonflable,TC
4,longue,chimique,4,aerocapt.,terrestre,gonflable,A
5,courte,nucleaire,3,aerocapt.,terrestre,rigide,TC


In [19]:
testing_set

Unnamed: 0,Type,Prop,Astro,MOI,ERV,EDL,Eval
0,courte,solaire,6,freinage,terrestre,gonflable,TC
1,longue,solaire,4,freinage,terrestre,gonflable,A
2,longue,solaire,6,freinage,martienne,gonflable,TC
3,courte,chimique,3,aerocapt.,martienne,gonflable,TC


In [25]:
tree = build_tree(df)

In [26]:
tree.print_tree()

Variable Type : is courte ? 
True-> TC
False-> Variable Astro : >=6 ? 
  True-> TC
  False-> A


In [28]:
tree.classify(df)

Unnamed: 0,Type,Prop,Astro,MOI,ERV,EDL,Eval,Prediction
0,courte,chimique,6,freinage,terrestre,gonflable,TC,TC
1,courte,solaire,6,freinage,terrestre,gonflable,TC,TC
2,longue,chimique,3,aerocapt.,martienne,rigide,A,A
3,longue,solaire,4,freinage,terrestre,gonflable,A,A
4,courte,nucleaire,6,freinage,terrestre,gonflable,TC,TC
5,longue,chimique,6,freinage,terrestre,gonflable,TC,A
6,longue,chimique,4,aerocapt.,terrestre,gonflable,A,A
7,longue,solaire,6,freinage,martienne,gonflable,TC,A
8,courte,nucleaire,3,aerocapt.,terrestre,rigide,TC,TC
9,courte,chimique,3,aerocapt.,martienne,gonflable,TC,TC
