In [1]:
import numbers

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Load the dataset and split it into train / test partitions.
dataset_path = "../../Data/galaxymorphology/dataset1_sydney.csv"
sydney_dataset = pd.read_csv(dataset_path)

# Column names; Question.__repr__ looks features up here by column index.
header = sydney_dataset.columns

# Fixed seed so that Restart & Run All reproduces the same split, and
# therefore the same tree and the same classification report.
SPLIT_SEED = 42
sydney_train, sydney_test = train_test_split(
    sydney_dataset, test_size=0.2, random_state=SPLIT_SEED
)

# The tree code below indexes rows positionally, so work on plain arrays.
sydney_train = sydney_train.to_numpy()
sydney_test = sydney_test.to_numpy()

In [20]:
def unique_vals(rows, col):
    """Collect the distinct values found at index `col` across `rows`.

    Returns a set, so ordering and duplicates are discarded.
    """
    return {record[col] for record in rows}

In [21]:
def class_counts(rows):
    """Tally how many rows carry each label.

    The label is read from the last position of every row (the datasets
    used here keep the label in the final column, or are arranged that way
    before being passed in).

    Returns a dict mapping label -> number of occurrences, with keys in
    first-seen order.
    """
    tally = {}
    for record in rows:
        target = record[-1]
        tally[target] = tally.get(target, 0) + 1
    return tally

In [22]:
def is_numeric(value):
    """Return True when `value` is a real number, False otherwise.

    Built-in ints and floats (and bools, which are ints) are recognised as
    before. NumPy scalars such as np.int64 or np.float32 are recognised
    too: they register with the `numbers` ABCs but do not all subclass the
    built-in int / float, so an isinstance check against (int, float)
    would wrongly treat them as categorical values.
    """
    return isinstance(value, numbers.Real)

In [23]:
class Question:
    """A test on one feature, used to split a dataset in two.

    Attributes:
        column: index of the feature inside a row.
        value:  the threshold (numeric features) or the category
                (non-numeric features) the feature is compared against.

    `match` answers the question for a single example; `__repr__` renders
    the question as text using the module-level `header` for the feature
    name.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        """Return whether `example` satisfies this question.

        Numeric feature values are tested with `>=`, anything else with `==`.
        """
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        """Readable form of the question, e.g. for printing the tree."""
        operator = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (
            header[self.column], operator, str(self.value))


In [24]:
def partition(rows, question):
    """Split `rows` into two lists according to `question`.

    Rows for which `question.match(row)` is truthy go into the first list,
    all others into the second. Row order is preserved within each list.

    Returns:
        (true_rows, false_rows)
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [25]:
def gini(rows):
    """Gini impurity of a collection of rows.

    Starts from 1 and subtracts the squared relative frequency of every
    label present in `rows`. See:
    https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
    """
    n_rows = float(len(rows))
    remaining = 1
    for n_label in class_counts(rows).values():
        share = n_label / n_rows
        remaining -= share**2
    return remaining

In [26]:
def info_gain(left, right, current_uncertainty):
    """Gain obtained by splitting a node into `left` and `right`.

    Computed as the parent's uncertainty minus the Gini impurity of each
    child, weighted by the fraction of rows that child receives.
    """
    left_weight = float(len(left)) / (len(left) + len(right))
    left_term = left_weight * gini(left)
    right_term = (1 - left_weight) * gini(right)
    return current_uncertainty - left_term - right_term

In [27]:
def find_best_split(rows):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain.

    Args:
        rows: sequence of rows; the last entry of each row is the label and
            every preceding entry is a feature.

    Returns:
        (best_gain, best_question). When `rows` is empty, or no candidate
        question separates the rows, the result is (0, None).
    """
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep track of the feature / value that produced it

    # Nothing to split: bail out before touching rows[0]. len() is used
    # rather than truthiness so that NumPy arrays are accepted as well.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # every column except the trailing label

    for col in range(n_features):  # for each feature

        values = {row[col] for row in rows}  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # '>=' (rather than '>') is kept deliberately: on ties the
            # later candidate wins, which determines the shape of the tree.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [28]:
class Leaf:
    """Terminal node of the tree.

    Stores `predictions`: a dict mapping each class label to the number of
    training rows with that label that ended up at this leaf.
    """

    def __init__(self, rows):
        """Record the label tally of the rows that reached this leaf."""
        label_tally = class_counts(rows)
        self.predictions = label_tally


In [41]:
class Decision_Node:
    """Internal node of the tree.

    Keeps the question asked at this node together with the subtree to
    follow when the answer is true and the one to follow when it is false.
    """

    def __init__(self, question, true_branch, false_branch):
        """Store the question and both child nodes."""
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)

In [42]:
def build_tree(rows):
    """Recursively grow a decision tree from `rows`.

    At each step the best question for `rows` is looked up. If it yields no
    information gain the recursion stops and a Leaf holding `rows` is
    returned; otherwise the rows are partitioned on that question and a
    Decision_Node is returned whose children are built the same way.
    """
    # Pick the feature / value pair with the highest information gain.
    gain, question = find_best_split(rows)

    if gain != 0:
        # A useful question exists: split on it and grow both sides,
        # true side first, then the false side.
        true_rows, false_rows = partition(rows, question)
        true_branch = build_tree(true_rows)
        false_branch = build_tree(false_rows)

        # The node remembers the question and the subtree for each answer.
        return Decision_Node(question, true_branch, false_branch)

    # Base case: no question improves on the current rows.
    return Leaf(rows)


In [43]:
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces.

    Leaves are shown as "Predict" followed by their label counts; decision
    nodes show their question, then the true subtree, then the false one.
    """
    if not isinstance(node, Leaf):
        deeper = spacing + "  "

        # The question asked at this node.
        print (spacing + str(node.question))

        # Both subtrees, true side first.
        branches = (
            ('--> True:', node.true_branch),
            ('--> False:', node.false_branch),
        )
        for caption, child in branches:
            print (spacing + caption)
            print_tree(child, deeper)
        return

    # Leaf reached: show what it predicts.
    print (spacing + "Predict", node.predictions)

In [44]:
def classify(row, node):
    """Route `row` through the tree starting at `node`.

    At every decision node the row is matched against the node's question
    to choose the true or false branch. Returns the `predictions` dict of
    the leaf that is eventually reached.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [45]:
def print_leaf(counts):
    """Turn a leaf's label counts into percentage strings.

    Each count is divided by the sum of all counts, scaled to 0-100,
    truncated to an integer and suffixed with "%". Returns a dict
    label -> percentage string.
    """
    total = sum(counts.values()) * 1.0
    return {
        label: str(int(n_label / total * 100)) + "%"
        for label, n_label in counts.items()
    }

In [47]:
# Train the tree on the training split, print it, then evaluate it on the
# held-out test split.
if __name__ == '__main__':

    my_tree = build_tree(sydney_train)

    print_tree(my_tree)

    predicted = []
    actual = []
    for row in sydney_test:
        # Classify once and reuse the result for both the prediction and
        # the printed summary.
        leaf_counts = classify(row, my_tree)

        # Predict the label with the highest training count at the leaf.
        # Taking the first key of the dict would only be right for pure
        # leaves; for a mixed leaf it returns whichever label was seen
        # first. Ties still resolve to the first-seen label.
        predicted.append(max(leaf_counts, key=leaf_counts.get))

        # The true label is stored in the last column of the row.
        actual.append(row[-1])
        print("Actual: %s. Predicted: %s" % (row[-1], print_leaf(leaf_counts)))
    print(classification_report(actual, predicted))

Is pConc_r >= 0.3626386461586635?
--> True:
  Is pConc_r >= 0.3745722834978756?
  --> True:
    Predict {'spiral': 192}
  --> False:
    Is g-r >= 0.7860399999999981?
    --> True:
      Predict {'elliptical': 2}
    --> False:
      Predict {'spiral': 5}
--> False:
  Is u-g >= 1.6989600000000014?
  --> True:
    Is ecc >= 0.6039394?
    --> True:
      Predict {'elliptical': 204}
    --> False:
      Is ecc >= 0.5970721?
      --> True:
        Predict {'spiral': 1}
      --> False:
        Predict {'elliptical': 2}
  --> False:
    Is petroR50_u >= 5.164757?
    --> True:
      Predict {'spiral': 6}
    --> False:
      Is petroR90_z >= 8.220436?
      --> True:
        Predict {'elliptical': 3}
      --> False:
        Predict {'spiral': 1}
Actual: elliptical. Predicted: {'elliptical': '100%'}
Actual: elliptical. Predicted: {'elliptical': '100%'}
Actual: elliptical. Predicted: {'elliptical': '100%'}
Actual: spiral. Predicted: {'spiral': '100%'}
Actual: elliptical. Predicted: {'ellip