* Function
    * Raise_Question
    * Find_best_question
    * Partition
    * Build_tree
    * Predict
* Class
    * Attribute Node
    * Leaf Node

# Data Preparation - reorganize the data
1. label should be the last column
2. make sure that catagorical data are in string format, numerical data are in int or float format

In [10]:
# Import lib
# ===========================================================
import csv
from datascience import *
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import random
import time
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import collections
import math
import sys
# from tqdm import tqdm
from time import sleep

In [11]:
# Initialize useful data
# ===========================================================
# with open('clinvar_conflicting_clean.csv', 'r') as f:
#     reader = csv.reader(f)
#     temp_rows = list(reader)
df = pd.read_csv('clinvar_conflicting_clean.csv', low_memory=False)
columns_to_change = ['ORIGIN', 'EXON', 'INTRON', 'STRAND', 'LoFtool', 'CADD_PHRED', 'CADD_RAW', 'BLOSUM62']
df[['CLNVI', 'MC', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE', 
 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 
 'BAM_EDIT', 'SIFT', 'PolyPhen']] = df[['CLNVI', 'MC', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE', 
 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 
 'BAM_EDIT', 'SIFT', 'PolyPhen']].fillna(value="null")
df = df.sample(n = df.shape[0])
all_rows = df.values.tolist()
row_num = len(all_rows)

# Classification with Supervised Learning: Decision Tree

In [24]:
# Decision stump part for Adaboost
# ===========================================================
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

# === LeafNode is the prediction result of this branch ===
class LeafNode:
    def __init__(self, rows):
        labels = [row[-1] for row in rows]
        self.prediction = collections.Counter(labels)

# === DecisionNode is an attribute / question used to partition the data ===
class DecisionNode:
    def __init__(self, question = None, left_branch = None, right_branch = None):
        self.question = question
        self.left_branch = left_branch
        self.right_branch = right_branch
    
class DecisionTree:
    def __init__(self, training_attribute, training_data, method = "CART"):
        self.attribute = training_attribute     # takein attribute and data separately
        self.train = training_data[1:]
        self.row_num = len(self.train)
        self.column_num = len(self.attribute)
        self.method = method.upper()            # convert to upper case for general use
        self.labels = self.uniq_val(-1)
        if self.method not in ["C4.5", "CART", "HYBRID"]:
            print("Error: Please choose a valid method!")
            return None
        self.root = self.build_tree(self.train)
    
    def uniq_val(self, column):
        return set([self.train[i][column] for i in range(len(self.train))])
    
    # when raising a question.
    # if it's a categorical attribute, we simply iterate all categories
    # if it's a numeric attribute, we iterate the set of possible numeric values 
    class Question:
        def __init__(self, column, ref_value, attribute):
            self.column = column
            self.ref_value = ref_value if ref_value else "None"
            self.attri = attribute

        def match(self, row):
            if is_numeric(self.ref_value):
                try:
                    return row[self.column] >= self.ref_value
                except:
                    print("Error occured in ", row)
                    return True
            else:
                return row[self.column] == self.ref_value

        def __repr__(self):
            operand = ">=" if is_numeric(self.ref_value) else "=="
            return "Is %s %s %s?" % (self.attri[self.column], operand, str(self.ref_value))
    
    # === Method 1 - C4.5 ===
    def entropy(self, rows):
        # === Bits used to store the information ===
        labels = [row[-1] for row in rows]
        frequency = collections.Counter(labels).values()
        pop = sum(frequency)
        H = 0
        for f in frequency:
            p = f / pop
            H -= p * math.log(p, 2)
        return H
    
    # === Method 2 - CART ===
    def gini(self, rows):
        # === Probability of misclassifying any of your label, which is impurity ===
        labels = [row[-1] for row in rows]
        frequency = collections.Counter(labels).values()
        pop = sum(frequency)
        gini = 1
        for f in frequency:
            p = f / pop
            gini -= p ** 2
        return gini
    
    # === Calculate Gain Info ===
    # I'm actually returning the gain info reduction
    def info(self, branches, root):
        # === Objective: to find the best question which can maximize info ===
        root_size = float(len(root))
        if self.method == "C4.5":  # Here I pick the GainRatio Approach
            root_uncertainty = self.entropy(root)
            gain_info = root_uncertainty
            split_info = 0
            for branch in branches:
                if not branch: continue
                gain_info -= len(branch) / root_size * self.entropy(branch)
                split_info -= float(len(branch)) / root_size * math.log(float(len(branch)) / root_size)
#                 print(gain_info, split_info)
            return gain_info / split_info
        elif self.method == "CART":
            root_uncertainty = self.gini(root)
            gain_info = root_uncertainty
            for branch in branches:
                if not branch: continue
                gain_info -= len(branch) / root_size * self.gini(branch)
            return gain_info
        elif self.method == "HYBRID":
            pass
        pass
    
    # === Here I only do Binary Partitions ===
    def partition(self, rows, question):
        true_rows = []
        false_rows = []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows
    
    def find_best_question(self, rows):
        max_info_attenuation = 0
        best_question = self.Question(0, self.train[0][0], self.attribute)
        # === Iterate through all question candidates ===
        # === TODO: Maybe Iteration here can be optimized ===
        for col in range(self.column_num - 1): # minus 1 to avoid using the label as attribute
            ref_candidates = self.uniq_val(col)
            for ref_value in ref_candidates:
                if ref_value == "null": continue # avoid using null values to generate a question
                q = self.Question(col, ref_value, self.attribute)
                temp_true_rows, temp_false_rows = self.partition(rows, q)
                temp_info_attenuation = self.info([temp_true_rows, temp_false_rows], rows)
                if temp_info_attenuation >= max_info_attenuation:
                    max_info_attenuation = temp_info_attenuation
                    best_question = q
        return max_info_attenuation, best_question
        
    # === Input rows of data with attributes and labels ===
    def build_tree(self, rows):
        # === Assign all rows as root of the whole decision tree ===
        # === We have met the leaf node if gini(rows) is 0 or no question candidates left ===
        gain_reduction, q = self.find_best_question(rows)
#         if gain_reduction <= 0.003:
        if self.gini(rows) <= 0.48:
            return LeafNode(rows)
        true_rows, false_rows = self.partition(rows, q)
        # === Recursion after we have found a optimal question ===
        return DecisionNode(q, self.build_tree(true_rows), self.build_tree(false_rows))
    
    # === Input a row of data with attributes (and no label), predict its label with our decision tree ===
    # === Actually it can contain a label, we just don't use it ===
    # === walk down the decision tree until we reach the leaf node ===
    def classify(self, row, node):
        if isinstance(node, LeafNode):
#             print("===", node.prediction)
            return node.prediction
        
        if node.question.match(row):
#             print(node.question, True)
            return self.classify(row, node.left_branch)
        else:
#             print(node.question, False)
            return self.classify(row, node.right_branch)
    
    def print_tree(self, node, spacing=""):
        # Base case: we've reached a leaf
        if isinstance(node, LeafNode):
            print (spacing + "Predict", node.prediction)
            return

        # Print the question at this node
        print (spacing + str(node.question))

        # Call this function recursively on the true branch
        print (spacing + '--> True:')
        self.print_tree(node.left_branch, spacing + "  ")

        # Call this function recursively on the false branch
        print (spacing + '--> False:')
        self.print_tree(node.right_branch, spacing + "  ")


In [25]:
# Divide whole dataset into training set and testing set
# ===========================================================
training_percentage = 0.01  # percent of partition of training dataset
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size
training_attribute = list(df.columns)      # i omitted the 'CLASS' inside the decision tree structure
training_data = all_rows[: training_size]  # training data should include header row
testing_data = all_rows[training_size: ]   # testing data don't need to include header row

In [26]:
# Training
# ===========================================================
start = time.time()
tree = DecisionTree(training_attribute, training_data, "CART")
end = time.time()
print("Decision Tree Trained! Time: %.03fs" % (end - start))

Decision Tree Trained! Time: 3.477s


In [7]:
print(tree.gini(training_data))

0.3799424730002998


In [23]:
# tree.print_tree(tree.root)

In [27]:
# Testing and Computing TN, TP, FN, FP, etc. 
# ===========================================================
ROC = Table(make_array('CUTOFF', 'TN', 'FN', 'FP', 'TP', 'ACC'))
step_size = 0.05
CMap = {0: 'TN', 1: 'FN', 2: 'FP', 3: 'TP'}
# 00(0) -> TN
# 01(1) -> FN
# 10(2) -> FP
# 11(3) -> TP
for cutoff in np.arange(0, 1 + step_size, step_size):
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("Testing: [%-20s] %d%%" % ('='*int(cutoff * 100 / 5), int(cutoff * 100)))
    sys.stdout.flush()
    Confusion = {'TN': 0, 'FN': 0, 'FP': 0, 'TP': 0}
    for row in testing_data:
        # prediction is a counter of label 1 and 0
        pred_counter = tree.classify(row, tree.root)
        true_rate = pred_counter.get(1, 0) / (pred_counter.get(1, 0) + pred_counter.get(0, 0) + 0.00000001)
#         print(true_rate)
        true_pred = 1 if true_rate >= cutoff else 0
        indicator = (true_pred << 1) + row[-1]
        # accordingly update confusion matrix
        Confusion[CMap[indicator]] += 1
    # concatenate the confusion matrix values into the overall ROC Table
    thisline = [cutoff] + list(Confusion.values()) + [(Confusion['TP'] + Confusion['TN']) / sum(Confusion.values())]
    ROC = ROC.with_row(thisline)
ROC = ROC.with_column('SENSITIVITY', ROC.apply(lambda TP, FN: TP / (TP + FN + 0.00000001), 'TP', 'FN'))
ROC = ROC.with_column('FPR', ROC.apply(lambda TN, FP: FP / (TN + FP + 0.00000001), 'TN', 'FP'))
ROC = ROC.with_column('FMEAS', ROC.apply(lambda TP, FP, FN: 2 * (TP / (TP + FN)) * (TP / (TP + FP)) / (TP / (TP + FN) + TP / (TP + FP)), 'TP', 'FP', 'FN'))





In [28]:
# show
ROC.show()

CUTOFF,TN,FN,FP,TP,ACC,SENSITIVITY,FPR,FMEAS
0.0,0,0,48265,16272,0.252134,1,1,0.402727
0.05,0,0,48265,16272,0.252134,1,1,0.402727
0.1,0,0,48265,16272,0.252134,1,1,0.402727
0.15,0,0,48265,16272,0.252134,1,1,0.402727
0.2,0,0,48265,16272,0.252134,1,1,0.402727
0.25,48265,16272,0,0,0.747866,0,0,
0.3,48265,16272,0,0,0.747866,0,0,
0.35,48265,16272,0,0,0.747866,0,0,
0.4,48265,16272,0,0,0.747866,0,0,
0.45,48265,16272,0,0,0.747866,0,0,


In [8]:
# Acc Curve by cutoff
# ===========================================================
matplotlib.use('TkAgg')
fig = plt.figure()
plt.xlabel('Cutoff')
plt.ylabel('Accuracy')
plt.title('Accuracy - Cutoff of Decision Tree')
plt.plot(ROC.column('CUTOFF'), ROC.column('ACC'), color='orange')
plt.show()
fig.savefig('Decision Tree ACC.png', bbox_inches='tight')

In [9]:
# ROC_CURVE
# ===========================================================
fig = plt.figure()
plt.xlabel('False Positive Rate')
plt.ylabel('Sensitivity')
plt.title('ROC - Curve of Decision Tree')
plt.plot(ROC.column('FPR'), ROC.column('SENSITIVITY'), color='orange')
plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1), color='black')
plt.legend(['Decision Tree', 'Null'])
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('Decision Tree ROC.png', bbox_inches='tight')

In [10]:
# Compute AUC
# ===========================================================
length = len(ROC.column('FPR'))
auc = 0
for i in range(length - 1):
    auc += 0.5 * abs(ROC.column('FPR')[i + 1] - ROC.column('FPR')[i]) * (ROC.column('SENSITIVITY')[i] + ROC.column('SENSITIVITY')[i + 1])
print("auc = %.03f" %auc)

auc = 0.555


In [11]:
# Original Testing
# ===========================================================

accuracy = []
for row in testing_data:
    classification = tree.classify(row, tree.root)
    if len(classification) == 1:
        accuracy.append(int(classification.get(row[-1], 0) > 0))
    else:
        tot = sum(classification.values())
        accuracy.append(classification.get(row[-1], 0) / tot)
print(sum(accuracy) / len(accuracy))

0.7072377086012674


In [12]:
# My Decision Tree Model - before SMOTE
# ===========================================================
result = np.array([[0.1, 0.6052853830443929, 0.6341292591846538, 6.658, 0.2575948238372803], 
[0.2, 0.6696793630299119, 0.6462675119355091, 15.636898589134216, 0.26355624198913574], 
[0.3, 0.688320280645608, 0.6270048866889123, 15.349877572059631, 0.2788889408111572], 
[0.4, 0.6926441596845737, 0.6461577795177449, 17.142344093322755, 0.3045179843902588], 
[0.5, 0.7249279250111773, 0.6423258893248721, 20.893156909942626, 0.3267941474914551], 
[0.6, 0.6908789196403483, 0.6319173588785352, 23.121678400039674, 0.3292832374572754], 
[0.7, 0.6911146321003911, 0.6490688659793814, 26.19135489463806, 0.3296070098876953], 
[0.8, 0.681758435172271, 0.6396528437797009, 29.576567125320434, 0.3475379943847656], 
[0.9, 0.6796756798972815, 0.6486733420990525, 36.53753626346588, 0.37729477882385254], 
[1, 0.678894111630874, 0.6433185614453725, 40.28847301006317, 0.3777620792388916]])

[percentage, my_acc, sklearn_acc, my_time_cost, sklearn_time_cost] = result.transpose()

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Training Data Percentage(%)')
ax1.set_ylabel('Time Consumption(s)', color=color)
ax1.plot(percentage, my_time_cost, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:green'
ax2.set_ylabel('My Model - Accuracy', color=color)  # we already handled the x-label with ax1
ax2.plot(percentage, my_acc, color=color)
ax2.tick_params(axis='y', labelcolor=color)
plt.ylim(0, 1)
plt.title('My Decision Tree Performance')
# plt.legend(['Time Consumption', 'My ACC'])
fig.tight_layout()  # otherwise the right y-label is slightly clipped
fig.savefig('Decision Tree Time & Acc by Training percentage.png', bbox_inches='tight')
plt.show()

In [13]:
# Sklearn Decision Tree Model - before SMOTE
# ===========================================================
fig, ax3 = plt.subplots()

color = 'tab:red'
ax3.set_xlabel('Training Data Percentage(%)')
ax3.set_ylabel('Time Consumption(s)', color=color)
ax3.plot(percentage, sklearn_time_cost, color=color)
ax3.tick_params(axis='y', labelcolor=color)
plt.ylim(0, 1)

ax4 = ax3.twinx()  # instantiate a second axes that shares the same x-axis
color = 'tab:green'
ax4.set_ylabel('Sklearn - Accuracy', color=color)  # we already handled the x-label with ax1
ax4.plot(percentage, sklearn_acc, color=color)
ax4.tick_params(axis='y', labelcolor=color)
plt.ylim(0, 1)
plt.title('Sklearn Decision Tree Performance')
# plt.legend(['Time Consumption', 'Sklearn ACC'])
fig.tight_layout()  # otherwise the right y-label is slightly clipped
fig.savefig('Decision Tree Time & Sklearn Acc by Training percentage.png', bbox_inches='tight')
plt.show()

In [14]:
# 2 Models Comparison - Time Consumption
# ===========================================================
fig = plt.figure()
plt.xlabel('Training Data Percentage(%)')
plt.ylabel('Time Consumption(s)')
plt.title('Decision Tree Time Consumption')
plt.plot(percentage, my_time_cost, color='orange')
plt.plot(percentage, sklearn_time_cost, color='black')
plt.legend(['My Model', 'Sklearn Model'])
plt.axis([0, 1, -5, 45])
plt.show()
fig.savefig('Decision Tree Time Consumption Comparison.png', bbox_inches='tight')

In [15]:
# 2 Models Comparison - Accuracy
# ===========================================================
fig = plt.figure()
plt.xlabel('Training Data Percentage(%)')
plt.ylabel('Accuracy')
plt.title('Decision Tree Accuracy')
plt.plot(percentage, my_acc, color='orange')
plt.plot(percentage, sklearn_acc, color='black')
plt.legend(['My Model: Avg = 68.0%', 'Sklearn Model: Avg = 64.1%'])
plt.axis([0, 1, 0, 1])
plt.show()
fig.savefig('Decision Tree Accuracy Comparison.png', bbox_inches='tight')

In [16]:
label_distrib = dict()
for row in all_rows:
    label_distrib[row[-1]] = label_distrib.get(row[-1], 0) + 1
print("label_distribution", label_distrib)

label_distribution {0: 48754, 1: 16434}


In [17]:
# === toy data set ===
training_data = [
    ['Color', 'Diameter', 'Label'],
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
testing_data = [
    ['Red', 2, 'Grape'],
    ['Yellow', 3.5, 'Apple'],
    ['Green', 3, 'Apple']
]

In [18]:
# tree.print_tree(tree.root)

In [None]:
X = [row[: -1] for row in training_data]
Y = [row[-1] for row in training_data]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [None]:
test = [row[: -1] for row in testing_data]
actual_label = [row[-1] for row in testing_data]
result = clf.predict(test)

In [None]:
accuracy = 0
for i in range(len(result)):
    accuracy += int(result[i] == actual_label[i])
accuracy /= len(result)
accuracy

In [None]:
np.average(sklearn_acc)

In [1]:
from time import sleep
import sys

for i in range(101):
    sys.stdout.write('\r')
    # the exact output you're looking for:
    sys.stdout.write("[%-20s] %d%%" % ('='*(i // 5), i))
    sys.stdout.flush()
    sleep(0.25)

