In [1]:
import csv
import pandas as pd
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import collections
import math
import sys
from tqdm import tqdm
from time import sleep

In [2]:
# Initialize useful data
with open('clinvar_conflicting_clean.csv', 'r') as f:
    reader = csv.reader(f)
    temp_rows = list(reader)
df = pd.read_csv('clinvar_conflicting_clean.csv', low_memory=False)
columns_to_change = ['ORIGIN', 'EXON', 'INTRON', 'STRAND', 'LoFtool', 'CADD_PHRED', 'CADD_RAW', 'BLOSUM62']
df[['CLNVI', 'MC', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE', 
 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 
 'BAM_EDIT', 'SIFT', 'PolyPhen']] = df[['CLNVI', 'MC', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE', 
 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 
 'BAM_EDIT', 'SIFT', 'PolyPhen']].fillna(value="null")
df = df.sample(n = df.shape[0])
all_rows = df.values.tolist()
row_num = len(all_rows)

In [18]:
# Decision stump part for Adaboost
# ===========================================================
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

# === LeafNode is the prediction result of this branch ===
class LeafNode:
    def __init__(self, rows, label = 1):
        labels = []
        self.true_idx = []
        self.false_idx = []
        for i in range(len(rows)):
            row = rows[i]
            labels.append(row[-1])
            if row[-1] == label:
                self.true_idx.append(i)
            else:
                self.false_idx.append(i)
        self.prediction = collections.Counter(labels)
        
# === DecisionNode is an attribute / question used to partition the data ===
class DecisionNode:
    def __init__(self, question = None, left_branch = None, right_branch = None):
        self.question = question
        self.left_branch = left_branch
        self.right_branch = right_branch
    
class DecisionStump:
    def __init__(self, training_attribute, training_data, method = "CART"):
        self.attribute = training_attribute     # takein attribute and data separately
        self.train = training_data
        self.row_num = len(self.train)
        self.column_num = len(self.attribute)
        self.method = method.upper()            # convert to upper case for general use
        self.labels = self.uniq_val(-1)
        self.significance = 0
        if self.method not in ["C4.5", "CART", "HYBRID"]:
            print("Error: Please choose a valid method! from: [C4.5, CART, HYBRID]")
            return None
        self.root = self.build_stump(self.train)
        self.accclassify_idx = self.root.left_branch.true_idx + self.root.right_branch.true_idx
        self.misclassify_idx = self.root.left_branch.false_idx + self.root.right_branch.false_idx
    
    def uniq_val(self, column):
        return set([self.train[i][column] for i in range(len(self.train))])
    
    # when raising a question.
    # if it's a categorical attribute, we simply iterate all categories
    # if it's a numeric attribute, we iterate the set of possible numeric values 
    class Question:
        def __init__(self, column, ref_value, attribute):
            self.column = column
            self.ref_value = ref_value if ref_value else "None"
            self.attri = attribute

        def match(self, row):
            if is_numeric(self.ref_value):
                try:
                    return row[self.column] >= self.ref_value
                except:
                    print("Error occured in ", row)
                    return True
            else:
                return row[self.column] == self.ref_value

        def __repr__(self):
            operand = ">=" if is_numeric(self.ref_value) else "=="
            return "Is %s %s %s?" % (self.attri[self.column], operand, str(self.ref_value))
    
    # === Method 1 - C4.5 ===
    def entropy(self, rows):
        # === Bits used to store the information ===
        labels = [row[-1] for row in rows]
        frequency = collections.Counter(labels).values()
        pop = sum(frequency)
        H = 0
        for f in frequency:
            p = f / pop
            H -= p * math.log(p, 2)
        return H
    
    # === Method 2 - CART ===
    def gini(self, rows):
        # === Probability of misclassifying any of your label, which is impurity ===
        labels = [row[-1] for row in rows]
        frequency = collections.Counter(labels).values()
        pop = sum(frequency)
        gini = 1
        for f in frequency:
            p = f / pop
            gini -= p ** 2
        return gini
    
    # === Calculate Gain Info ===
    def info(self, branches, root):
        # === Objective: to find the best question which can maximize info ===
        root_size = float(len(root))
        if self.method == "C4.5":  # Here I pick the GainRatio Approach
            root_uncertainty = self.entropy(root)
            gain_info = root_uncertainty
            split_info = 0
            for branch in branches:
                if not branch: continue
                gain_info -= len(branch) / root_size * self.entropy(branch)
                split_info -= float(len(branch)) / root_size * math.log(float(len(branch)) / root_size)
#                 print(gain_info, split_info)
            return gain_info / split_info
        elif self.method == "CART":
            root_uncertainty = self.gini(root)
            gain_info = root_uncertainty
            for branch in branches:
                if not branch: continue
                gain_info -= len(branch) / root_size * self.gini(branch)
            return gain_info
        elif self.method == "HYBRID":
            pass
        pass
    
    # === Here I only do Binary Partitions ===
    def partition(self, rows, question):
        true_rows = []
        false_rows = []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows
    
    # the question that achieves the max infomation attenuation is the best question
    def find_best_question(self, rows):
        max_info_attenuation = 0
        # === Iterate through all question candidates ===
        # === TODO: Maybe Iteration here can be optimized ===
        for col in range(self.column_num - 1):
            ref_candidates = self.uniq_val(col)
            for ref_value in ref_candidates:
                if ref_value == "null": continue # avoid using null values to generate questions
                q = self.Question(col, ref_value, self.attribute)
                temp_true_rows, temp_false_rows = self.partition(rows, q)
                temp_info_attenuation = self.info([temp_true_rows, temp_false_rows], rows)
                if temp_info_attenuation >= max_info_attenuation:
                    max_info_attenuation = temp_info_attenuation
                    best_question = q
        return max_info_attenuation, best_question
        
    # === Input rows of data with attributes and labels ===
    def build_stump(self, rows):
        # === Assign all rows as root of the whole decision tree ===
        # === We have met the leaf node if gini(rows) is 0 or no question candidates left ===
        gain, q = self.find_best_question(rows)
        true_rows, false_rows = self.partition(rows, q)
        return DecisionNode(q, LeafNode(true_rows, 1), LeafNode(false_rows, 0))
    
    # === Input a row of data with attributes (and no label), predict its label with our decision tree ===
    # === Actually it can contain a label, we just don't use it ===
    # === walk down the decision tree until we reach the leaf node ===
    def classify(self, row, node):
        if isinstance(node, LeafNode):
#             print("===", node.prediction)
            return node.prediction
        
        if node.question.match(row):
#             print(node.question, True)
            return self.classify(row, node.left_branch)
        else:
#             print(node.question, False)
            return self.classify(row, node.right_branch)

    # function to print the tree out
    def print_tree(self, node, spacing=""):
        # Base case: we've reached a leaf
        if isinstance(node, LeafNode):
            print (spacing + "Predict", node.prediction)
            return

        # Print the question at this node
        print (spacing + str(node.question))

        # Call this function recursively on the true branch
        print (spacing + '--> True:')
        self.print_tree(node.left_branch, spacing + "  ")

        # Call this function recursively on the false branch
        print (spacing + '--> False:')
        self.print_tree(node.right_branch, spacing + "  ")
    
    def test(self):
        for i in range(self.column_num):
            q = self.Question(i, self.train[1][i], self.attribute)
            print(q)
            print(q.match(1))
            
def normalized_weight(weight):
    return np.divide(weight, sum(weight))

In [19]:
# Divide whole dataset into training set and testing set
# ===========================================================
training_percentage = 0.001  # percent of partition of training dataset
training_size = int(row_num * training_percentage)
testing_size = row_num - training_size
training_attribute = list(df.columns)
training_data = all_rows[: training_size]  # training data should include header row
testing_data = all_rows[training_size: ]   # testing data don't need to include header row

In [21]:
# Recursively Training base learners
# ===========================================================
# let's train T = 10 base learners
T = 10
stump_forest = []
weight = [1 / training_size for _ in range(training_size)]
start = time.time()
for i in range(T):
    # train a decision stump
    print("training stump %d" % i)
    stump = DecisionStump(training_attribute, training_data, "CART")
    # calculate the total error of the stump
    # TBD: maybe can be integrated into the training process
    accuracy = len(stump.accclassify_idx) / training_size
    
    total_err_rate = 1 - accuracy
    # update the significance level of this stump, remember not to divide by zero
    stump.significance = 0.5 * math.log((1 - total_err_rate + 0.0001) / (total_err_rate + 0.0001))
    # append stump into the forest
    stump_forest.append(stump)
    if len(stump_forest) == T: break  # early break
    # update training_data weight, resample the training data with the updated weight distribution
    true_scale = np.e ** stump.significance
    for idx in stump.misclassify_idx:
        weight[idx] = weight[idx] * true_scale
    for idx in stump.accclassify_idx:
        weight[idx] = weight[idx] * (1 / true_scale)
    distrib = normalized_weight(weight)
    print("stump %d trained, alpha = %.02f" % (i, stump.significance))
    resampled_idx = np.random.choice(training_size, training_size, p = distrib)
    training_data = [training_data[idx] for idx in resampled_idx]
#     print(training_data)
#     stump.print_tree(stump.root)
#     print(stump.misclassify_idx, stump.accclassify_idx)
    weight = [1 / training_size for _ in range(training_size)]
end = time.time()
print(end - start, "seconds")

training stump 0
stump 0 trained, alpha = 4.61
training stump 1
stump 1 trained, alpha = -2.08
training stump 2
stump 2 trained, alpha = -2.08
training stump 3
stump 3 trained, alpha = -2.08
training stump 4
stump 4 trained, alpha = -4.61
training stump 5
stump 5 trained, alpha = -4.61
training stump 6
stump 6 trained, alpha = -4.61
training stump 7
stump 7 trained, alpha = -4.61
training stump 8
stump 8 trained, alpha = -4.61
training stump 9
0.05620312690734863 seconds


In [6]:
# Testing
# ===========================================================
accuracy = []
for row in testing_data:
    classification = stump.classify(row, stump.root)
    if len(classification) == 1:
        accuracy.append(int(classification.get(row[-1], 0) > 0))
    else:
        tot = sum(classification.values())
        accuracy.append(classification.get(row[-1], 0) / tot)
    
print(sum(accuracy) / len(accuracy))
end = time.time()
print(end - start)

1.0
7.06307315826416


In [7]:
b = np.random.choice(a=[1,2,3,4,5], p=[0.2, 0.2, 0.2, 0.2, 0.2])