In [10]:
import pandas as pd
import random

# 1. Original Isolation Forest 

Exploratory coding of an Isolation Forest as proposed by Liu et al (2008): https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf?q=isolation-forest

*isolation*: separating an instance from the rest of the instances

## 1.1 Random partitioning 

In this implementation, partitions are generated by randomly selecting an attribute and then randomly selecting a split value between the maximum and minimum values of the selected attribute. 

Isolation Forests are constructed by recursively partitioning a given training set until instances are isolated or a specific tree height is reached of which results a partial model. 

In [80]:
# How do we sample? 
def Sampling(df, size): 
    return True

def IsolationTree(sample, height, height_limit): 
    attributes = list(sample.keys())
    attribute = random.choice(attributes)
    
    min_value = min(sample[attribute])
    max_value = max(sample[attribute])
    
    split = random.uniform(min_value, max_value)
    
    new_height = height + 1 
    
    left_df = sample[sample[attribute] <= split]
    right_df = sample[sample[attribute] > split]
    
    if (new_height < height_limit) and len(left_df) > 0 and len(right_df) > 0:
        tree = {
            'left': IsolationTree(left_df, new_height, height_limit),
            'right': IsolationTree(right_df, new_height, height_limit),
            'split_attr': attribute, 
            'split_val': split, 
            'leaf': False
        }
        return tree 
    else: 
        tree = {
            'left': left_df, 
            'right': right_df, 
            'split_attr': attribute, 
            'split_val': split, 
            'leaf': True
        }
        return tree 
    
def IsolationForest(df, num_trees, subsample_size): 
    height_limit = 40 # l = ceiling(log_2, ψ)
    i = 0
    model = []
    
    while i < num_trees: 
#         sample = Subsample(df, subsample_size) 
        
        model.append(IsolationTree(df, 0, height_limit))
        i = i+1 
        
    return model 

def PathLength(instance, tree, path_length): 
    attribute = tree['split_attr']
    
    
    if not tree['leaf']: 
        if instance[attribute][0] < tree['split_val']: 
            return PathLength(instance, tree['left'], path_length + 1)
        else:
            return PathLength(instance, tree['right'], path_length + 1)
    else: 
        return path_length + 1

def Evaluation(forest, instance): 
    scores = []
    for tree in forest: 
        scores.append(PathLength(instance, tree, 0))
    
    return scores

In [81]:
num_trees = 100
raw_df = pd.read_csv('data/HousingData.csv')
df = raw_df.dropna()
subsample_size = len(df) / 2

forest = IsolationForest(df, num_trees, subsample_size)

# Validate with Kurtosis 

# print(forest)

In [82]:
instance = df[:1]

score = Evaluation(forest, instance)

print(score)

[9, 13, 6, 16, 14, 9, 7, 11, 13, 19, 14, 17, 14, 8, 10, 9, 10, 9, 13, 10, 16, 13, 14, 16, 22, 11, 12, 5, 15, 17, 17, 17, 6, 12, 9, 6, 20, 8, 16, 18, 4, 22, 10, 10, 15, 8, 11, 11, 20, 17, 11, 13, 11, 8, 11, 11, 12, 12, 11, 10, 15, 2, 14, 8, 13, 8, 10, 14, 10, 14, 10, 11, 10, 11, 11, 10, 9, 13, 9, 13, 8, 9, 13, 12, 10, 10, 8, 8, 13, 7, 19, 15, 17, 11, 10, 4, 12, 8, 14, 11]
