# This file examines the performance of iForest (Isolation Forest) on the prdatasets (one-class classification)

Please read http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html for further details.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

import sklearn.preprocessing

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

In [2]:
# Collect the paths of all CSV files found under repo_dir (searched
# recursively). The paths are relative to the working directory because
# repo_dir itself is relative.
repo_dir = '../../datasets/datasets in csv format/classification/'
repo_dir_ext = []

for subdir, dirs, files in os.walk(repo_dir):
    for filename in files:
        # Match on the extension only: a substring test would also accept
        # names such as 'data.csv.bak'.
        if filename.endswith('.csv'):
            repo_dir_ext.append(os.path.join(subdir, filename))

# os.walk yields entries in an arbitrary, filesystem-dependent order; sort so
# that the dataset indices used below are stable across runs and machines.
repo_dir_ext.sort()

            
# Load every CSV file into a [features, labels] pair:
#   features - 2d float32 matrix built from all columns except the last,
#              rescaled per column with MinMaxScaler
#   labels   - 1d int32 vector taken from the last column
repo_datasets = []

for csv_path in repo_dir_ext:
    table = np.array(pd.read_csv(csv_path, header=None))

    # convert the feature columns once and reuse them for fit and transform
    features = table[:, :-1].astype(np.float32)
    labels = table[:, -1]

    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(features)
    features = scaler.transform(features)

    repo_datasets.append([features, labels.astype(np.int32)])
    
# print some basic info: the number of datasets loaded, then the path each
# one was read from. print() is called with a single argument so the output
# is the same on Python 2 and Python 3.
print(len(repo_datasets))
for csv_path in repo_dir_ext:
    print(csv_path)

15
../../datasets/datasets in csv format/auto_mpg.csv
../../datasets/datasets in csv format/biomed.csv
../../datasets/datasets in csv format/breast.csv
../../datasets/datasets in csv format/breastdiag.csv
../../datasets/datasets in csv format/car.csv
../../datasets/datasets in csv format/chromo.csv
../../datasets/datasets in csv format/diabetes.csv
../../datasets/datasets in csv format/ecoli.csv
../../datasets/datasets in csv format/glass.csv
../../datasets/datasets in csv format/imox.csv
../../datasets/datasets in csv format/ionosphere.csv
../../datasets/datasets in csv format/iris.csv
../../datasets/datasets in csv format/liver.csv
../../datasets/datasets in csv format/sonar.csv
../../datasets/datasets in csv format/wine.csv


In [3]:
"""
For each dataset, test the performance of iForest
"""
# res[k] is the list of ROC AUC scores (one per repetition) obtained on
# repo_datasets[k]
res = []

# fraction of the target class (flag == 1) used for training; the remaining
# targets plus ALL outliers (flag == 2) form the test set.
# NOTE: no cross-validation / grid search is performed here; the model is
# fitted once on the training part with default hyper-parameters.
ratio = 0.9

# number of random train/test splits evaluated per dataset
n_repeats = 10

# for different datasets
for data, flag in repo_datasets:
    sub_res = []

    # test a dataset multiple times
    # NOTE(review): neither the shuffles nor the forest are seeded, so the
    # scores differ from run to run.
    for _ in range(n_repeats):
        # boolean indexing returns copies, so shuffling does not touch `data`
        target = data[flag == 1, :]
        outlier = data[flag == 2, :]

        np.random.shuffle(target)
        np.random.shuffle(outlier)

        n_train = int(len(target) * ratio)
        n_test_target = len(target) - n_train

        # training set: targets only (one-class setting)
        train_data = target[:n_train]
        # test set: held-out targets followed by every outlier
        test_data = np.concatenate((target[n_train:], outlier))

        # +1 marks a target, -1 marks an outlier
        train_flag = np.ones(n_train)
        test_flag = np.concatenate((np.ones(n_test_target),
                                    -1 * np.ones(len(outlier))))

        assert len(train_data) == len(train_flag)
        assert len(test_data) == len(test_flag)

        # Default settings are used. The former `contamination=0` is outside
        # the range accepted by current scikit-learn ('auto' or a float in
        # (0, 0.5]) and is rejected at fit time. The parameter only affects
        # the decision threshold, not the ranking of the scores, so the ROC
        # AUC computed below does not depend on it.
        model = IsolationForest()

        # fit on targets only, then score the mixed test set; higher
        # decision_function values mean "more normal", matching the +1 label
        model.fit(train_data)
        test_score = model.decision_function(test_data)

        sub_res.append(roc_auc_score(test_flag, test_score))

    res.append(sub_res)

In [4]:
# analyze the results: for each dataset print its 1-based index and path,
# followed by the mean ROC AUC over all repetitions. print() is used in call
# form throughout so the cell runs on both Python 2 and Python 3.
for rank, (csv_path, scores) in enumerate(zip(repo_dir_ext, res), 1):
    print("{0}. {1}".format(rank, csv_path))
    print(np.average(scores))

1. ../../datasets/datasets in csv format/auto_mpg.csv
0.921147414458
2. ../../datasets/datasets in csv format/biomed.csv
0.592913385827
3. ../../datasets/datasets in csv format/breast.csv
0.993108425041
4. ../../datasets/datasets in csv format/breastdiag.csv
0.962722746331
5. ../../datasets/datasets in csv format/car.csv
0.580529800061
6. ../../datasets/datasets in csv format/chromo.csv
0.954859218892
7. ../../datasets/datasets in csv format/diabetes.csv
0.557725925926
8. ../../datasets/datasets in csv format/ecoli.csv
0.983316062176
9. ../../datasets/datasets in csv format/glass.csv
0.820238095238
10. ../../datasets/datasets in csv format/imox.csv
0.923333333333
11. ../../datasets/datasets in csv format/ionosphere.csv
0.915562456867
12. ../../datasets/datasets in csv format/iris.csv
1.0
13. ../../datasets/datasets in csv format/liver.csv
0.606866666667
14. ../../datasets/datasets in csv format/sonar.csv
0.64793814433
15. ../../datasets/datasets in csv format/wine.csv
0.984733893557
