# Test Parallel ExIFFI

In [1]:
import sys
import numpy as np
import pandas as pd
#from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from append_dir import append_dirname
append_dirname('ExIFFI')
from utils.utils import partition_data
from utils.feature_selection import *
#from plot import *
#from simulation_setup import *
from models import *
from models.Extended_IF import *
from models.Extended_DIFFI_parallel import *
from models.Extended_DIFFI_original import *
import math
import seaborn as sns
sns.set()

import os
import pickle 
from scipy.io import loadmat
from glob import glob

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

## Utility Functions

In [2]:
# Datasets are read from the `data` folder located in the parent of the current working directory.
path = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Lookup tables: <file name up to its first dot> -> <full path>, one table per file type.
mat_files = glob(os.path.join(path, '*.mat'))
mat_file_names = dict(
    (os.path.basename(mat_path).partition('.')[0], mat_path) for mat_path in mat_files
)

csv_files = glob(os.path.join(path, '*.csv'))
csv_file_names = dict(
    (os.path.basename(csv_path).partition('.')[0], csv_path) for csv_path in csv_files
)

In [3]:
def load_data(filename):
    """Load one of the indexed ``.mat`` datasets.

    Parameters
    ----------
    filename : str
        Key into the module-level ``mat_file_names`` table (the file name
        up to its first dot).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the array stored under ``'X'`` in the file
        and ``y`` is the array stored under ``'y'`` after being passed
        through ``np.hstack``.

    Raises
    ------
    KeyError
        If ``filename`` is not in ``mat_file_names`` or the file lacks an
        ``'X'`` or ``'y'`` entry.
    """
    contents = loadmat(mat_file_names[filename])
    features = contents['X']
    # np.hstack joins the rows of the stored label array into a single flat vector.
    labels = np.hstack(contents['y'])
    return features, labels

def load_data_csv(filename):
    """Load one of the indexed ``.csv`` datasets.

    Parameters
    ----------
    filename : str
        Key into the module-level ``csv_file_names`` table (the file name
        up to its first dot).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame holding every column except
        ``'Target'`` (and except ``'Unnamed: 0'`` when present) and ``y`` is
        the ``'Target'`` column as a Series.

    Raises
    ------
    KeyError
        If ``filename`` is not in ``csv_file_names`` or the file has no
        ``'Target'`` column.
    """
    frame = pd.read_csv(csv_file_names[filename])
    # errors='ignore' turns the drop into a no-op when the column is absent.
    frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')
    is_feature = frame.columns != 'Target'
    return frame.loc[:, is_feature], frame['Target']

In [4]:
def compute_imps(model, X_train, X_test, n_runs, name, pwd, dim, f=6):
    """Fit ``model`` repeatedly and collect its global feature importances.

    For each of ``n_runs`` runs the model is re-fitted on ``X_train`` and
    ``model.Global_importance`` is evaluated on ``X_test``. The raw scores and
    a summary used for plotting are pickled under ``<pwd>/results``; the
    output directories are created when they do not exist yet.

    Parameters
    ----------
    model : object
        Must expose ``fit(X)`` and
        ``Global_importance(X, calculate=..., overwrite=..., depth_based=...)``.
    X_train : array-like with a ``shape`` attribute
        Data passed to ``model.fit``; ``X_train.shape[1]`` sets the number of
        importance scores stored per run.
    X_test : array-like
        Data passed to ``model.Global_importance``.
    n_runs : int
        Number of fit/importance repetitions.
    name : str
        Tag used in the output file names (prefixed with ``'GFI_'``).
    pwd : str
        Base directory that contains (or will contain) ``results``.
    dim, f :
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(imps, plt_data)`` where ``imps`` has shape
        ``(n_runs, X_train.shape[1])`` and ``plt_data`` is a dict with keys
        ``'Importances'`` (mean scores sorted in ascending order),
        ``'feat_order'`` (feature indices in that same order) and ``'std'``
        (per-feature standard deviations in that same order).
    """
    name = 'GFI_' + name

    imps = np.zeros(shape=(n_runs, X_train.shape[1]))
    # NOTE(review): `tqdm` is not imported explicitly in the visible cells (its
    # import line is commented out); presumably one of the star imports
    # re-exports it - TODO confirm.
    for i in tqdm(range(n_runs)):
        model.fit(X_train)
        imps[i, :] = model.Global_importance(X_test, calculate=True, overwrite=False, depth_based=False)

    # Create the target folder on demand so a fresh checkout does not fail
    # with FileNotFoundError when the results tree is missing.
    imp_dir = os.path.join(pwd, 'results', 'imp')
    os.makedirs(imp_dir, exist_ok=True)
    with open(os.path.join(imp_dir, 'imp_score_' + name + '.pkl'), 'wb') as fl:
        pickle.dump(imps, fl)

    # Average the scores over the runs and order the features by ascending
    # mean importance. A single argsort drives all three arrays so they are
    # guaranteed to share the same ordering.
    mean_imp = np.mean(imps, axis=0)
    std_imp = np.std(imps, axis=0)
    feat_order = mean_imp.argsort()

    plt_data = {'Importances': mean_imp[feat_order],
                'feat_order': feat_order,
                'std': std_imp[feat_order]}

    plt_dir = os.path.join(pwd, 'results', 'plt_data')
    os.makedirs(plt_dir, exist_ok=True)
    with open(os.path.join(plt_dir, 'plt_data_' + name + '.pkl'), 'wb') as fl:
        pickle.dump(plt_data, fl)

    return imps, plt_data
    

## Load Data

## Wine Dataset

In [5]:
# Dataset key: used to look up the .mat file and, later, to tag the result files.
name = 'wine'
X, y = load_data(name)

# Two-way split produced by the project helper `partition_data`.
X_train, X_test = partition_data(X, y)

# Cell output: shapes of the full feature matrix and of the label vector.
(X.shape, y.shape)

((129, 13), (129,))

In [6]:
# Cell output: shapes of the two partitions returned by `partition_data`.
(X_train.shape, X_test.shape)

((119, 13), (10, 13))

### Serial ExIFFI

In [7]:
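# Disabled serial baseline (Extended_DIFFI_original). It is kept commented out because it
# rescales X_train / X_test / X in place exactly like the parallel cell below, so only one
# of the two cells should be run per fresh `partition_data` split.
# from sklearn.preprocessing import StandardScaler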
# import time
# scaler=StandardScaler()
# X_train=scaler.fit_transform(X_train)
# X_test=scaler.transform(X_test)
# y_train=np.zeros(X_train.shape[0])
# y_test=np.ones(X_test.shape[0])
# y=np.concatenate([y_train,y_test])
# X_test=np.r_[X_train,X_test]
# scaler2=StandardScaler()
# X=scaler2.fit_transform(X)
# EDIFFI=Extended_DIFFI_original(300,max_depth=100,subsample_size=256,plus=1)
# dim=X.shape[1]
# pwd=os.path.dirname(os.getcwd())
# start=time.time()
# imps,plt_data=compute_imps(EDIFFI,X,X,10,name,pwd,dim,f=6)
# end=time.time()
# print(f'Elapsed time: {end-start}')

### Parallel ExIFFI

In [8]:
# `StandardScaler` comes from the notebook's import cell; `time` is imported here
# because the import cell does not provide it.
import time

# NOTE(review): this cell overwrites X_train, X_test, X and y in place, so it is not
# idempotent - re-run the data-loading cell before executing it a second time
# (otherwise X_test is stacked on top of X_train again).

# Standardise the partitions: the scaler is fitted on X_train and reused for X_test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Partition labels: 0 for every X_train row, 1 for every X_test row.
y_train = np.zeros(X_train.shape[0])
y_test = np.ones(X_test.shape[0])
y = np.concatenate([y_train, y_test])

# From here on X_test holds the scaled X_train rows followed by the scaled X_test rows.
X_test = np.r_[X_train, X_test]

# The full dataset is standardised with its own, independently fitted scaler.
scaler2 = StandardScaler()
X = scaler2.fit_transform(X)

EDIFFI = Extended_DIFFI_parallel(300, max_depth=100, subsample_size=256, plus=1)
EDIFFI.set_num_processes(num_processes_fit=1, num_processes_importances=1)

dim = X.shape[1]
pwd = os.path.dirname(os.getcwd())

# NOTE(review): the run below fits and scores on the full scaled X; the X_train /
# X_test / y prepared above are not passed to compute_imps - confirm this is intended.
# perf_counter is a monotonic, high-resolution clock, so the measured interval is not
# affected by system clock adjustments the way time.time() differences can be.
start = time.perf_counter()
imps, plt_data = compute_imps(EDIFFI, X, X, 10, name, pwd, dim, f=6)
end = time.perf_counter()

print(f"Elapsed time: {end-start}")

 30%|███       | 3/10 [00:40<01:35, 13.59s/it]


KeyboardInterrupt: 

## Ionosphere Dataset

In [18]:
# NOTE(review): the saved output of this cell records y with shape (351, 1), whereas
# `load_data` as defined in this notebook flattens the labels with np.hstack - the
# output appears to predate that definition; re-run the cell to refresh it.
X, y = load_data('ionosphere')

# Cell output: shapes of the feature matrix and of the label vector.
(X.shape, y.shape)

((351, 33), (351, 1))

## Diabetes Dataset

In [23]:
# CSV-backed dataset: X is a DataFrame of the non-'Target' columns, y the 'Target' column.
X, y = load_data_csv('diabetes')

# Cell output: shapes of the feature table and of the label column.
(X.shape, y.shape)

((100000, 4), (100000,))