In [3]:
import warnings
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. convergence or undefined-metric
# warnings from scikit-learn) — confirm that is intended.
warnings.filterwarnings("ignore")

In [4]:
# Parenthesised so the name list can span two lines; a bare trailing comma
# followed by a line break is a SyntaxError.
from ITMO_FS.filters.univariate import (select_k_best, relief_measure,
                                        UnivariateFilter, chi2_measure, su_measure)
from ITMO_FS.filters.multivariate import MultivariateFilter, MRMR
from ITMO_FS.wrappers import SequentialForwardSelection
from boruta import BorutaPy

In [5]:
import pprint
# Shared pretty-printer used to dump the per-dataset result dictionaries.
pp = pprint.PrettyPrinter()

In [6]:
# Directory the dataset CSV files (dorothea.csv, gisette.csv, dexter.csv) are read from.
data_path = './data'

In [7]:
def kuncheva_index(a: np.array, b: np.array, n: int) -> float:
    '''
    Kuncheva consistency index between two equally sized feature subsets.

    Computes (r * n - k * k) / (k * (n - k)) where

    k - number of selected features (common length of a and b)
    r - number of features present in both a and b
    n - total number of features

    metric:
        close to 1 - very similar feature sets
        close to 0 - overlap is about what random selection would give
        negative   - less overlap than expected by chance

    Raises:
        ValueError: if a and b do not have the same length.
        ZeroDivisionError: if k == 0 or k == n (the index is undefined there).
    '''
    if len(a) != len(b):
        # The previous `raise('...')` tried to raise a plain str, which itself
        # fails with TypeError; raise a real exception carrying the message.
        raise ValueError('Both arrays length must be the same size')

    k = len(a)
    r = len(set(a).intersection(b))
    return ((r * n) - (k * k)) / (k * (n - k))

In [8]:
def jaccard_index(a: np.array, b: np.array, n: int = 100):
    '''
    Jaccard similarity between two feature subsets: size of the intersection
    divided by size of the union.

    a, b - iterables of selected feature identifiers (duplicates are ignored)
    n    - not used by the computation; accepted so calls can pass the total
           number of features the same way as for kuncheva_index

    metric:
        close to 0 - very different feature sets
        close to 1 - very similar feature sets

    Two empty selections are treated as identical and score 1.0 instead of
    raising ZeroDivisionError (a selector may legitimately keep nothing).
    '''
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        return 1.0
    return len(set_a & set_b) / union_size

In [9]:
def model_score(x_train, x_test, y_train, y_test):
    '''
    Fit a default LogisticRegression on the training split and return the
    precision of its predictions on the test split.
    '''
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    return precision_score(y_test, classifier.predict(x_test))

# Dataset: Dorothea (800 x 100000)

In [290]:
# header=None: no row is treated as a header, so columns get integer labels.
dorothea = pd.read_csv(f'{data_path}/dorothea.csv', header=None)
# Every column except the last is a feature; the last column is the target.
X = dorothea.iloc[:, :dorothea.shape[1]-1]
y = dorothea.iloc[:, -1]

print(X.shape)
print(y.shape)

# Per-method metrics for this dataset are collected here, keyed by method name.
dorothea_result = {}
# Number of features each top-k selector is asked to keep.
n_features = 50

(800, 100000)
(800,)


In [11]:
# Two stratified 80/20 splits of the same data with different seeds (3 and 4).
# The "_b" split is used to repeat each selection so the two selected feature sets can be compared.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=3, test_size=0.2)
x_train_b, x_test_b, y_train_b, y_test_b = train_test_split(X, y, stratify=y, random_state=4, test_size=0.2)

## Without feature selection

In [13]:
%%time

# Baseline: score the classifier on all features. Only the first split is timed.
start = time.perf_counter()
full_precision_d = model_score(x_train, x_test, y_train, y_test)
end = time.perf_counter()
time_diff = end - start

# Second split, outside the timed region.
full_precision_d_b = model_score(x_train_b, x_test_b, y_train_b, y_test_b)

dorothea_result['w/o'] = {}
# Reported precision is the better of the two splits.
dorothea_result['w/o']['precision'] = max(full_precision_d, full_precision_d_b)
dorothea_result['w/o']['time'] = time_diff
# No selection happens here, so stability is recorded as the constant 1.
dorothea_result['w/o']['stability'] = 1
dorothea_result['w/o']['features'] = x_train.shape[1]

CPU times: user 3.3 s, sys: 165 ms, total: 3.47 s
Wall time: 5.98 s


## Chi-square

In [277]:
%%time

# Split A (timed): select n_features with the chi-square measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
# NOTE(review): selected_features_ is fed to .iloc, so it is assumed to hold column positions — confirm against ITMO_FS.
selected_features = selector.selected_features_
x_train_chi2 = x_train.iloc[:, selected_features]
x_test_chi2 = x_test.iloc[:, selected_features]
chi2_precision_d = model_score(x_train_chi2, x_test_chi2, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_chi2 / x_test_chi2 are overwritten.
selector_b = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_chi2 = x_train_b.iloc[:, selected_features_b]
x_test_chi2 = x_test_b.iloc[:, selected_features_b]
chi2_precision_d_b = model_score(x_train_chi2, x_test_chi2, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dorothea_result['chi2'] = {}
# Reported precision is the better of the two splits.
dorothea_result['chi2']['precision'] = max(chi2_precision_d, chi2_precision_d_b)
dorothea_result['chi2']['time'] = time_diff
dorothea_result['chi2']['stability'] = stability_index
dorothea_result['chi2']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 2min 17s, sys: 5.52 s, total: 2min 22s
Wall time: 4min 57s


## Random Forest

In [278]:
%%time

# Split A (timed): fit a random forest, keep at most n_features of the features that pass
# SelectFromModel's importance threshold, then score the classifier on them.
start = time.perf_counter()
selector = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train, y_train)
# Positional indices, because they are consumed by .iloc below. Taking column *labels*
# (x_train.columns[mask]) only worked while labels happened to equal positions.
selected_features = selector.get_support(indices=True)
x_train_rf = x_train.iloc[:, selected_features]
x_test_rf = x_test.iloc[:, selected_features]
rf_precision_d = model_score(x_train_rf, x_test_rf, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_rf / x_test_rf are overwritten.
selector_b = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train_b, y_train_b)
selected_features_b = selector_b.get_support(indices=True)
x_train_rf = x_train_b.iloc[:, selected_features_b]
x_test_rf = x_test_b.iloc[:, selected_features_b]
rf_precision_d_b = model_score(x_train_rf, x_test_rf, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dorothea_result['rf'] = {}
# Reported precision is the better of the two splits.
dorothea_result['rf']['precision'] = max(rf_precision_d, rf_precision_d_b)
dorothea_result['rf']['time'] = time_diff
dorothea_result['rf']['stability'] = stability_index
# NOTE(review): max_features is an upper bound, so fewer than n_features may actually be kept — confirm.
dorothea_result['rf']['features'] = n_features

CPU times: user 6.02 s, sys: 1.17 s, total: 7.19 s
Wall time: 10.6 s


## SU - Symmetric Uncertainty

In [256]:
%%time

# Split A (timed): select n_features with the SU measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_su = x_train.iloc[:, selected_features]
x_test_su = x_test.iloc[:, selected_features]

su_precision_d = model_score(x_train_su, x_test_su, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_su / x_test_su are overwritten.
selector_b = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_su = x_train_b.iloc[:, selected_features_b]
x_test_su = x_test_b.iloc[:, selected_features_b]
su_precision_d_b = model_score(x_train_su, x_test_su, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dorothea_result['su'] = {}
# Reported precision is the better of the two splits.
dorothea_result['su']['precision'] = max(su_precision_d, su_precision_d_b)
dorothea_result['su']['time'] = time_diff
dorothea_result['su']['stability'] = stability_index
dorothea_result['su']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 1min 36s, sys: 1.69 s, total: 1min 37s
Wall time: 1min 44s


## Relief

In [279]:
%%time

# Split A (timed): select n_features with the Relief measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_relief = x_train.iloc[:, selected_features]
x_test_relief = x_test.iloc[:, selected_features]
relief_precision_d = model_score(x_train_relief, x_test_relief, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_relief / x_test_relief are overwritten.
selector_b = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_relief = x_train_b.iloc[:, selected_features_b]
x_test_relief = x_test_b.iloc[:, selected_features_b]
relief_precision_d_b = model_score(x_train_relief, x_test_relief, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dorothea_result['relief'] = {}
# Reported precision is the better of the two splits.
dorothea_result['relief']['precision'] = max(relief_precision_d, relief_precision_d_b)
dorothea_result['relief']['time'] = time_diff
dorothea_result['relief']['stability'] = stability_index
dorothea_result['relief']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 19 s, sys: 3.93 s, total: 22.9 s
Wall time: 31.1 s


## Boruta

In [286]:
%%time

# BorutaPy is fitted on plain NumPy arrays here rather than on the DataFrames.
x_values = x_train.values
y_values = y_train.values.ravel()

# Split A (timed): Boruta selection around a shallow random forest, then score on the kept features.
start = time.perf_counter()
rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector.fit(x_values, y_values)
# Column labels of the features Boruta kept; used only for the stability and count metrics below.
selected_features = x_train.columns[selector.support_]

x_train_boruta = selector.transform(x_train.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector.transform(x_test.values)

boruta_precision_d = model_score(x_train_boruta, x_test_boruta, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split.
x_values = x_train_b.values
y_values = y_train_b.values.ravel()

rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector_b = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector_b.fit(x_values, y_values)
selected_features_b = x_train_b.columns[selector_b.support_]

x_train_boruta = selector_b.transform(x_train_b.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector_b.transform(x_test_b.values)

boruta_precision_d_b = model_score(x_train_boruta, x_test_boruta, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dorothea_result['boruta'] = {}
# Reported precision is the better of the two splits.
dorothea_result['boruta']['precision'] = max(boruta_precision_d, boruta_precision_d_b)
dorothea_result['boruta']['time'] = time_diff
dorothea_result['boruta']['stability'] = stability_index
# n_features is not passed to Boruta, so the larger of the two selection sizes is recorded.
dorothea_result['boruta']['features'] = max(len(selected_features), len(selected_features_b))

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

## Dorothea result

In [287]:
# Dump the collected per-method metrics for Dorothea.
pp.pprint(dorothea_result)

{'boruta': {'features': 36,
            'precision': 0.9,
            'stability': 0.42,
            'time': 77.35342771501746},
 'chi2': {'features': 50,
          'precision': 1.0,
          'stability': 0.7857142857142857,
          'time': 146.2416198248975},
 'relief': {'features': 50,
            'precision': 1.0,
            'stability': 0.3333333333333333,
            'time': 15.024194821016863},
 'rf': {'features': 50,
        'precision': 1.0,
        'stability': 0.3698630136986301,
        'time': 6.101353017031215},
 'w/o': {'features': 100000,
         'precision': 1.0,
         'stability': 1,
         'time': 13.39689862402156}}


In [288]:
# Tabulate the results: one column per method, one row per metric.
dorothea_result_df = pd.DataFrame.from_records(dorothea_result)
print(dorothea_result_df)

              boruta        chi2     relief         rf            w/o
precision   0.900000    1.000000   1.000000   1.000000       1.000000
time       77.353428  146.241620  15.024195   6.101353      13.396899
stability   0.420000    0.785714   0.333333   0.369863       1.000000
features   36.000000   50.000000  50.000000  50.000000  100000.000000


In [289]:
# Persist the summary table as a spreadsheet.
# NOTE(review): assumes the ./results directory already exists — confirm, or create it beforehand.
dorothea_result_df.to_excel('./results/dorothea_result.xlsx')

# Dataset: Gisette (6000 x 5000)

In [261]:
# header=None: no row is treated as a header, so columns get integer labels.
gisette = pd.read_csv(f'{data_path}/gisette.csv', header=None)

# Every column except the last is a feature; the last column is the target.
X = gisette.iloc[:, :gisette.shape[1]-1]
y = gisette.iloc[:, -1]

print(X.shape)
print(y.shape)

# Per-method metrics for this dataset are collected here, keyed by method name.
gisette_result = {}
# Number of features each top-k selector is asked to keep.
n_features = 50

(6000, 5000)
(6000,)


In [262]:
# Two stratified 80/20 splits of the same data with different seeds (1 and 2).
# The "_b" split is used to repeat each selection so the two selected feature sets can be compared.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1, test_size=0.2)
x_train_b, x_test_b, y_train_b, y_test_b = train_test_split(X, y, stratify=y, random_state=2, test_size=0.2)

## Without feature selection

In [263]:
%%time

# Baseline: score the classifier on all features. Only the first split is timed.
start = time.perf_counter()
full_precision_g = model_score(x_train, x_test, y_train, y_test)
end = time.perf_counter()
time_diff = end - start

# Second split, outside the timed region.
full_precision_g_b = model_score(x_train_b, x_test_b, y_train_b, y_test_b)

gisette_result['w/o'] = {}
# Reported precision is the better of the two splits.
gisette_result['w/o']['precision'] = max(full_precision_g, full_precision_g_b)
gisette_result['w/o']['time'] = time_diff
# No selection happens here, so stability is recorded as the constant 1.
gisette_result['w/o']['stability'] = 1
gisette_result['w/o']['features'] = x_train.shape[1]

CPU times: user 11.9 s, sys: 624 ms, total: 12.6 s
Wall time: 8.09 s


## Chi-square

In [264]:
%%time

# Split A (timed): select n_features with the chi-square measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
# NOTE(review): selected_features_ is fed to .iloc, so it is assumed to hold column positions — confirm against ITMO_FS.
selected_features = selector.selected_features_
x_train_chi2 = x_train.iloc[:, selected_features]
x_test_chi2 = x_test.iloc[:, selected_features]
chi2_precision_g = model_score(x_train_chi2, x_test_chi2, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; `selector` and x_*_chi2 are overwritten.
selector = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector.selected_features_
x_train_chi2 = x_train_b.iloc[:, selected_features_b]
x_test_chi2 = x_test_b.iloc[:, selected_features_b]
chi2_precision_g_b = model_score(x_train_chi2, x_test_chi2, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

gisette_result['chi2'] = {}
# Reported precision is the better of the two splits.
gisette_result['chi2']['precision'] = max(chi2_precision_g, chi2_precision_g_b)
gisette_result['chi2']['time'] = time_diff
gisette_result['chi2']['stability'] = stability_index
gisette_result['chi2']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 17.7 s, sys: 792 ms, total: 18.5 s
Wall time: 21.2 s


## Random Forest

In [265]:
%%time

# Split A (timed): fit a random forest, keep at most n_features of the features that pass
# SelectFromModel's importance threshold, then score the classifier on them.
start = time.perf_counter()
selector = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train, y_train)
# Positional indices, because they are consumed by .iloc below. Taking column *labels*
# (x_train.columns[mask]) only worked while labels happened to equal positions.
selected_features = selector.get_support(indices=True)
x_train_rf = x_train.iloc[:, selected_features]
x_test_rf = x_test.iloc[:, selected_features]
rf_precision_g = model_score(x_train_rf, x_test_rf, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; `selector` and x_*_rf are overwritten.
selector = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train_b, y_train_b)
selected_features_b = selector.get_support(indices=True)
x_train_rf = x_train_b.iloc[:, selected_features_b]
x_test_rf = x_test_b.iloc[:, selected_features_b]
rf_precision_g_b = model_score(x_train_rf, x_test_rf, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

gisette_result['rf'] = {}
# Reported precision is the better of the two splits.
gisette_result['rf']['precision'] = max(rf_precision_g, rf_precision_g_b)
gisette_result['rf']['time'] = time_diff
gisette_result['rf']['stability'] = stability_index
# NOTE(review): max_features is an upper bound, so fewer than n_features may actually be kept — confirm.
gisette_result['rf']['features'] = n_features

CPU times: user 11.8 s, sys: 362 ms, total: 12.2 s
Wall time: 14 s


## SU - Symmetric Uncertainty

In [267]:
%%time

# Split A (timed): select n_features with the SU measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_su = x_train.iloc[:, selected_features]
x_test_su = x_test.iloc[:, selected_features]
su_precision_g = model_score(x_train_su, x_test_su, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; `selector` and x_*_su are overwritten.
selector = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector.selected_features_
x_train_su = x_train_b.iloc[:, selected_features_b]
x_test_su = x_test_b.iloc[:, selected_features_b]
su_precision_g_b = model_score(x_train_su, x_test_su, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

gisette_result['su'] = {}
# Reported precision is the better of the two splits.
gisette_result['su']['precision'] = max(su_precision_g, su_precision_g_b)
gisette_result['su']['time'] = time_diff
gisette_result['su']['stability'] = stability_index
gisette_result['su']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 58.6 s, sys: 1.39 s, total: 60 s
Wall time: 1min 10s


## Relief

In [269]:
%%time

# Split A (timed): select n_features with the Relief measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_relief = x_train.iloc[:, selected_features]
x_test_relief = x_test.iloc[:, selected_features]
relief_precision_g = model_score(x_train_relief, x_test_relief, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; `selector` and x_*_relief are overwritten.
selector = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector.selected_features_
x_train_relief = x_train_b.iloc[:, selected_features_b]
x_test_relief = x_test_b.iloc[:, selected_features_b]
relief_precision_g_b = model_score(x_train_relief, x_test_relief, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

gisette_result['relief'] = {}
# Reported precision is the better of the two splits.
gisette_result['relief']['precision'] = max(relief_precision_g, relief_precision_g_b)
gisette_result['relief']['time'] = time_diff
gisette_result['relief']['stability'] = stability_index
gisette_result['relief']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 18.9 s, sys: 2.13 s, total: 21 s
Wall time: 20.6 s


## Boruta

In [270]:
%%time

# BorutaPy is fitted on plain NumPy arrays here rather than on the DataFrames.
x_values = x_train.values
y_values = y_train.values.ravel()

# Split A (timed): Boruta selection around a shallow random forest, then score on the kept features.
start = time.perf_counter()
rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector.fit(x_values, y_values)
# Column labels of the features Boruta kept; used only for the stability and count metrics below.
selected_features = x_train.columns[selector.support_]

x_train_boruta = selector.transform(x_train.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector.transform(x_test.values)

boruta_precision_g = model_score(x_train_boruta, x_test_boruta, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split.
x_values = x_train_b.values
y_values = y_train_b.values.ravel()

rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector_b = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector_b.fit(x_values, y_values)
selected_features_b = x_train_b.columns[selector_b.support_]

x_train_boruta = selector_b.transform(x_train_b.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector_b.transform(x_test_b.values)

boruta_precision_g_b = model_score(x_train_boruta, x_test_boruta, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
# The total-feature-count argument is omitted here, so jaccard_index falls back to its default n.
stability_index = jaccard_index(selected_features, selected_features_b)

gisette_result['boruta'] = {}
# Reported precision is the better of the two splits.
gisette_result['boruta']['precision'] = max(boruta_precision_g, boruta_precision_g_b)
gisette_result['boruta']['time'] = time_diff
gisette_result['boruta']['stability'] = stability_index
# n_features is not passed to Boruta, so the larger of the two selection sizes is recorded.
gisette_result['boruta']['features'] = max(len(selected_features), len(selected_features_b))

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

## Gisette result

In [271]:
# Dump the collected per-method metrics for Gisette.
pp.pprint(gisette_result)

{'boruta': {'features': 228,
            'precision': 0.9658119658119658,
            'stability': 0.7791164658634538,
            'time': 126.89449593005702},
 'chi2': {'features': 50,
          'precision': 0.92,
          'stability': 1.0,
          'time': 10.372468177927658},
 'relief': {'features': 50,
            'precision': 0.9035532994923858,
            'stability': 0.8181818181818182,
            'time': 10.287922961986624},
 'rf': {'features': 50,
        'precision': 0.9290540540540541,
        'stability': 0.4492753623188406,
        'time': 7.083289135945961},
 'su': {'features': 50,
        'precision': 0.9181969949916527,
        'stability': 1.0,
        'time': 37.153728437027894},
 'w/o': {'features': 5000,
         'precision': 0.9829642248722317,
         'stability': 1,
         'time': 4.315752376918681}}


In [272]:
# Tabulate the results: one column per method, one row per metric.
gisette_result_df = pd.DataFrame.from_records(gisette_result)
print(gisette_result_df)

               boruta       chi2     relief         rf         su          w/o
precision    0.965812   0.920000   0.903553   0.929054   0.918197     0.982964
time       126.894496  10.372468  10.287923   7.083289  37.153728     4.315752
stability    0.779116   1.000000   0.818182   0.449275   1.000000     1.000000
features   228.000000  50.000000  50.000000  50.000000  50.000000  5000.000000


In [273]:
# Persist the summary table as a spreadsheet.
# NOTE(review): assumes the ./results directory already exists — confirm, or create it beforehand.
gisette_result_df.to_excel('./results/gisette_result.xlsx')

# Dataset: Dexter (300 x 20000)

In [15]:
# header=None: no row is treated as a header, so columns get integer labels.
dexter = pd.read_csv(f'{data_path}/dexter.csv', header=None)
# Every column except the last is a feature; the last column is the target.
X = dexter.iloc[:, :dexter.shape[1]-1]
y = dexter.iloc[:, -1]

print(X.shape)
print(y.shape)

# Per-method metrics for this dataset are collected here, keyed by method name.
dexter_result = {}
# Number of features each top-k selector is asked to keep.
n_features = 50

(300, 20000)
(300,)


In [16]:
# Two stratified 80/20 splits of the same data with different seeds (3 and 4).
# The "_b" split is used to repeat each selection so the two selected feature sets can be compared.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=3, test_size=0.2)
x_train_b, x_test_b, y_train_b, y_test_b = train_test_split(X, y, stratify=y, random_state=4, test_size=0.2)

## Without feature selection

In [17]:
%%time

# Baseline: score the classifier on all features. Only the first split is timed.
start = time.perf_counter()
full_precision_d = model_score(x_train, x_test, y_train, y_test)
end = time.perf_counter()
time_diff = end - start

# Second split, outside the timed region.
full_precision_d_b = model_score(x_train_b, x_test_b, y_train_b, y_test_b)

dexter_result['w/o'] = {}
# Reported precision is the better of the two splits.
dexter_result['w/o']['precision'] = max(full_precision_d, full_precision_d_b)
dexter_result['w/o']['time'] = time_diff
# No selection happens here, so stability is recorded as the constant 1.
dexter_result['w/o']['stability'] = 1
dexter_result['w/o']['features'] = x_train.shape[1]

CPU times: user 3.11 s, sys: 160 ms, total: 3.27 s
Wall time: 19.5 s


## Chi-square

In [19]:
%%time

# Split A (timed): select n_features with the chi-square measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
# NOTE(review): selected_features_ is fed to .iloc, so it is assumed to hold column positions — confirm against ITMO_FS.
selected_features = selector.selected_features_
x_train_chi2 = x_train.iloc[:, selected_features]
x_test_chi2 = x_test.iloc[:, selected_features]
chi2_precision_d = model_score(x_train_chi2, x_test_chi2, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_chi2 / x_test_chi2 are overwritten.
selector_b = UnivariateFilter(chi2_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_chi2 = x_train_b.iloc[:, selected_features_b]
x_test_chi2 = x_test_b.iloc[:, selected_features_b]
chi2_precision_d_b = model_score(x_train_chi2, x_test_chi2, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dexter_result['chi2'] = {}
# Reported precision is the better of the two splits.
dexter_result['chi2']['precision'] = max(chi2_precision_d, chi2_precision_d_b)
dexter_result['chi2']['time'] = time_diff
dexter_result['chi2']['stability'] = stability_index
dexter_result['chi2']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 6.87 s, sys: 182 ms, total: 7.06 s
Wall time: 9.36 s


## Random Forest

In [20]:
%%time

# Split A (timed): fit a random forest, keep at most n_features of the features that pass
# SelectFromModel's importance threshold, then score the classifier on them.
start = time.perf_counter()
selector = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train, y_train)
# Positional indices, because they are consumed by .iloc below. Taking column *labels*
# (x_train.columns[mask]) only worked while labels happened to equal positions.
selected_features = selector.get_support(indices=True)
x_train_rf = x_train.iloc[:, selected_features]
x_test_rf = x_test.iloc[:, selected_features]
rf_precision_d = model_score(x_train_rf, x_test_rf, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_rf / x_test_rf are overwritten.
selector_b = SelectFromModel(RandomForestClassifier(n_estimators = 100),
                           max_features=n_features).fit(x_train_b, y_train_b)
selected_features_b = selector_b.get_support(indices=True)
x_train_rf = x_train_b.iloc[:, selected_features_b]
x_test_rf = x_test_b.iloc[:, selected_features_b]
rf_precision_d_b = model_score(x_train_rf, x_test_rf, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dexter_result['rf'] = {}
# Reported precision is the better of the two splits.
dexter_result['rf']['precision'] = max(rf_precision_d, rf_precision_d_b)
dexter_result['rf']['time'] = time_diff
dexter_result['rf']['stability'] = stability_index
# NOTE(review): max_features is an upper bound, so fewer than n_features may actually be kept — confirm.
dexter_result['rf']['features'] = n_features

CPU times: user 1.28 s, sys: 107 ms, total: 1.39 s
Wall time: 1.49 s


## SU — Symmetrical Uncertainty

In [21]:
%%time

# Split A (timed): select n_features with the SU measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_su = x_train.iloc[:, selected_features]
x_test_su = x_test.iloc[:, selected_features]

su_precision_d = model_score(x_train_su, x_test_su, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_su / x_test_su are overwritten.
selector_b = UnivariateFilter(su_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_su = x_train_b.iloc[:, selected_features_b]
x_test_su = x_test_b.iloc[:, selected_features_b]
su_precision_d_b = model_score(x_train_su, x_test_su, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dexter_result['su'] = {}
# Reported precision is the better of the two splits.
dexter_result['su']['precision'] = max(su_precision_d, su_precision_d_b)
dexter_result['su']['time'] = time_diff
dexter_result['su']['stability'] = stability_index
dexter_result['su']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 3.92 s, sys: 101 ms, total: 4.02 s
Wall time: 4.53 s


## Relief

In [22]:
%%time

# Split A (timed): select n_features with the Relief measure, then score the classifier on them.
start = time.perf_counter()
selector = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train, y_train)
selected_features = selector.selected_features_
x_train_relief = x_train.iloc[:, selected_features]
x_test_relief = x_test.iloc[:, selected_features]
relief_precision_d = model_score(x_train_relief, x_test_relief, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split; x_train_relief / x_test_relief are overwritten.
selector_b = UnivariateFilter(relief_measure,
                          select_k_best(n_features)).fit(x_train_b, y_train_b)
selected_features_b = selector_b.selected_features_
x_train_relief = x_train_b.iloc[:, selected_features_b]
x_test_relief = x_test_b.iloc[:, selected_features_b]
relief_precision_d_b = model_score(x_train_relief, x_test_relief, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dexter_result['relief'] = {}
# Reported precision is the better of the two splits.
dexter_result['relief']['precision'] = max(relief_precision_d, relief_precision_d_b)
dexter_result['relief']['time'] = time_diff
dexter_result['relief']['stability'] = stability_index
dexter_result['relief']['features'] = n_features

Found %d constant features; they would not be used in fit
Found %d constant features; they would not be used in fit


CPU times: user 962 ms, sys: 147 ms, total: 1.11 s
Wall time: 1.49 s


## Boruta

In [23]:
%%time

# BorutaPy is fitted on plain NumPy arrays here rather than on the DataFrames.
x_values = x_train.values
y_values = y_train.values.ravel()

# Split A (timed): Boruta selection around a shallow random forest, then score on the kept features.
start = time.perf_counter()
rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector.fit(x_values, y_values)
# Column labels of the features Boruta kept; used only for the stability and count metrics below.
selected_features = x_train.columns[selector.support_]

x_train_boruta = selector.transform(x_train.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector.transform(x_test.values)

boruta_precision_d = model_score(x_train_boruta, x_test_boruta, y_train, y_test)
end = time.perf_counter()
time_diff = end - start


# Split B (not timed): repeat the selection on the second split.
x_values = x_train_b.values
y_values = y_train_b.values.ravel()

rf = RandomForestClassifier(n_estimators=50, max_depth=5)
selector_b = BorutaPy(rf, n_estimators=50, verbose=1, random_state=1)
selector_b.fit(x_values, y_values)
selected_features_b = x_train_b.columns[selector_b.support_]

x_train_boruta = selector_b.transform(x_train_b.values) # call transform() on X to filter it down to selected features
x_test_boruta = selector_b.transform(x_test_b.values)

boruta_precision_d_b = model_score(x_train_boruta, x_test_boruta, y_train_b, y_test_b)


# Stability: Jaccard overlap between the feature sets chosen on the two splits.
stability_index = jaccard_index(selected_features, selected_features_b, x_train.shape[1])

dexter_result['boruta'] = {}
# Reported precision is the better of the two splits.
dexter_result['boruta']['precision'] = max(boruta_precision_d, boruta_precision_d_b)
dexter_result['boruta']['time'] = time_diff
dexter_result['boruta']['stability'] = stability_index
# n_features is not passed to Boruta, so the larger of the two selection sizes is recorded.
dexter_result['boruta']['features'] = max(len(selected_features), len(selected_features_b))

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

## Dexter result

In [24]:
# Dump the collected per-method metrics for Dexter.
pp.pprint(dexter_result)

{'boruta': {'features': 25,
            'precision': 0.9615384615384616,
            'stability': 0.5862068965517241,
            'time': 19.08413399799997},
 'chi2': {'features': 50,
          'precision': 0.9565217391304348,
          'stability': 0.8518518518518519,
          'time': 4.803842994999968},
 'relief': {'features': 50,
            'precision': 0.78125,
            'stability': 0.25,
            'time': 0.6551037979999137},
 'rf': {'features': 50,
        'precision': 0.9615384615384616,
        'stability': 0.5151515151515151,
        'time': 0.7624066419999735},
 'su': {'features': 50,
        'precision': 0.9629629629629629,
        'stability': 0.7857142857142857,
        'time': 2.1916900409999016},
 'w/o': {'features': 20000,
         'precision': 0.9666666666666667,
         'stability': 1,
         'time': 2.225876378999999}}


In [25]:
# Tabulate the results: one column per method, one row per metric.
dexter_result_df = pd.DataFrame.from_records(dexter_result)
print(dexter_result_df)

              boruta       chi2     relief         rf         su           w/o
precision   0.961538   0.956522   0.781250   0.961538   0.962963      0.966667
time       19.084134   4.803843   0.655104   0.762407   2.191690      2.225876
stability   0.586207   0.851852   0.250000   0.515152   0.785714      1.000000
features   25.000000  50.000000  50.000000  50.000000  50.000000  20000.000000


In [26]:
# Persist the summary table as a spreadsheet.
# NOTE(review): assumes the ./results directory already exists — confirm, or create it beforehand.
dexter_result_df.to_excel('./results/dexter_result.xlsx')