In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
from sklearn.metrics import adjusted_mutual_info_score
from scipy.stats import special_ortho_group

import tools

In [None]:
# Number of feature columns (named f0 ... f{d-1}) used from the data set.
d = 30
# Column indices of the two features the target is expected to depend on.
expected_features = np.array([10, 27])
# Raw data: feature columns plus the target column.
df = pd.read_csv('data/data.csv')

In [None]:
# Names of the d feature columns and of the single target column.
features = [f'f{n}' for n in range(d)]
targets = ['y']
# Feature / target sub-frames and their 2-D NumPy counterparts.
xdf, ydf = df[features], df[targets]
x, y = xdf.values, ydf.values

## Uncovering the dependence between target and features

We check that 
$$
y =
\begin{cases}
1 & \text{  if } x_{k_{0}} = x_{k_{1}}
\\
0 & \text { otherwise},
\end{cases}
$$
where $k_0 = 10$ and $k_1 = 27$
are the expected features.

Of the thirty features $0, \dots, 29$ in `df`, a feature selection is correct if it keeps features 10 and 27 and discards all others.

In [None]:
# The target must equal 1 exactly on the rows where the two expected feature
# columns hold the same value, and 0 elsewhere.
test = (x[:, expected_features[0]] == x[:, expected_features[1]]).astype(int)
assert np.all(test == y[:, 0])

## Preliminary check: expected features bear the highest information content

Of all $\binom{d}{2}$ pairs of features, we check that the expected pair $(10, 27)$ has the highest mutual information with the target.

In [None]:
def onedimlabel(x):
    """Collapse every row of a 2-D label array into a single scalar label.

    Rows are encoded in mixed radix: column ``i`` is weighted by the product of
    ``(column maximum + 1)`` over all preceding columns. For non-negative
    integer input, two rows therefore receive the same code if and only if the
    rows are identical, for any number of columns.

    Parameters
    ----------
    x : np.ndarray, shape (n_samples, n_columns)
        Non-negative integer labels, one column per variable. The array is
        not modified.

    Returns
    -------
    np.ndarray, shape (n_samples,)
        One code per row, with the same dtype as ``x``.

    Raises
    ------
    AssertionError
        If ``x`` is not 2-dimensional.
    """
    assert x.ndim == 2
    # Largest label per column; column j takes values in 0 .. ns[j].
    ns = np.amax(x, axis=0)
    # Work on a copy so the in-place accumulation below never touches x.
    res = np.array(x[:, 0], copy=True)
    m = 1
    for i in range(1, x.shape[1]):
        # Everything accumulated so far is < m * (ns[i-1] + 1), so using that
        # product as the next weight keeps the codes collision-free.
        m *= ns[i - 1] + 1
        res += m * x[:, i]
    return res

In [None]:
# Size of the feature subsets that are scored (pairs).
l = 2
# Adjusted mutual information between the target and the joint label of each
# l-subset of feature columns, keyed by the tuple of column indices.
miscores = {
    subset: adjusted_mutual_info_score(
        onedimlabel(x[:, list(subset)]),
        y[:, 0],
    )
    for subset in itertools.combinations(range(d), l)
}

In [None]:
# Select the pair with the largest adjusted mutual information. Taking max()
# over the scores themselves (rather than scanning against an initial floor of
# 0) keeps the result a true arg-max even when no score is positive; adjusted
# mutual information can be negative. Ties resolve to the first pair in
# iteration order. Raises ValueError if miscores is empty.
s = max(miscores, key=miscores.get)
mi = miscores[s]
highest_info = s

In [None]:
# .tolist() turns the NumPy integers into plain Python ints so that both lists
# print in the same form (NumPy >= 2 would otherwise render np.int64(10), ...).
print(f'Expected features: {sorted(expected_features.tolist())}')
print(f'Pair of features with highest information content: {sorted(highest_info)}')

### Selection with marginal 1D KSG mutual information

In [None]:
# Marginal (one feature at a time) KSG mutual-information selection through the
# project helper `tools.ksgmi`, which is not shown in this notebook. The call
# returns two values; the first is used below as the selected features and the
# second presumably holds the mutual-information scores — TODO confirm, along
# with the exact meaning of threshold=0.05, against the helper's definition.
ksgselection, mis = tools.ksgmi(xdf, ydf, threshold=0.05)

In [None]:
# .tolist() turns the NumPy integers into plain Python ints so the expected
# list prints as [10, 27] (NumPy >= 2 would otherwise render np.int64(10), ...).
print(f'Expected features: {sorted(expected_features.tolist())}')
print(f'Marginal KSG selection: {sorted(ksgselection)}')

### Selection with HSIC Lasso

### Selection with Boruta

In [None]:
from arfs.feature_selection import allrelevant
from arfs.feature_selection.allrelevant import Leshy
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyper-parameters handed to the Leshy (Boruta) selector below. The grouping is
# editorial; see the arfs Leshy documentation for the exact meaning of each one.

# Forest size and feature-importance measure
n_estimators = 'auto'
importance = "shap"

# Acceptance / rejection test
perc = 95
alpha = 0.05
two_step = True
keep_weak = False

# Run control
max_iter = 100
random_state = 1234
verbose = 0

In [None]:
# Target as a 1-D Series named 'y', and the feature matrix re-wrapped in a
# fresh DataFrame with columns f0 ... f{d-1}; these are the inputs to the fit.
yser = pd.Series(y[:, 0], name='y')
xdf = pd.DataFrame(data=x, columns=[f'f{idx}' for idx in range(d)])

In [None]:
# Base estimator for Boruta. The forest is seeded with the notebook's
# random_state so that it is reproducible on its own, independently of how the
# selector handles seeding.
rf = RandomForestClassifier(n_jobs=-1, max_depth=8, random_state=random_state)

In [None]:
# Build the Boruta-style selector around the random forest `rf`, forwarding by
# keyword the hyper-parameters defined in the configuration cell above.
leshy = Leshy(
    rf,
    n_estimators=n_estimators,
    perc=perc,
    alpha=alpha,
    importance=importance,
    two_step=two_step,
    max_iter=max_iter,
    random_state=random_state,
    verbose=verbose,
    keep_weak=keep_weak,
)

In [None]:
# Run the selection on the feature frame and the target series.
leshy.fit(xdf, yser)
# Selected columns are named 'f<k>'; dropping the letter leaves the integer
# column index k, comparable with expected_features.
leshy_selection = [
    int(name.replace('f', ''))
    for name in leshy.selected_features_
]

In [None]:
# .tolist() turns the NumPy integers into plain Python ints so that both lists
# print in the same form (NumPy >= 2 would otherwise render np.int64(10), ...).
print(f'Expected features: {sorted(expected_features.tolist())}')
print(f'Boruta selection: {sorted(leshy_selection)}')