In [None]:
import numpy as np
import pandas as pd
import itertools
from sklearn.metrics import mutual_info_score, adjusted_mutual_info_score
import seaborn as sns

import tools

## Synthesize the data

In [None]:
# Problem dimensions for the synthetic dataset.
k = 5        # number of categories of each of the two informative features
n = 10_000   # number of samples (rows)
d = 10       # total number of features (columns), informative + noise

In [None]:
# Seeded generator so that Restart & Run All reproduces the same dataset.
rng = np.random.default_rng(0)

# The two informative features: independent uniform draws from {0, ..., k-1}.
x0 = rng.integers(k, size=(n, 1))
x1 = rng.integers(k, size=(n, 1))

# The remaining d-2 features are noise: feature j is drawn from
# {0, ..., ms[j]-1}, with each cardinality ms[j] itself drawn from {4, ..., 7}.
ms = rng.integers(low=4, high=8, size=d - 2)
others = [rng.choice(m, size=(n, 1)) for m in ms]
all_ = np.concatenate([x0, x1] + others, axis=1)

# Target: 1 where the two informative features agree, 0 otherwise; shape (n, 1).
y = np.asarray(x0 == x1, dtype=int)

# Shuffle the columns by right-multiplying with a random permutation matrix,
# so the informative features do not sit at known positions in x.
permuter = rng.permutation(np.eye(d, dtype=int).T).T
x = np.array(all_ @ permuter, dtype=int)

# Column j of x equals column i of all_ whenever permuter[i, j] == 1, so the
# position of the 1 in rows 0 and 1 tells us where x0 and x1 ended up.
expected_features = [np.argmax(permuter[0, :]), np.argmax(permuter[1, :])]

In [None]:
# Sanity check: after the column shuffle, the two informative features sit at
# the column positions recorded in `expected_features`.
assert np.array_equal(x[:, expected_features[0]], x0[:, 0])
assert np.array_equal(x[:, expected_features[1]], x1[:, 0])

In [None]:
# pandas views of the data: one column per feature (named x0 .. x{d-1}) and
# the target flattened from shape (n, 1) to a Series named 'y'.
feature_names = [f'x{i}' for i in range(d)]
xdf = pd.DataFrame(x, columns=feature_names)
ydf = pd.Series(y[:, 0], name='y')

## Uncover the dependence between target and features

We check that
$$
y =
\begin{cases}
1 & \text{if } x_{k_0} = x_{k_1},
\\
0 & \text{otherwise},
\end{cases}
$$
where $k_0$ and $k_1$ are the column indices of the expected features.

In [None]:
# Rebuild the target from the shuffled feature matrix alone and confirm that
# it matches y exactly.
first, second = expected_features
test = (x[:, first] == x[:, second]).astype(int)
assert np.array_equal(test, y[:, 0])

## Preliminary check: expected features bear the highest information content

Of all $\binom{d}{2}$ pairs of features, we check that the expected pair $\lbrace k_0, k_1 \rbrace$ has the highest (adjusted) mutual information with the target.

In [None]:
# Adjusted mutual information between the target and every subset of `l`
# features, keyed by the tuple of column indices.
# NOTE(review): tools.onedimlabel presumably collapses the selected columns
# into a single label per row -- confirm against the tools module.
l = 2
miscores = {
    subset: adjusted_mutual_info_score(
        tools.onedimlabel(x[:, list(subset)]),
        y[:, 0],
    )
    for subset in itertools.combinations(range(d), l)
}

In [None]:
# Pick the feature pair with the highest score.
#
# The previous loop used `k` as its loop variable, which silently replaced the
# global `k` (number of categories) with a tuple and broke any later re-run of
# the data-synthesis cell. `max` avoids leaking loop variables altogether.
#
# As before, (0, 1) with score 0 is the fallback when no pair scores strictly
# above 0, and ties go to the first pair in iteration order.
s, mi = (0, 1), 0
if miscores:
    top_pair = max(miscores, key=miscores.get)
    if miscores[top_pair] > mi:
        s, mi = top_pair, miscores[top_pair]
highest_info = s

In [None]:
# Both index lists are sorted so the two lines can be compared by eye.
print(
    f'Expected features: {sorted(expected_features)}',
    f'Pair of features with highest information content: {sorted(highest_info)}',
    sep='\n',
)

### Selection with marginal 1D KSG mutual information

In [None]:
# Feature selection from per-feature (marginal) KSG mutual information.
# NOTE(review): presumably `ksgselection` holds the features whose score
# exceeds the threshold and `mis` the scores themselves -- confirm in tools.ksgmi.
ksg_threshold = 0.05
ksgselection, mis = tools.ksgmi(xdf, ydf, threshold=ksg_threshold)

In [None]:
# Compare the marginal selection against the ground-truth feature pair.
print(
    f'Expected features: {sorted(expected_features)}',
    f'Marginal KSG selection: {sorted(ksgselection)}',
    sep='\n',
)

## Selection via exclusion 

In [None]:
# Estimator used for every mutual-information score in this section.
# Swap in adjusted_mutual_info_score to use the chance-adjusted variant instead.
mi_estimator = mutual_info_score

In [None]:
# Baseline: mutual information between the target and all d features taken
# jointly; the leave-one-out scores below are compared against this value.
all_features_labels = tools.onedimlabel(x)
allfeatures_mi = mi_estimator(all_features_labels, y[:, 0])

In [None]:
# Leave-one-out scores: for each feature, the mutual information between the
# target and the remaining d-1 features, keyed by the index of the feature
# that was left out.
miscores_exclusions = {
    (set(range(d)) - set(subset)).pop(): mi_estimator(
        tools.onedimlabel(x[:, list(subset)]),
        y[:, 0],
    )
    for subset in itertools.combinations(range(d), d - 1)
}

In [None]:
# Change in mutual information when a feature is left out:
# (MI without the feature) - (MI with all features).
# The intermediate `scores` list that used to be built here was dead code:
# `scores` is rebound to a DataFrame before it is ever read.
mi_difference = {
    feature: excluded - allfeatures_mi
    for feature, excluded in miscores_exclusions.items()
}

In [None]:
# One-row table of the leave-one-out differences, one column per feature,
# with the columns ordered by feature index.
scores = pd.DataFrame(
    {feature: [value] for feature, value in mi_difference.items()}
).sort_index(axis=1)

In [None]:
# Display the one-row table of per-feature MI differences.
scores

In [None]:
# One bar per feature (wide-form input: each DataFrame column becomes a bar).
# `data=` is passed by keyword because older seaborn releases treat the first
# positional argument as `x` rather than `data`.
ax = sns.barplot(data=scores)
ax.set(
    xlabel='Excluded feature',
    ylabel='MI without feature - MI with all features',
    title='Change in mutual information when a feature is excluded',
);