# Task 3: Feature importance

In [110]:
import pickle
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from scipy.stats import pearsonr

## Load data

In [148]:
# Load the pickled dataset; the file unpacks into two objects, bound to X
# (indexed below as a 2-D feature matrix) and y (the regression target).
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only run this cell on a data file from a trusted source.
with open("./task3_feature-importance_data.pickle", "rb") as f:
    X, y = pickle.load(f)

## Fit model

In [149]:
# Fit a ridge regression of y on X. RidgeCV is constructed with all defaults,
# so the regularisation strength is chosen by its built-in cross-validation.
# The bare last expression displays the fitted estimator as the cell output.
model = RidgeCV()
model.fit(X, y)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

## Analyse correlations and model coefficients

In [150]:
# Pearson correlation of every feature column with the target.
# pearsonr returns the coefficient r and its p-value together, so run it once
# per column and split the result instead of computing each test twice.
_pearson_results = [pearsonr(X[:, i], y) for i in range(X.shape[1])]
correlations_r = [result[0] for result in _pearson_results]
correlations_p = [result[1] for result in _pearson_results]
    

In [151]:
# Summary table: the fitted ridge coefficient of each feature next to that
# feature's univariate Pearson r and p-value, one row per feature.
df = pd.DataFrame(
    data={
        "coef": model.coef_,
        "correlation r": correlations_r,
        "correlation p": correlations_p,
    },
    index=[f"feature {i}" for i in range(X.shape[1])],
)
df

Unnamed: 0,coef,correlation r,correlation p
feature 0,7.025404,0.188073,0.06095162
feature 1,2.377854,0.092541,0.3598008
feature 2,33.485614,0.138256,0.1701322
feature 3,78.315351,0.569326,6.374702e-10
feature 4,79.870733,0.523729,2.252291e-08
feature 5,-3.73937,-0.105416,0.2965699
feature 6,-2.360193,-0.129693,0.1984219
feature 7,32.780314,0.141958,0.1588725
feature 8,38.077457,0.030352,0.764348
feature 9,-4.928516,-0.020712,0.837932


In [152]:
# Features 7 and 8 are binary: list the distinct values found in each column.
tuple(np.unique(X[:, column]) for column in (7, 8))

(array([0., 1.]), array([0., 1.]))

In [154]:
# Features 7 and 8 are mutually exclusive: count the rows where both equal 1.
np.logical_and(X[:, 7] == 1, X[:, 8] == 1).sum()

0

In [155]:
# Features 7 and 8 together cover nearly the whole dataset:
# fraction of rows in which at least one of the two equals 1.
np.logical_or(X[:, 7] == 1, X[:, 8] == 1).sum() / len(X)

0.9

## Tasks

Features 7 and 8 seem to be important features for the model (with coefficients > 30!). However, on closer inspection, they are both binary, mutually exclusive, and together cover nearly the whole dataset (90% of the samples). Also, each of them is barely correlated with the outcome on its own. I would not expect both of them to have such high importance for the model and, on top of that, for both coefficients to be positive! What is going on?