# Objective

Try alternative methods to identify related features. The repeated related-feature models took a considerable amount of time to run, so I want to try some different methods to see if the process can be accelerated.

In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

### 1. Single feature prediction of Target. 
Will a single feature be a strong predictor of the target? Can we identify the true features as those that are strong predictors of the target?

In [2]:
# (features, target, label) triples, one per dataset: three UCI samples followed by three DB samples.
data_target = list(zip(
    (Xuci_1, Xuci_2, Xuci_3, Xdb_1, Xdb_2, Xdb_3),
    (yuci_1, yuci_2, yuci_3, ydb_1, ydb_2, ydb_3),
    ('uci_1', 'uci_2', 'uci_3', 'db_1', 'db_2', 'db_3'),
))

In [3]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [6]:
# no need to scale since we are only testing one feature at a time.
def feature_test(X, y, classifier):
    """Score every column of ``X`` on its own as a predictor of ``y``.

    Each column is evaluated with 10-fold stratified cross-validation
    (shuffled, ``random_state=42``). For every fold a fresh clone of
    ``classifier`` is fitted on that single column, and the mean of
    ``model.score`` over the folds is recorded for the train and the
    held-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column is tested independently.
    y : array-like
        Target labels, one per row of ``X``. Indexed by position, so a
        pandas Series with a non-default index is handled correctly.
    classifier : estimator
        Scikit-learn style estimator exposing ``fit`` and ``score``. It is
        cloned before every fit, so the passed instance is never fitted.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the columns ``feature``,
        ``train_score`` and ``test_score``.
    """
    # The fold indices are positions, so the target must be indexed by
    # position too (``y[idx]`` on a Series would be a label lookup).
    y_values = np.asarray(y)
    if y_values.ndim == 2 and y_values.shape[1] == 1:
        y_values = y_values.ravel()

    # The split only depends on the number of rows and on y, and the seed is
    # fixed, so the folds are identical for every column: compute them once.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    folds = list(skf.split(X, y_values))

    mean_scores = []
    for col in tqdm(X.columns):
        Xcol = X[[col]]
        train_scores = []
        test_scores = []

        for train_cv_index, val_cv_index in folds:
            X_train_temp = Xcol.iloc[train_cv_index, :]
            y_train_temp = y_values[train_cv_index]
            X_test_temp = Xcol.iloc[val_cv_index, :]
            y_test_temp = y_values[val_cv_index]

            # Fresh, unfitted copy per fold: no state (e.g. warm starts) can
            # leak between folds or columns.
            model = clone(classifier)
            model.fit(X_train_temp, y_train_temp)

            train_scores.append(model.score(X_train_temp, y_train_temp))
            test_scores.append(model.score(X_test_temp, y_test_temp))

        # Store the mean scores for this feature.
        mean_scores.append({'feature': col,
                            'train_score': np.mean(train_scores),
                            'test_score': np.mean(test_scores)})

    # Explicit columns keep the expected schema even when X has no columns.
    return pd.DataFrame(mean_scores,
                        columns=['feature', 'train_score', 'test_score'])

In [7]:
# Single-feature scores on uci_1 using a decision tree with default settings.
uci_feature_test = feature_test(X=Xuci_1, y=yuci_1, classifier=DecisionTreeClassifier())

100%|██████████| 500/500 [00:22<00:00, 22.01it/s]


In [8]:
# Five features with the highest mean validation score.
uci_feature_test.sort_values(by='test_score', ascending=False).head(n=5)

Unnamed: 0,feature,test_score,train_score
86,86,0.593277,0.610607
339,339,0.582363,0.761622
125,125,0.579999,0.737384
176,176,0.577457,0.68763
23,23,0.577404,0.651772


In [9]:
# Cast the feature labels to int so they can be compared with the integer ids of the known features.
uci_feature_test['feature'] = uci_feature_test['feature'].astype(int)

# np.sort returns a sorted copy; sorting the Series' underlying array in place
# fails when pandas hands out a read-only array (Copy-on-Write).
top_features = np.sort(
    uci_feature_test.sort_values('test_score', ascending=False).head(20)['feature'].values
)
top_features

array([ 18,  23,  30,  42,  56,  65,  68,  86, 102, 116, 125, 140, 176,
       189, 197, 249, 339, 389, 403, 444])

In [10]:
# Column ids of the 20 features that prior work points to as the 'right' ones for the UCI data.
uci_true_features = np.array([
    28, 48, 64, 105, 128,
    153, 241, 281, 318, 336,
    338, 378, 433, 442, 451,
    453, 455, 472, 475, 493,
])

In [11]:
# How many of the top-scoring features are among the known true features?
sum(1 for feature_id in top_features if feature_id in uci_true_features)

0

From prior work, I have strong reason to believe that the 'right' features are the following:

`array([ 28,  48,  64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493])`

It appears that we identify none of the correct features using `DecisionTreeClassifier()` on the `uci_1` data.

Let's try to use KNeighborsClassifier()

In [12]:
# Single-feature scores on uci_1 using k-nearest neighbours with default settings.
uci_feature_test = feature_test(X=Xuci_1, y=yuci_1, classifier=KNeighborsClassifier())

100%|██████████| 500/500 [00:28<00:00, 17.25it/s]


In [13]:
# Cast the feature labels to int so they can be compared with the integer ids of the known features.
uci_feature_test['feature'] = uci_feature_test['feature'].astype(int)

# np.sort returns a sorted copy; sorting the Series' underlying array in place
# fails when pandas hands out a read-only array (Copy-on-Write).
top_features = np.sort(
    uci_feature_test.sort_values('test_score', ascending=False).head(20)['feature'].values
)
top_features

array([ 42,  56,  68,  79, 129, 140, 176, 180, 184, 204, 227, 249, 298,
       310, 360, 370, 374, 376, 405, 484])

In [14]:
# How many of the top-scoring features are among the known true features?
sum(1 for feature_id in top_features if feature_id in uci_true_features)

0

No good with KNN.

In [15]:
# Single-feature scores on uci_1 using logistic regression with default settings.
uci_feature_test = feature_test(Xuci_1, yuci_1, LogisticRegression())

# Cast the feature labels to int so they can be compared with the integer ids of the known features.
uci_feature_test['feature'] = uci_feature_test['feature'].astype(int)

# np.sort returns a sorted copy; sorting the Series' underlying array in place
# fails when pandas hands out a read-only array (Copy-on-Write).
top_features = np.sort(
    uci_feature_test.sort_values('test_score', ascending=False).head(20)['feature'].values
)
top_features

100%|██████████| 500/500 [00:24<00:00, 20.52it/s]


array([ 48,  64,  75, 105, 130, 153, 199, 241, 271, 272, 290, 336, 338,
       378, 431, 433, 442, 463, 472, 475])

In [16]:
# How many of the top-scoring features are among the known true features?
sum(1 for feature_id in top_features if feature_id in uci_true_features)

12

12 features isn't bad. Let's see where the right features are.

In [17]:
# Scores of the known true features (restored so the table below is reproduced on a fresh run).
uci_feature_test[uci_feature_test['feature'].isin(uci_true_features)]

Unnamed: 0,feature,test_score,train_score
28,28,0.513641,0.513636
48,48,0.554156,0.557064
64,64,0.575386,0.572988
105,105,0.543194,0.544948
128,128,0.513641,0.513636
153,153,0.533788,0.53889
241,241,0.570574,0.576013
281,281,0.522838,0.530801
318,318,0.511419,0.513385
336,336,0.577659,0.574754


In [18]:
# The 20 best single-feature validation scores (restored so the table below is reproduced on a fresh run).
uci_feature_test.sort_values('test_score', ascending=False).head(20)

Unnamed: 0,feature,test_score,train_score
475,475,0.57972,0.579044
336,336,0.577659,0.574754
64,64,0.575386,0.572988
338,338,0.570889,0.57147
241,241,0.570574,0.576013
272,272,0.565673,0.564139
199,199,0.564131,0.562884
378,378,0.558958,0.55126
48,48,0.554156,0.557064
472,472,0.552193,0.560096


Let's see if more data helps by testing with data sourced from the Madelon db

In [19]:
# Single-feature scores on the first DB sample using logistic regression with default settings.
db1_feature_test = feature_test(X=Xdb_1, y=ydb_1, classifier=LogisticRegression())

100%|██████████| 1000/1000 [00:50<00:00, 19.98it/s]


In [20]:
# np.sort returns a sorted copy; sorting the Series' underlying array in place
# fails when pandas hands out a read-only array (Copy-on-Write).
top_db1_features = np.sort(
    db1_feature_test.sort_values('test_score', ascending=False).head(20)['feature'].values
)
top_db1_features

array(['feat_226', 'feat_269', 'feat_336', 'feat_341', 'feat_416',
       'feat_443', 'feat_480', 'feat_543', 'feat_559', 'feat_639',
       'feat_681', 'feat_701', 'feat_707', 'feat_769', 'feat_778',
       'feat_808', 'feat_829', 'feat_873', 'feat_920', 'feat_956'], dtype=object)

'True' Madelon DB predictors (11 of the 20 top-scoring features above are among them):

`[257, 269, 308, 315, 336, 341, 395, 504, 526, 639, 681, 701, 724, 736, 769, 808, 829, 867, 920, 956]`

### 2. Correlations of features. 
Pandas has a built-in `.corr()` method that computes the pairwise correlation between the columns of a DataFrame. I originally disregarded this approach since parsing the correlation matrix seemed like more effort than it was worth. Let's see if I can find a clean way to parse the correlation matrix to identify related features.

In [57]:
# Pairwise Pearson correlation (the pandas default) between all columns of uci_1.
corr_test = Xuci_1.corr(method='pearson')

In [58]:
# Peek at the first five rows of the correlation matrix.
corr_test.iloc[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,1.0,0.001863,-0.030589,0.079044,-0.01602,0.045018,-0.022883,0.002588,0.031829,0.026756,...,0.013224,0.034647,0.023064,-0.056057,-0.012904,-0.01046,0.017305,-0.055814,0.057639,-0.033592
1,0.001863,1.0,-0.045873,0.112781,0.072174,0.027823,-0.086126,0.008964,-0.005845,-0.048256,...,0.007706,-0.101836,-0.027601,0.00641,0.049305,0.022063,0.04879,0.001745,-0.037004,0.06049
2,-0.030589,-0.045873,1.0,-0.01234,-0.050528,0.060894,0.035209,-0.050305,0.059404,0.040547,...,0.012974,0.044949,0.097373,-0.016076,0.002061,-0.014319,0.039001,0.016703,0.015267,0.088202
3,0.079044,0.112781,-0.01234,1.0,0.002612,0.064137,-0.011659,-0.070683,0.033268,-0.004106,...,0.032361,-0.031768,-0.043497,-0.000271,0.078904,0.012186,0.041357,-0.061805,-0.050831,0.034481
4,-0.01602,0.072174,-0.050528,0.002612,1.0,0.00071,-0.049369,0.047841,-0.02797,-0.049692,...,-0.0249,0.0021,0.002794,0.042962,0.012581,-0.008397,0.042619,0.015844,-0.021339,-0.067939


In [59]:
# zero at the diagonal.
for i in corr_test.columns:
    corr_test.loc[i,i] = 0

corr_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.001863,-0.030589,0.079044,-0.01602,0.045018,-0.022883,0.002588,0.031829,0.026756,...,0.013224,0.034647,0.023064,-0.056057,-0.012904,-0.01046,0.017305,-0.055814,0.057639,-0.033592
1,0.001863,0.0,-0.045873,0.112781,0.072174,0.027823,-0.086126,0.008964,-0.005845,-0.048256,...,0.007706,-0.101836,-0.027601,0.00641,0.049305,0.022063,0.04879,0.001745,-0.037004,0.06049
2,-0.030589,-0.045873,0.0,-0.01234,-0.050528,0.060894,0.035209,-0.050305,0.059404,0.040547,...,0.012974,0.044949,0.097373,-0.016076,0.002061,-0.014319,0.039001,0.016703,0.015267,0.088202
3,0.079044,0.112781,-0.01234,0.0,0.002612,0.064137,-0.011659,-0.070683,0.033268,-0.004106,...,0.032361,-0.031768,-0.043497,-0.000271,0.078904,0.012186,0.041357,-0.061805,-0.050831,0.034481
4,-0.01602,0.072174,-0.050528,0.002612,0.0,0.00071,-0.049369,0.047841,-0.02797,-0.049692,...,-0.0249,0.0021,0.002794,0.042962,0.012581,-0.008397,0.042619,0.015844,-0.021339,-0.067939


In [60]:
# take the absolute value of correlations. We only care about the magnitude, not the direction, of the correlations
corr_test = abs(corr_test)

In [61]:
# Strongest correlation of each feature with any other feature, largest first.
# There is a pretty clear drop in correlations after the 20th.
corr_test.max().sort_values(ascending=False).head(25)

64     0.992330
336    0.992330
451    0.990578
28     0.990578
318    0.990541
153    0.990379
281    0.990379
433    0.990082
105    0.989993
128    0.989993
241    0.988937
475    0.988937
48     0.988595
378    0.988595
493    0.988309
453    0.988309
472    0.988133
442    0.988133
455    0.725369
338    0.685807
486    0.216672
269    0.216672
162    0.205203
389    0.205203
144    0.203834
dtype: float64

In [34]:
# `corr_test` is the absolute correlation matrix with a zeroed diagonal built above.
# The 20 features with the strongest correlation to another feature, as sorted int ids.
best_features = sorted(
    int(i) for i in corr_test.max().sort_values(ascending=False)[:20].index
)
best_features

[28,
 48,
 64,
 105,
 128,
 153,
 241,
 281,
 318,
 336,
 338,
 378,
 433,
 442,
 451,
 453,
 455,
 472,
 475,
 493]

In [35]:
# How many of the correlation-selected features are among the known true features?
sum(1 for feature_id in best_features if feature_id in uci_true_features)

20

Woohoo! This approach is much faster than the iterative model training method.

In [36]:
# Functionalize and run against all datasets.

def test_corr(df):
    # get the absolute values of correlations
    corr_df = abs(df.corr())
    
    # zero out the diagonal
    for i in corr_df.columns:
        corr_df.loc[i,i] = 0
    
    top_features = corr_df.max().sort_values(ascending=False)[:20].index
    return np.array(top_features)

In [46]:
# Correlation-based feature selection for every UCI sample ...
uci_1_features, uci_2_features, uci_3_features = [
    test_corr(frame) for frame in (Xuci_1, Xuci_2, Xuci_3)
]

# ... and for every DB sample.
db_1_features, db_2_features, db_3_features = [
    test_corr(frame) for frame in (Xdb_1, Xdb_2, Xdb_3)
]


In [48]:
# Sort each selected-feature array in place so the arrays can be compared element by element.
for selected in (uci_1_features, uci_2_features, uci_3_features,
                 db_1_features, db_2_features, db_3_features):
    selected.sort()

In [51]:
# Element-wise comparison of the sorted features selected for uci_1 and uci_2.
uci_1_features == uci_2_features

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [52]:
# Element-wise comparison of the sorted features selected for uci_2 and uci_3.
uci_2_features == uci_3_features

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [53]:
# Element-wise comparison of the sorted features selected for db_1 and db_2.
db_1_features == db_2_features

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [62]:
# Show the (sorted) features selected for db_1.
db_1_features

array(['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
       'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
       'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
       'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956'], dtype=object)