In [4]:
import numpy as np

In [31]:
# Load the precomputed table of selected features for the "optimized_with_outlier" runs.
all_feats = np.load(
    "real_output/optimized_with_outlier/all_features.npy"
)
print(all_feats.shape)

(537, 2)


In [32]:
# 537 distinct features were selected across all 36 runs of "optimized_with_outlier".
# That is too many to run an exhaustive hyperparameter tuning + feature selection.
# Instead, look only at the features selected by each model, only at features selected more than once, or both.


from pathlib import Path
from os import listdir
from os.path import join
import json
# rewriting function from feature_selection_analysis
def scrape_selected_model(directory: str | Path, model_name: str) -> np.ndarray:
    # returns an array of shape (all selected features, 2), where second entry is frequency of selection

    selected = []
    for file in listdir(directory):
        split = file.split(".")
        if split[-1] == "json":
            underscore = split[0].split("_")
            if(underscore[-1] != "scores" and underscore[1] == model_name):  # only json files in directory should be feat_select_mae_scores and all selected feat
                path = join(directory, file)
                with open(path, "r") as f:
                    temp = json.load(f)
                if len(selected) == 0:
                    for mol in temp:
                        selected.append([mol, 1])
                else:
                    for mol in temp:
                        for set in selected:
                            add = True
                            if mol == set[0]:
                                set[1] += 1
                                add = False
                                break
                        if add:
                            selected.append([mol, 1])
 
    selected = sorted(selected, key=lambda x: int(x[0]))


    result = np.array(selected, np.int32)
    #np.save(directory + "all_features.npy", result)
    return result

In [33]:
# Features selected by Elastic Net in any run, with their selection counts.
all_EN = scrape_selected_model(
    directory="real_output/optimized_with_outlier/",
    model_name="Elastic Net",
)
print(all_EN.shape)

(298, 2)


In [34]:
# Features selected by Ridge in any run, with their selection counts.
all_Ridge = scrape_selected_model(
    directory="real_output/optimized_with_outlier/",
    model_name="Ridge",
)
print(all_Ridge.shape)

(131, 2)


In [35]:
# Features selected by Lasso in any run, with their selection counts.
all_Lasso = scrape_selected_model(
    directory="real_output/optimized_with_outlier/",
    model_name="Lasso",
)
print(all_Lasso.shape)

(206, 2)


Interestingly, this approach also reveals differences in which features each model selects.
E.g. Elastic Net selected far more distinct features (298) than Lasso (206) or Ridge (131).

In [36]:
def filter_feats(feats: np.ndarray, threshold: int=2) -> np.ndarray:
    """Keep the rows of a (feature id, count) table whose count meets a threshold.

    Parameters
    ----------
    feats:
        Two-column array-like; column 1 is compared against ``threshold``.
    threshold:
        Minimum value of column 1 (inclusive) for a row to be kept.

    Returns
    -------
    np.ndarray
        New array holding the kept rows in their original order, with the
        dtype of the input. When no row qualifies the result is empty but
        still two-dimensional, so column slicing such as ``result[:, 0]``
        keeps working.
    """
    table = np.asarray(feats)
    if table.size == 0:
        # Normalise any empty input (including a 1-D empty array) to zero rows.
        n_cols = table.shape[1] if table.ndim == 2 else 2
        return table.reshape(0, n_cols)
    # Boolean-mask selection returns a copy and preserves dtype and column count.
    return table[table[:, 1] >= threshold]

In [37]:
# Features whose count in the combined table is at least the default threshold.
all_thresh = filter_feats(all_feats)
print(all_thresh.shape, all_thresh, sep="\n")

(133, 2)
[[  294     5]
 [  296     4]
 [  297     3]
 [  298     3]
 [  299     2]
 [ 3626     2]
 [ 3679     3]
 [ 3680     3]
 [ 3704     2]
 [ 3705     3]
 [ 3707     3]
 [ 3711     2]
 [ 4381     2]
 [ 4437     2]
 [ 5081     5]
 [ 5680     2]
 [ 5757     3]
 [ 5786     3]
 [ 5808     2]
 [ 5809     2]
 [ 5835     2]
 [ 5836     2]
 [ 6329     2]
 [ 6330     2]
 [ 6388     3]
 [ 6437     2]
 [ 6462     3]
 [ 6509     2]
 [ 6510     5]
 [ 6515     2]
 [ 6537     2]
 [ 6540     2]
 [ 6923     2]
 [ 7059     2]
 [ 7083     4]
 [ 7111     7]
 [ 7112     3]
 [ 7114     2]
 [ 7135     2]
 [ 7139     2]
 [ 7162     2]
 [ 7164     2]
 [ 7193     2]
 [ 7210     2]
 [ 7577     2]
 [ 7651     2]
 [ 7653     2]
 [ 7731     3]
 [ 7732     4]
 [ 7785     5]
 [ 7811     2]
 [ 7818     2]
 [ 7886     2]
 [ 8224     2]
 [ 8251     2]
 [ 8298     2]
 [ 8328     2]
 [ 8354     8]
 [ 8355     2]
 [ 8356     2]
 [ 8384     3]
 [ 8412     2]
 [ 8488     4]
 [ 8490     3]
 [ 8491     3]
 [ 8514     2]
 

In [38]:
# Elastic Net features whose selection count is at least the default threshold.
EN_thresh = filter_feats(all_EN)
print(EN_thresh.shape, EN_thresh, sep="\n")

(87, 2)
[[  294     2]
 [ 3679     3]
 [ 3680     3]
 [ 3704     2]
 [ 3705     2]
 [ 4381     2]
 [ 5081     2]
 [ 5680     2]
 [ 5757     2]
 [ 5786     2]
 [ 5808     2]
 [ 5809     2]
 [ 5835     2]
 [ 5836     2]
 [ 6329     2]
 [ 6437     2]
 [ 6462     3]
 [ 6510     2]
 [ 6515     2]
 [ 6537     2]
 [ 6540     2]
 [ 6923     2]
 [ 7059     2]
 [ 7083     2]
 [ 7111     3]
 [ 7112     3]
 [ 7114     2]
 [ 7135     2]
 [ 7162     2]
 [ 7164     2]
 [ 7210     2]
 [ 7577     2]
 [ 7651     2]
 [ 7653     2]
 [ 7731     2]
 [ 7732     2]
 [ 7785     2]
 [ 7811     2]
 [ 7886     2]
 [ 8251     2]
 [ 8298     2]
 [ 8328     2]
 [ 8354     3]
 [ 8355     2]
 [ 8356     2]
 [ 8384     2]
 [ 8412     2]
 [ 8488     2]
 [ 8490     2]
 [ 8491     2]
 [ 8514     2]
 [ 8516     2]
 [ 9032     4]
 [ 9033     2]
 [ 9037     2]
 [ 9056     2]
 [ 9082     3]
 [ 9083     2]
 [ 9137     2]
 [ 9192     2]
 [ 9292     2]
 [ 9293     2]
 [ 9682     2]
 [ 9706     2]
 [ 9707     2]
 [ 9709     6]
 [

In [39]:
# Ridge features whose selection count is at least the default threshold.
Ridge_thresh = filter_feats(all_Ridge)
print(Ridge_thresh.shape, Ridge_thresh, sep="\n")

(18, 2)
[[ 3711     2]
 [ 5081     2]
 [ 6510     2]
 [ 7111     2]
 [ 8354     2]
 [ 9033     2]
 [ 9083     2]
 [ 9089     2]
 [ 9165     2]
 [ 9296     2]
 [ 9706     3]
 [ 9709     5]
 [10493     2]
 [11016     2]
 [12547     2]
 [13197     7]
 [13795     2]
 [15175     2]]


In [40]:
# Lasso features whose selection count is at least the default threshold.
Lasso_thresh = filter_feats(all_Lasso)
print(Lasso_thresh.shape, Lasso_thresh, sep="\n")

(36, 2)
[[  294     2]
 [  296     2]
 [  297     2]
 [  298     2]
 [  299     2]
 [ 3707     2]
 [ 6388     2]
 [ 7083     2]
 [ 7111     2]
 [ 7193     2]
 [ 7732     2]
 [ 7785     2]
 [ 7818     2]
 [ 8224     2]
 [ 8354     3]
 [ 8488     2]
 [ 9031     2]
 [ 9032     2]
 [ 9137     2]
 [ 9293     2]
 [ 9706     3]
 [ 9709     5]
 [ 9809     2]
 [10366     2]
 [10493     2]
 [10518     3]
 [11059     2]
 [11664     2]
 [11819     3]
 [12520     2]
 [12547     3]
 [13197     6]
 [13281     2]
 [13795     2]
 [15197     3]
 [15201     2]]


## For at least the first pass of the final optimization, each model will use the features it selected more than once across separate runs

In [45]:
# Scratch test: load one extracted feature table, using its first column as the row index.
import pandas as pd

aso_csv_path = "nocorr_extracted_aso/aso_vt0_nocorr950.csv"
temp = pd.read_csv(aso_csv_path, index_col=0)


In [50]:
# Print a summary of the loaded frame (index range, column count, dtypes, memory use).
temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96420 entries, 100_1_1_1 to aa_9
Columns: 11100 entries, 294 to 19362
dtypes: float64(11100)
memory usage: 8.0+ GB


In [51]:
# Feature ids (column 0) of the thresholded Lasso table, rendered as strings.
lasso_feature_ids = Lasso_thresh[:, 0]
lasso_feature_ids.astype(str).tolist()

['294',
 '296',
 '297',
 '298',
 '299',
 '3707',
 '6388',
 '7083',
 '7111',
 '7193',
 '7732',
 '7785',
 '7818',
 '8224',
 '8354',
 '8488',
 '9031',
 '9032',
 '9137',
 '9293',
 '9706',
 '9709',
 '9809',
 '10366',
 '10493',
 '10518',
 '11059',
 '11664',
 '11819',
 '12520',
 '12547',
 '13197',
 '13281',
 '13795',
 '15197',
 '15201']

In [53]:
# Inspect the column labels of the loaded frame to compare against the selected feature ids.
temp.columns

Index(['294', '296', '297', '298', '299', '300', '319', '320', '321', '322',
       ...
       '19333', '19334', '19335', '19336', '19337', '19358', '19359', '19360',
       '19361', '19362'],
      dtype='object', length=11100)

In [54]:
# Restrict the loaded table to the columns named by the thresholded Lasso feature ids.
lasso_columns = Lasso_thresh[:, 0].astype("str").tolist()
select_temp = temp.loc[:, lasso_columns]
select_temp

Unnamed: 0,294,296,297,298,299,3707,6388,7083,7111,7193,...,11059,11664,11819,12520,12547,13197,13281,13795,15197,15201
100_1_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.000000,0.000000,1.000000,0.966667,0.800000,0.626667,0.0,0.000000,0.0,0.0
100_1_1_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.192982,0.000000,1.000000,1.000000,0.913793,0.827586,0.0,0.000000,0.0,0.0
100_1_1_11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.162162,0.000000,1.000000,1.000000,0.891892,0.756757,0.0,0.000000,0.0,0.0
100_1_1_12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.017241,...,0.275862,0.120690,1.000000,1.000000,0.758621,0.689655,0.0,0.017241,0.0,0.0
100_1_1_13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.183673,0.081633,1.000000,1.000000,0.836735,0.795918,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
aa_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.375000,0.000000,1.000000,0.333333,0.166667,0.000000,0.0,0.666667,0.0,0.0
aa_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.000000,...,0.565217,0.000000,1.000000,0.750000,0.333333,0.000000,0.0,0.700000,0.0,0.0
aa_7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.892857,0.000000,...,0.724138,0.000000,0.692308,0.500000,0.272727,0.045455,0.0,0.550000,0.0,0.0
aa_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.920000,0.000000,...,0.872340,0.000000,0.619048,0.428571,0.250000,0.000000,0.0,0.666667,0.0,0.0
