# Select Features

Remove features that have low variance or that are highly correlated with other features.

In [1]:
import os
import pandas as pd

from pycytominer.feature_select import feature_select

In [2]:
# Read the training feature table from the local data directory.
# NOTE(review): the CSV's leading unnamed column is loaded as an ordinary column
# (it appears as "Unnamed: 0" in the preview) and is carried into later outputs.
file = os.path.join("data", "train_ebfeatures.csv")
train_df = pd.read_csv(filepath_or_buffer=file)

# Show (rows, columns), then preview the first records.
print(train_df.shape)
train_df.head()

(49567, 123)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,...,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,90,13.942759,4.352958,...,0.0,0.0,0.0,0.0,0.0,128.889522,178.595764,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,136,20.179007,4.583255,...,0.0,0.0,0.0,0.0,0.0,79.885304,123.697473,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,99,15.209404,2.408709,...,0.0,0.0,0.0,0.0,0.0,64.370636,88.931673,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,92,14.398528,3.662234,...,0.0,0.0,0.0,0.0,0.0,87.536494,121.879169,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,209,25.394352,5.18838,...,0.0,0.0,0.0,0.0,0.0,104.691595,190.961698,218.972768,1.820529,adrenoceptor


In [3]:
# Feature columns are recognised by the prefix of their column name; columns
# that match none of these prefixes are not offered for selection.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = [
    column for column in train_df.columns if column.startswith(feature_prefixes)
]

# Settings for the two selection operations, applied in the listed order.
# See the pycytominer feature_select documentation for the exact meaning of
# each cutoff.
selection_settings = {
    "operation": ["variance_threshold", "correlation_threshold"],
    "freq_cut": 0.01,
    "unique_cut": 0.01,
    "corr_threshold": 0.95,
}

feature_select_df = feature_select(
    profiles=train_df,
    features=eb_features,
    **selection_settings,
)

feature_select_df.head()

Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,13.942759,4.352958,6.502659,...,0.002518,0.004013,1183.251577,25.618028,0.672437,-0.824253,128.889522,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,20.179007,4.583255,12.64475,...,0.002763,0.004334,848.566929,21.634056,0.427024,-1.05185,79.885304,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,15.209404,2.408709,11.188623,...,0.004032,0.006477,1212.405633,19.026609,0.640358,0.670619,64.370636,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,14.398528,3.662234,7.847653,...,0.004266,0.008896,697.780525,18.350133,0.690171,0.475043,87.536494,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,25.394352,5.18838,16.504224,...,0.003246,0.006043,1122.335988,53.965501,0.911716,-1.570463,104.691595,218.972768,1.820529,adrenoceptor


In [4]:
# Persist the feature-selected table; the positional row index is not written.
file = os.path.join("data", "training_data_feature_select.csv")
feature_select_df.to_csv(
    path_or_buf=file,
    sep=",",
    index=False,
)

## Select Features in CNN Activations

In [5]:
# Load the CNN activations, indexed by the file's first column, and line the
# rows up with the order of `train_df.cell_code`.
file = os.path.join("data", "train_cnn_activations.csv")
cnn_activations_df = pd.read_csv(file, index_col=0)

# `reindex` fills labels that are missing from the source index with NaN and
# still returns one row per requested label, so a mismatch between the two
# files would not show up in the printed shape. Fail loudly instead.
absent = ~train_df.cell_code.isin(cnn_activations_df.index)
if absent.any():
    raise ValueError(
        f"{int(absent.sum())} cell_code value(s) in train_df have no row in {file}; "
        f"first few: {train_df.cell_code[absent].head().tolist()}"
    )

train_cnn_df = cnn_activations_df.reindex(train_df.cell_code, axis="rows")
# Drop the extra reference so only the aligned frame stays in the kernel.
del cnn_activations_df

print(train_cnn_df.shape)
train_cnn_df.head()

(49567, 1000)


Unnamed: 0_level_0,FC1_1,FC1_2,FC1_3,FC1_4,FC1_5,FC1_6,FC1_7,FC1_8,FC1_9,FC1_10,...,FC2_491,FC2_492,FC2_493,FC2_494,FC2_495,FC2_496,FC2_497,FC2_498,FC2_499,FC2_500
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OaJHcDs2kh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.271193,0.0,0.752577,0.0,1.374832,0.0,0.0,0.0,0.0
nwLFF4l070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.684311,0.213631,0.0,1.251752,0.0,0.0,1.190106,0.0
JU4SIplWZ7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.378023,0.265288,0.54902,0.0,1.245002,0.0,0.0,0.541393,0.0
pqkTwaHa2L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.874324,0.54329,0.363319,0.0,1.410975,0.0,0.0,0.872607,0.0
ng7c7qtodJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.492612,0.0,0.737473,0.0,0.0,0.0,0.0


In [6]:
# Every column of the activation table is treated as a candidate feature.
cnn_features = list(train_cnn_df.columns)

# Same operations and cutoffs as used for the first feature table in this
# notebook.
cnn_selection_settings = {
    "operation": ["variance_threshold", "correlation_threshold"],
    "freq_cut": 0.01,
    "unique_cut": 0.01,
    "corr_threshold": 0.95,
}

# NOTE: this rebinds `feature_select_df`, replacing the earlier result held
# under the same name in the kernel.
feature_select_df = feature_select(
    profiles=train_cnn_df,
    features=cnn_features,
    **cnn_selection_settings,
)

feature_select_df.head()

Unnamed: 0_level_0,FC2_14,FC2_46,FC2_78,FC2_86,FC2_91,FC2_125,FC2_142,FC2_157,FC2_257,FC2_260,FC2_262,FC2_283,FC2_308,FC2_335,FC2_352,FC2_398,FC2_401,FC2_473,FC2_477
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
OaJHcDs2kh,1.319307,0.318558,1.205875,1.350009,1.910532,2.04263,0.859613,1.043137,1.901225,1.325096,1.78898,1.468829,0.526473,2.261007,1.625711,1.320874,0.616183,1.504879,1.228745
nwLFF4l070,0.649212,1.555879,1.780678,1.370201,1.63202,2.35307,2.292508,1.27858,1.903095,1.449631,1.283137,0.97052,1.277169,1.615691,3.049478,1.520226,1.199523,0.749439,0.906905
JU4SIplWZ7,0.494773,0.690964,1.672676,0.830226,1.25025,1.732242,1.101756,0.650364,1.634636,1.09155,1.584421,0.637806,0.531286,1.45858,1.708291,1.253612,1.081043,1.030694,0.961403
pqkTwaHa2L,0.440168,0.930884,1.547437,1.362906,1.931606,2.018149,1.497556,1.28641,1.929644,1.049759,1.862568,1.026277,0.936188,1.746144,2.177337,1.406913,1.042338,0.668548,1.225439
ng7c7qtodJ,0.953127,1.5289,1.407254,2.344501,1.621911,2.127393,1.871136,1.270951,2.267502,1.412761,0.875918,0.832258,1.25869,1.426081,2.135995,1.095517,1.171175,1.236609,1.449591


In [8]:
# Persist the selected CNN activations; the index is written so each row keeps
# its cell_code label.
file = os.path.join("data", "cnn_training_data_feature_select.csv")
feature_select_df.to_csv(
    path_or_buf=file,
    sep=",",
    index=True,
)