# Select Features

Remove features that have low variance, and remove features that are highly correlated with one another.

In [2]:
import os
import pandas as pd

from pycytominer.feature_select import feature_select

In [3]:
# Read the training profiles from the project-relative data directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index written by an earlier `to_csv` call; consider `index_col=0` here once the
# downstream consumers of the saved output have been checked.
file = os.path.join("data", "train_ebfeatures.csv")
train_df = pd.read_csv(filepath_or_buffer=file)

# Report the (rows, columns) shape, then preview the first five rows.
print(train_df.shape)
train_df.head(n=5)

(49567, 123)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,...,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,90,13.942759,4.352958,...,0.0,0.0,0.0,0.0,0.0,128.889522,178.595764,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,136,20.179007,4.583255,...,0.0,0.0,0.0,0.0,0.0,79.885304,123.697473,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,99,15.209404,2.408709,...,0.0,0.0,0.0,0.0,0.0,64.370636,88.931673,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,92,14.398528,3.662234,...,0.0,0.0,0.0,0.0,0.0,87.536494,121.879169,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,209,25.394352,5.18838,...,0.0,0.0,0.0,0.0,0.0,104.691595,190.961698,218.972768,1.820529,adrenoceptor


In [4]:
# Measurement columns are recognised by their name prefix; every other column
# is not passed to the selection step as a feature.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = list(
    filter(lambda column: column.startswith(feature_prefixes), train_df.columns)
)

# Settings for the two selection passes. The cutoffs are forwarded unchanged to
# pycytominer; see its `feature_select` documentation for their exact definitions.
selection_settings = {
    "operation": ["variance_threshold", "correlation_threshold"],
    "freq_cut": 0.01,
    "unique_cut": 0.01,
    "corr_threshold": 0.95,
}

feature_select_df = feature_select(
    profiles=train_df,
    features=eb_features,
    **selection_settings,
)

# Preview the first five rows of the reduced profiles.
feature_select_df.head(n=5)

Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,13.942759,4.352958,6.502659,...,0.002518,0.004013,1183.251577,25.618028,0.672437,-0.824253,128.889522,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,20.179007,4.583255,12.64475,...,0.002763,0.004334,848.566929,21.634056,0.427024,-1.05185,79.885304,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,15.209404,2.408709,11.188623,...,0.004032,0.006477,1212.405633,19.026609,0.640358,0.670619,64.370636,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,14.398528,3.662234,7.847653,...,0.004266,0.008896,697.780525,18.350133,0.690171,0.475043,87.536494,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,25.394352,5.18838,16.504224,...,0.003246,0.006043,1122.335988,53.965501,0.911716,-1.570463,104.691595,218.972768,1.820529,adrenoceptor


In [5]:
# Save the feature-selected profiles as comma-separated text in the data
# directory; the DataFrame's row index is not written to the file.
file = os.path.join("data", "training_data_feature_select.csv")
feature_select_df.to_csv(path_or_buf=file, sep=",", index=False)