# Normalize Features

In [2]:
import os
import pandas as pd

from pycytominer.normalize import normalize

In [3]:
# Read the training CSV from the local "data" directory into a DataFrame.
file = os.path.join("data", "training_data_feature_select.csv")
train_df = pd.read_csv(filepath_or_buffer=file)

# Report the (rows, columns) shape, then preview the first five rows.
print(train_df.shape)
train_df.head(n=5)

(51267, 33)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q05,DNA.m.cx,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.20.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,13.942759,4.352958,6.502659,...,0.004013,1327.866259,1183.251577,25.618028,0.672437,-0.824253,128.889522,178.595764,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,20.179007,4.583255,12.64475,...,0.004334,1280.486929,848.566929,21.634056,0.427024,-1.05185,79.885304,123.697473,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,15.209404,2.408709,11.188623,...,0.006477,647.536053,1212.405633,19.026609,0.640358,0.670619,64.370636,88.931673,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,14.398528,3.662234,7.847653,...,0.008896,473.519826,697.780525,18.350133,0.690171,0.475043,87.536494,121.879169,3.336081,adrenoceptor
4,pB2BlQoW94,5,P1,1,C10,1,585,14.030675,3.408844,7.606134,...,0.005264,1031.364956,431.005584,18.814685,0.419099,-0.969878,105.866735,142.023858,2.343776,adrenoceptor


In [4]:
# A column counts as a feature when its name begins with one of these
# prefixes; all remaining columns are left out of the feature list.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = [
    column for column in train_df.columns if column.startswith(feature_prefixes)
]

# Normalize only the selected feature columns using pycytominer's
# "robustize" method.
# NOTE(review): per the pycytominer docs "robustize" is a median/MAD based
# scaling — confirm against the installed version.
normalize_df = normalize(profiles=train_df, features=eb_features, method="robustize")

# Preview the first five normalized rows.
normalize_df.head(n=5)

Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,target,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,...,DNA.b.q005,DNA.b.q05,DNA.m.cx,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.20.nn,nuclear.displacement
0,OaJHcDs2kh,1,P1,1,C10,1,adrenoceptor,-0.59342,-0.675666,0.086844,...,-1.006489,-1.158621,0.878272,0.665581,0.186041,0.001559,-0.502246,0.967154,0.678866,1.355007
1,nwLFF4l070,2,P1,1,C10,1,adrenoceptor,0.056404,0.153582,0.177495,...,-0.774333,-1.013793,0.812902,0.200107,-0.131399,-0.987558,-0.645558,-0.053793,0.001083,-0.804906
2,JU4SIplWZ7,3,P1,1,C10,1,adrenoceptor,-0.408931,-0.507237,-0.678469,...,0.424658,-0.044828,-0.060396,0.706128,-0.339158,-0.127732,0.439039,-0.377023,-0.428142,0.559778
3,pqkTwaHa2L,4,P1,1,C10,1,adrenoceptor,-0.555817,-0.615061,-0.185045,...,0.645999,1.048276,-0.30049,-0.009605,-0.393059,0.073032,0.315889,0.105611,-0.021366,0.32553
4,pB2BlQoW94,5,P1,1,C10,1,adrenoceptor,-0.586369,-0.663976,-0.284787,...,-0.818313,-0.593103,0.469182,-0.380631,-0.356044,-1.019499,-0.593943,0.487501,0.227343,-0.066952


In [5]:
# Persist the normalized training profiles as a comma-separated file.
# The DataFrame index is not written, and floats are formatted with five
# digits after the decimal point.
file = os.path.join("data", "training_feature_select_robust.csv")
normalize_df.to_csv(
    path_or_buf=file,
    sep=',',
    index=False,
    float_format="%.5f",
)

## Normalize Validation Data

In [6]:
# Read the validation CSV from the local "data" directory into a DataFrame.
file = os.path.join("data", "validation_data.csv")
val_df = pd.read_csv(filepath_or_buffer=file)

# Report the (rows, columns) shape, then preview the first five rows.
print(val_df.shape)
val_df.head(n=5)

(12957, 118)


Unnamed: 0,well_code,cell_code,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,actin.s.radius.max,actin.b.mean,actin.b.sd,...,DNA.h.sen.s3,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement
0,Ros6NxC0kA,Hmv43N53kD,1030,114,17.53612,3.50282,9.451391,24.187767,0.008434,0.002092,...,0.0,0.0,0.0,0.0,0.0,0.0,89.823141,172.683313,194.416288,1.86432
1,Ros6NxC0kA,B8kmJi71Zt,1002,114,17.72134,2.73876,12.667362,22.997078,0.01017,0.002222,...,0.0,0.0,0.0,0.0,0.0,0.0,73.187298,139.353843,177.67053,1.299625
2,Ros6NxC0kA,lcvffs6oty,1037,117,17.933854,3.009534,10.949922,23.196742,0.0053,0.001122,...,0.0,0.0,0.0,0.0,0.0,0.0,57.696602,80.344097,100.682363,1.713505
3,Ros6NxC0kA,jVHzZ1tdUT,1344,160,20.707114,3.634602,13.331076,27.539155,0.007294,0.001332,...,0.0,0.0,0.0,0.0,0.0,0.0,77.281014,104.123299,131.762808,2.252549
4,Ros6NxC0kA,Bcx0AaJQ49,867,98,16.52518,3.142137,10.92201,22.215193,0.007146,0.001165,...,0.0,0.0,0.0,0.0,0.0,0.0,67.015967,120.695692,190.567175,0.739423


In [13]:
# Normalize the validation profiles over the same feature list that was
# derived from the training columns (eb_features), using "robustize".
# NOTE(review): this call only receives val_df, so any scaling statistics
# come from the validation data itself rather than from the training data —
# confirm this is the intended behaviour.
val_normalize_df = normalize(profiles=val_df, features=eb_features, method="robustize")

# Keep the two identifier columns followed by the selected features, in that
# order; every other column is dropped.
val_normalize_df = val_normalize_df.loc[:, ["well_code", "cell_code", *eb_features]]

# Preview the first five normalized rows.
val_normalize_df.head(n=5)

Unnamed: 0,well_code,cell_code,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,actin.b.sd,actin.b.mad,actin.b.q005,actin.b.q01,...,DNA.b.q005,DNA.b.q05,DNA.m.cx,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.20.nn,nuclear.displacement
0,Ros6NxC0kA,Hmv43N53kD,-0.067442,-0.1975,-0.244878,-0.250143,0.002559,0.278571,0.012008,0.02657,...,-0.533186,-0.125,-0.297686,0.607327,1.318708,0.873413,-0.047098,0.155205,0.606092,-0.261654
1,Ros6NxC0kA,B8kmJi71Zt,-0.1,-0.173233,-0.545217,0.393992,0.070274,0.507143,1.463016,1.042673,...,0.002212,-0.493056,-0.146959,-0.532345,-0.042008,-0.450439,-0.55795,-0.188598,0.191814,-0.486814
2,Ros6NxC0kA,lcvffs6oty,-0.059302,-0.14539,-0.43878,0.050001,-0.503682,-0.492857,-0.537464,-0.721417,...,0.016962,-0.923611,-0.711468,0.848423,-0.06315,-0.112519,0.365245,-0.508736,-0.541664,-0.321788
3,Ros6NxC0kA,jVHzZ1tdUT,0.297674,0.217956,-0.193077,0.526929,-0.394243,-0.392857,0.205572,0.10789,...,-0.030236,0.097222,-0.133743,0.729492,-0.118838,-0.818868,-0.63902,-0.103996,-0.246094,-0.106857
4,Ros6NxC0kA,Bcx0AaJQ49,-0.256977,-0.329951,-0.386657,0.044411,-0.481437,-0.521429,0.005283,-0.136876,...,-0.602507,-0.604167,0.501564,-0.450717,-0.187347,-0.141445,0.640189,-0.316138,-0.040102,-0.710181


In [14]:
# Persist the normalized validation profiles as a comma-separated file.
# The DataFrame index is not written, and floats are formatted with five
# digits after the decimal point.
file = os.path.join("data", "validation_feature_select_robust.csv")
val_normalize_df.to_csv(
    path_or_buf=file,
    sep=',',
    index=False,
    float_format="%.5f",
)