# Normalize Features

In [1]:
import os
import pandas as pd

from pycytominer.normalize import normalize

In [2]:
# Read the feature-selected training profiles from the local data directory.
train_path = os.path.join("data", "training_data_feature_select.csv")
train_df = pd.read_csv(train_path)

# Report the frame's dimensions, then preview the first rows.
print(train_df.shape)
train_df.head()

(49567, 33)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,13.942759,4.352958,6.502659,...,0.002518,0.004013,1183.251577,25.618028,0.672437,-0.824253,128.889522,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,20.179007,4.583255,12.64475,...,0.002763,0.004334,848.566929,21.634056,0.427024,-1.05185,79.885304,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,15.209404,2.408709,11.188623,...,0.004032,0.006477,1212.405633,19.026609,0.640358,0.670619,64.370636,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,14.398528,3.662234,7.847653,...,0.004266,0.008896,697.780525,18.350133,0.690171,0.475043,87.536494,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,25.394352,5.18838,16.504224,...,0.003246,0.006043,1122.335988,53.965501,0.911716,-1.570463,104.691595,218.972768,1.820529,adrenoceptor


In [3]:
# Read the test profiles from the local data directory.
file = os.path.join("data", "test_ebfeatures.csv")
test_raw_df = pd.read_csv(file)

# `reindex` inserts an all-NaN column for every requested label that is absent
# from the source frame instead of raising, so confirm that each training
# column really exists in the test file before aligning.
missing_columns = train_df.columns.difference(test_raw_df.columns)
if len(missing_columns) > 0:
    raise KeyError(
        "test_ebfeatures.csv is missing columns present in the training data: {}".format(
            missing_columns.tolist()
        )
    )

# Keep exactly the training columns, in the training column order.
test_df = test_raw_df.reindex(train_df.columns, axis="columns")

print(test_df.shape)
test_df.head()

(1700, 33)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,pB2BlQoW94,5,P1,1,C10,1,585,14.030675,3.408844,7.606134,...,0.002717,0.005264,431.005584,18.814685,0.419099,-0.969878,105.866735,182.990012,2.343776,adrenoceptor
1,X9f5EAZwSK,95,P1,1,C10,1,463,12.386436,3.634217,6.19523,...,0.002799,0.00557,1396.276973,19.079194,0.738169,0.007138,126.951421,189.826554,3.595257,adrenoceptor
2,VDmGHDKms7,110,P1,1,C10,1,1317,21.404926,5.652149,13.50344,...,0.003331,0.006142,763.35691,39.368269,0.8962,0.743232,94.305708,252.453307,7.454674,adrenoceptor
3,fJZPphpLGW,132,P1,1,C10,1,878,17.400843,4.080833,9.28904,...,0.002795,0.005035,1049.50476,23.658402,0.587324,-1.049983,128.048654,213.030839,5.409438,adrenoceptor
4,XXzcyKRY89,1,P1,1,G12,4,3906,36.080358,7.310274,21.575674,...,0.003937,0.006638,662.452186,36.210088,0.474172,0.292205,120.14455,216.740003,4.139002,AMPA


## Apply z-score and robust normalization

In [4]:
# Feature columns are recognised purely by their name prefix.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = [
    column for column in train_df.columns if column.startswith(feature_prefixes)
]

# Output-file label -> pycytominer `method` argument.
normalization_methods = {"robust": "robustize", "zscore": "standardize"}

# NOTE(review): the training and test frames are passed to `normalize` in
# separate calls; confirm that normalizing the test split on its own (rather
# than with statistics derived from the training split) is intended.
for label, pycytominer_method in normalization_methods.items():

    # Training split: normalize, then write next to the input data.
    train_normalize_df = normalize(
        profiles=train_df, features=eb_features, method=pycytominer_method
    )
    train_out = os.path.join("data", "training_feature_select_{}.csv".format(label))
    train_normalize_df.to_csv(train_out, sep=",", index=False, float_format="%.5f")

    # Test split: same feature list and method, separate output file.
    test_normalize_df = normalize(
        profiles=test_df, features=eb_features, method=pycytominer_method
    )
    test_out = os.path.join("data", "testing_feature_select_{}.csv".format(label))
    test_normalize_df.to_csv(test_out, sep=",", index=False, float_format="%.5f")

# Only the frame from the final loop iteration ("zscore") is still bound here.
print(test_normalize_df.shape)
test_normalize_df.head()

  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),


(1700, 33)


  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,target,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,...,DNA.b.mad,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement
0,pB2BlQoW94,5,P1,1,C10,1,adrenoceptor,-0.648852,-0.678123,-0.349304,...,0.33595,-0.947293,-0.667644,-0.653414,-0.659654,-1.483563,-1.067318,-0.20574,-0.384454,-0.255776
1,X9f5EAZwSK,95,P1,1,C10,1,adrenoceptor,-0.761173,-0.891236,-0.260163,...,-0.119308,-0.842329,-0.509399,1.615763,-0.640597,0.383692,0.015858,0.094689,-0.332785,0.211048
2,VDmGHDKms7,110,P1,1,C10,1,adrenoceptor,0.025075,0.277667,0.537982,...,0.08881,-0.157616,-0.21269,0.127883,0.821158,1.308521,0.831932,-0.37047,0.140536,1.650678
3,fJZPphpLGW,132,P1,1,C10,1,adrenoceptor,-0.379097,-0.241309,-0.083514,...,-0.099797,-0.846253,-0.786328,0.800564,-0.310682,-0.499076,-1.156126,0.110323,-0.157412,0.887769
4,XXzcyKRY89,1,P1,1,G12,4,AMPA,2.408674,2.179777,1.193814,...,-0.633099,0.621271,0.044457,-0.109325,0.593623,-1.161266,0.331898,-0.0023,-0.129379,0.413875


## Normalize Validation Data

In [5]:
# Read the validation profiles from the local data directory.
val_path = os.path.join("data", "validation_data.csv")
val_df = pd.read_csv(val_path)

# Report the frame's dimensions, then preview the first rows.
print(val_df.shape)
val_df.head()

(12957, 118)


Unnamed: 0,well_code,cell_code,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,actin.s.radius.max,actin.b.mean,actin.b.sd,...,DNA.h.sen.s3,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement
0,Ros6NxC0kA,Hmv43N53kD,1030,114,17.53612,3.50282,9.451391,24.187767,0.008434,0.002092,...,0.0,0.0,0.0,0.0,0.0,0.0,89.823141,172.683313,194.416288,1.86432
1,Ros6NxC0kA,B8kmJi71Zt,1002,114,17.72134,2.73876,12.667362,22.997078,0.01017,0.002222,...,0.0,0.0,0.0,0.0,0.0,0.0,73.187298,139.353843,177.67053,1.299625
2,Ros6NxC0kA,lcvffs6oty,1037,117,17.933854,3.009534,10.949922,23.196742,0.0053,0.001122,...,0.0,0.0,0.0,0.0,0.0,0.0,57.696602,80.344097,100.682363,1.713505
3,Ros6NxC0kA,jVHzZ1tdUT,1344,160,20.707114,3.634602,13.331076,27.539155,0.007294,0.001332,...,0.0,0.0,0.0,0.0,0.0,0.0,77.281014,104.123299,131.762808,2.252549
4,Ros6NxC0kA,Bcx0AaJQ49,867,98,16.52518,3.142137,10.92201,22.215193,0.007146,0.001165,...,0.0,0.0,0.0,0.0,0.0,0.0,67.015967,120.695692,190.567175,0.739423


In [6]:
# Output-file label -> pycytominer `method` argument.
validation_methods = {"robust": "robustize", "zscore": "standardize"}

# Columns retained in the written file: the two identifier columns followed by
# the features selected on the training data (`eb_features`).
validation_columns = ["well_code", "cell_code"] + eb_features

for label, pycytominer_method in validation_methods.items():

    # The full validation frame is passed in; only `eb_features` are listed as
    # features to normalize.
    normalized = normalize(
        profiles=val_df, features=eb_features, method=pycytominer_method
    )

    # Drop every column that is neither an identifier nor a selected feature.
    val_normalize_df = normalized.loc[:, validation_columns]

    out_path = os.path.join("data", "validation_feature_select_{}.csv".format(label))
    val_normalize_df.to_csv(out_path, sep=",", index=False, float_format="%.5f")

  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),


## Normalize CNN Activations

In [7]:
# Read the feature-selected CNN activations for the training cells; the first
# CSV column becomes the row index.
cnn_train_df = pd.read_csv(
    os.path.join("data", "cnn_training_data_feature_select.csv"),
    index_col=0,
)

# Report the frame's dimensions, then preview the first rows.
print(cnn_train_df.shape)
cnn_train_df.head()

(49567, 19)


Unnamed: 0_level_0,FC2_14,FC2_46,FC2_78,FC2_86,FC2_91,FC2_125,FC2_142,FC2_157,FC2_257,FC2_260,FC2_262,FC2_283,FC2_308,FC2_335,FC2_352,FC2_398,FC2_401,FC2_473,FC2_477
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
OaJHcDs2kh,1.319307,0.318558,1.205875,1.350009,1.910532,2.04263,0.859613,1.043137,1.901225,1.325096,1.78898,1.468829,0.526473,2.261007,1.625711,1.320874,0.616183,1.504879,1.228745
nwLFF4l070,0.649212,1.555879,1.780678,1.370201,1.63202,2.35307,2.292508,1.27858,1.903095,1.449631,1.283137,0.97052,1.277169,1.615691,3.049478,1.520226,1.199523,0.749439,0.906905
JU4SIplWZ7,0.494773,0.690964,1.672676,0.830226,1.25025,1.732242,1.101756,0.650364,1.634636,1.09155,1.584421,0.637806,0.531286,1.45858,1.708291,1.253612,1.081043,1.030694,0.961403
pqkTwaHa2L,0.440168,0.930884,1.547437,1.362906,1.931606,2.018149,1.497556,1.28641,1.929644,1.049759,1.862568,1.026277,0.936188,1.746144,2.177337,1.406913,1.042338,0.668548,1.225439
ng7c7qtodJ,0.953127,1.5289,1.407254,2.344501,1.621911,2.127393,1.871136,1.270951,2.267502,1.412761,0.875918,0.832258,1.25869,1.426081,2.135995,1.095517,1.171175,1.236609,1.449591


In [8]:
# Every column of the CNN training frame is treated as a feature.
cnn_features = cnn_train_df.columns.tolist()

# Normalize the training activations with pycytominer's "robustize" method.
cnn_train_normalize_df = normalize(
    profiles=cnn_train_df, features=cnn_features, method="robustize"
)

cnn_train_normalize_df.head()

Unnamed: 0_level_0,FC2_14,FC2_46,FC2_78,FC2_86,FC2_91,FC2_125,FC2_142,FC2_157,FC2_257,FC2_260,FC2_262,FC2_283,FC2_308,FC2_335,FC2_352,FC2_398,FC2_401,FC2_473,FC2_477
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
OaJHcDs2kh,1.055989,-1.268815,-1.004927,-0.200011,0.665522,-0.297586,-1.157776,-0.041017,-0.161409,-0.066286,0.430355,1.23164,-1.15215,0.885447,-0.609201,-0.039515,-1.166997,0.716192,0.087069
nwLFF4l070,-0.360839,0.774033,0.180621,-0.164266,0.166755,0.23991,0.517132,0.418673,-0.157894,0.170046,-0.641488,0.184315,0.467556,-0.077574,0.805975,0.444259,0.242458,-0.663878,-0.682934
JU4SIplWZ7,-0.687382,-0.653963,-0.042137,-1.120173,-0.516931,-0.834993,-0.874736,-0.807885,-0.662529,-0.509488,-0.00309,-0.51497,-1.141767,-0.312035,-0.527119,-0.202741,-0.043812,-0.150069,-0.552548
pqkTwaHa2L,-0.802836,-0.257849,-0.300446,-0.17718,0.703262,-0.339972,-0.412086,0.43396,-0.107989,-0.588796,0.586282,0.301503,-0.268146,0.117104,-0.060903,0.169279,-0.137329,-0.811653,0.079158
ng7c7qtodJ,0.28175,0.729489,-0.589577,1.560516,0.14865,-0.150827,0.024591,0.403778,0.5271,0.100077,-1.504356,-0.106278,0.427685,-0.360535,-0.101995,-0.586394,0.173962,0.226106,0.615442


In [9]:
# Persist the normalized training activations; the row index is written too.
cnn_train_normalize_df.to_csv(
    os.path.join("data", "cnn_train_feature_select_robustize.csv"),
    sep=",",
    index=True,
    float_format="%.5f",
)

In [10]:
# Read the CNN activations for the validation cells; the first CSV column
# becomes the row index.
file = os.path.join("data", "validation_cnn_activations.csv")
cnn_val_raw_df = pd.read_csv(file, index_col=0)

# `reindex` inserts an all-NaN column for every requested label that is absent
# from the source frame instead of raising, so confirm that each selected
# training column really exists in the validation file before aligning.
missing_columns = cnn_train_normalize_df.columns.difference(cnn_val_raw_df.columns)
if len(missing_columns) > 0:
    raise KeyError(
        "validation_cnn_activations.csv is missing columns present in the "
        "normalized CNN training data: {}".format(missing_columns.tolist())
    )

# Keep exactly the training columns, in the training column order.
cnn_val_df = cnn_val_raw_df.reindex(cnn_train_normalize_df.columns, axis="columns")

print(cnn_val_df.shape)
cnn_val_df.head()

(12957, 19)


Unnamed: 0_level_0,FC2_14,FC2_46,FC2_78,FC2_86,FC2_91,FC2_125,FC2_142,FC2_157,FC2_257,FC2_260,FC2_262,FC2_283,FC2_308,FC2_335,FC2_352,FC2_398,FC2_401,FC2_473,FC2_477
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
KuQJTlfiFW,1.138097,1.324271,1.931999,1.998099,0.98821,2.717685,2.807382,0.972931,2.003976,2.098864,1.552968,0.889109,1.698688,1.090821,2.663394,1.108442,1.388412,1.385989,2.158504
OHLCBK5YUr,0.258425,1.18846,1.493393,1.617923,2.179364,1.848405,1.777582,1.524898,1.80796,0.811084,1.436177,1.307456,0.778623,1.485963,2.517108,1.229181,1.039404,0.510394,1.212029
hdDTiJ9PFF,1.548556,2.811147,1.831032,2.105069,1.273564,2.629782,2.709061,1.785514,2.212574,2.060718,1.658796,1.095924,1.328151,1.470233,3.008486,1.96424,1.408314,1.146898,1.14271
UZtW0gA3q5,0.667898,1.397672,2.066211,1.25776,1.449914,2.53025,2.141422,1.216181,2.295937,1.53335,1.755542,0.991107,1.376119,1.923636,3.072068,1.732395,1.439003,0.794548,1.007725
IAKfYa4KWb,0.705101,0.941053,1.791833,1.297677,1.50892,2.133636,1.828982,0.892535,1.885177,1.269348,1.494987,0.817996,0.949681,1.685264,2.357535,1.05066,0.931707,1.060216,1.004667


In [11]:
# Normalize the validation activations with the same feature list and method
# used for the CNN training activations.
# NOTE(review): the validation frame is passed to `normalize` on its own;
# confirm that normalizing it separately from the training data is intended.
cnn_validation_normalize_df = normalize(
    profiles=cnn_val_df, features=cnn_features, method="robustize"
)

cnn_validation_normalize_df.head()

Unnamed: 0_level_0,FC2_14,FC2_46,FC2_78,FC2_86,FC2_91,FC2_125,FC2_142,FC2_157,FC2_257,FC2_260,FC2_262,FC2_283,FC2_308,FC2_335,FC2_352,FC2_398,FC2_401,FC2_473,FC2_477
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
KuQJTlfiFW,0.667271,0.418262,0.499909,0.973003,-1.005111,0.886686,1.139002,-0.15882,0.041129,1.431426,-0.074747,0.020602,1.402283,-0.857529,0.437778,-0.543433,0.717094,0.5027,2.276706
OHLCBK5YUr,-1.179653,0.18786,-0.421594,0.291907,1.160597,-0.629379,-0.074356,0.914416,-0.327225,-1.069426,-0.317152,0.894501,-0.606724,-0.263725,0.291505,-0.252254,-0.127077,-1.100606,0.052612
hdDTiJ9PFF,1.529053,2.940729,0.287779,1.164644,-0.486292,0.733378,1.023155,1.421155,0.433125,1.357346,0.144904,0.452624,0.593199,-0.287364,0.782839,1.520461,0.765234,0.0649,-0.110278
UZtW0gA3q5,-0.31994,0.542786,0.781887,-0.353334,-0.165659,0.55979,0.354337,0.314151,0.589782,0.333204,0.345705,0.233671,0.697938,0.393992,0.846414,0.96133,0.839463,-0.580291,-0.427475
IAKfYa4KWb,-0.241831,-0.231863,0.205423,-0.281822,-0.058376,-0.131924,-0.013794,-0.315142,-0.182119,-0.179484,-0.195088,-0.127947,-0.23321,0.035776,0.131946,-0.682784,-0.387572,-0.093824,-0.434661


In [12]:
# Persist the normalized validation activations; the row index is written too.
cnn_validation_normalize_df.to_csv(
    os.path.join("data", "cnn_validation_feature_select_robustize.csv"),
    sep=",",
    index=True,
    float_format="%.5f",
)