# Normalize Features

In [1]:
import os
import pandas as pd

from pycytominer.normalize import normalize

In [2]:
# Read data/training_data_feature_select.csv into a DataFrame.
train_path = os.path.join("data", "training_data_feature_select.csv")
train_df = pd.read_csv(train_path)

# Print the (rows, columns) tuple; the trailing expression renders the first rows.
print(train_df.shape)
train_df.head()

(49567, 33)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,13.942759,4.352958,6.502659,...,0.002518,0.004013,1183.251577,25.618028,0.672437,-0.824253,128.889522,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,20.179007,4.583255,12.64475,...,0.002763,0.004334,848.566929,21.634056,0.427024,-1.05185,79.885304,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,15.209404,2.408709,11.188623,...,0.004032,0.006477,1212.405633,19.026609,0.640358,0.670619,64.370636,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,14.398528,3.662234,7.847653,...,0.004266,0.008896,697.780525,18.350133,0.690171,0.475043,87.536494,160.752674,3.336081,adrenoceptor
4,ng7c7qtodJ,6,P1,1,C10,1,1926,25.394352,5.18838,16.504224,...,0.003246,0.006043,1122.335988,53.965501,0.911716,-1.570463,104.691595,218.972768,1.820529,adrenoceptor


In [3]:
# Read the test CSV, then select/order its columns to match the training frame.
# reindex() fills any training column absent from the test file with NaN
# rather than raising.
test_path = os.path.join("data", "test_ebfeatures.csv")
test_df = (
    pd.read_csv(test_path)
    .reindex(columns=train_df.columns)
)

# Print the (rows, columns) tuple; the trailing expression renders the first rows.
print(test_df.shape)
test_df.head()

(1700, 33)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement,target
0,pB2BlQoW94,5,P1,1,C10,1,585,14.030675,3.408844,7.606134,...,0.002717,0.005264,431.005584,18.814685,0.419099,-0.969878,105.866735,182.990012,2.343776,adrenoceptor
1,X9f5EAZwSK,95,P1,1,C10,1,463,12.386436,3.634217,6.19523,...,0.002799,0.00557,1396.276973,19.079194,0.738169,0.007138,126.951421,189.826554,3.595257,adrenoceptor
2,VDmGHDKms7,110,P1,1,C10,1,1317,21.404926,5.652149,13.50344,...,0.003331,0.006142,763.35691,39.368269,0.8962,0.743232,94.305708,252.453307,7.454674,adrenoceptor
3,fJZPphpLGW,132,P1,1,C10,1,878,17.400843,4.080833,9.28904,...,0.002795,0.005035,1049.50476,23.658402,0.587324,-1.049983,128.048654,213.030839,5.409438,adrenoceptor
4,XXzcyKRY89,1,P1,1,G12,4,3906,36.080358,7.310274,21.575674,...,0.003937,0.006638,662.452186,36.210088,0.474172,0.292205,120.14455,216.740003,4.139002,AMPA


## Apply z-score and robust normalization

In [4]:
# Columns to normalize: every training column whose name starts with one of
# these prefixes.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = [col for col in train_df.columns if col.startswith(feature_prefixes)]

# Identical CSV settings for every file written in this cell.
csv_kwargs = dict(sep=",", index=False, float_format="%.5f")

# (output-file label, pycytominer `method` argument) pairs, handled in order.
for method, method_use in [("robust", "robustize"), ("zscore", "standardize")]:
    # NOTE(review): train and test go through separate normalize() calls, so
    # each split is presumably scaled with statistics fit on itself rather
    # than on the training data -- confirm this is intended.
    train_normalize_df = normalize(
        profiles=train_df, features=eb_features, method=method_use
    )
    train_out = os.path.join("data", "training_feature_select_{}.csv".format(method))
    train_normalize_df.to_csv(train_out, **csv_kwargs)

    test_normalize_df = normalize(
        profiles=test_df, features=eb_features, method=method_use
    )
    test_out = os.path.join("data", "testing_feature_select_{}.csv".format(method))
    test_normalize_df.to_csv(test_out, **csv_kwargs)

# After the loop only the frame from the last pass ("zscore") is still bound,
# so the preview below reflects that method alone.
print(test_normalize_df.shape)
test_normalize_df.head()

  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),


(1700, 33)


  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,target,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,...,DNA.b.mad,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement
0,pB2BlQoW94,5,P1,1,C10,1,adrenoceptor,-0.648852,-0.678123,-0.349304,...,0.33595,-0.947293,-0.667644,-0.653414,-0.659654,-1.483563,-1.067318,-0.20574,-0.384454,-0.255776
1,X9f5EAZwSK,95,P1,1,C10,1,adrenoceptor,-0.761173,-0.891236,-0.260163,...,-0.119308,-0.842329,-0.509399,1.615763,-0.640597,0.383692,0.015858,0.094689,-0.332785,0.211048
2,VDmGHDKms7,110,P1,1,C10,1,adrenoceptor,0.025075,0.277667,0.537982,...,0.08881,-0.157616,-0.21269,0.127883,0.821158,1.308521,0.831932,-0.37047,0.140536,1.650678
3,fJZPphpLGW,132,P1,1,C10,1,adrenoceptor,-0.379097,-0.241309,-0.083514,...,-0.099797,-0.846253,-0.786328,0.800564,-0.310682,-0.499076,-1.156126,0.110323,-0.157412,0.887769
4,XXzcyKRY89,1,P1,1,G12,4,AMPA,2.408674,2.179777,1.193814,...,-0.633099,0.621271,0.044457,-0.109325,0.593623,-1.161266,0.331898,-0.0023,-0.129379,0.413875


## Normalize Validation Data

In [5]:
# Read data/validation_data.csv into a DataFrame.
val_path = os.path.join("data", "validation_data.csv")
val_df = pd.read_csv(val_path)

# Print the (rows, columns) tuple; the trailing expression renders the first rows.
print(val_df.shape)
val_df.head()

(12957, 118)


Unnamed: 0,well_code,cell_code,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,actin.s.radius.max,actin.b.mean,actin.b.sd,...,DNA.h.sen.s3,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement
0,Ros6NxC0kA,Hmv43N53kD,1030,114,17.53612,3.50282,9.451391,24.187767,0.008434,0.002092,...,0.0,0.0,0.0,0.0,0.0,0.0,89.823141,172.683313,194.416288,1.86432
1,Ros6NxC0kA,B8kmJi71Zt,1002,114,17.72134,2.73876,12.667362,22.997078,0.01017,0.002222,...,0.0,0.0,0.0,0.0,0.0,0.0,73.187298,139.353843,177.67053,1.299625
2,Ros6NxC0kA,lcvffs6oty,1037,117,17.933854,3.009534,10.949922,23.196742,0.0053,0.001122,...,0.0,0.0,0.0,0.0,0.0,0.0,57.696602,80.344097,100.682363,1.713505
3,Ros6NxC0kA,jVHzZ1tdUT,1344,160,20.707114,3.634602,13.331076,27.539155,0.007294,0.001332,...,0.0,0.0,0.0,0.0,0.0,0.0,77.281014,104.123299,131.762808,2.252549
4,Ros6NxC0kA,Bcx0AaJQ49,867,98,16.52518,3.142137,10.92201,22.215193,0.007146,0.001165,...,0.0,0.0,0.0,0.0,0.0,0.0,67.015967,120.695692,190.567175,0.739423


In [6]:
# Columns written to disk: the two identifier columns followed by the
# feature columns selected from the training frame (eb_features).
val_keep_columns = ["well_code", "cell_code"] + eb_features

# (output-file label, pycytominer `method` argument) pairs, handled in order.
for method, method_use in [("robust", "robustize"), ("zscore", "standardize")]:
    val_normalize_df = normalize(
        profiles=val_df, features=eb_features, method=method_use
    )

    # Drop every column that is neither an identifier nor a selected feature.
    val_normalize_df = val_normalize_df.loc[:, val_keep_columns]

    val_out = os.path.join("data", "validation_feature_select_{}.csv".format(method))
    val_normalize_df.to_csv(val_out, sep=",", index=False, float_format="%.5f")

  return self.partial_fit(X, y)
  fitted_scaler.transform(feature_df),
