In [1]:
import os
import pandas as pd

from sklearn import decomposition

In [2]:
# Read the training table; the first CSV column becomes the row index.
file = os.path.join("data", "training_feature_select_robust.csv")
df = pd.read_csv(file, index_col=0)

# Feature columns are recognised purely by their name prefix; every other
# column is left out of `eb_features` and therefore out of the PCA input.
feature_prefixes = ("actin", "DNA", "dist", "nuclear")
eb_features = [
    column
    for column in df.columns
    if column.startswith(feature_prefixes)
]

# Show (rows, columns) as text, then the first rows as the cell's rich output.
print(df.shape)
df.head()

(49567, 32)


Unnamed: 0_level_0,cell_id,plate,replicate,well,field,target,actin.s.area,actin.s.radius.mean,actin.s.radius.sd,actin.s.radius.min,...,DNA.b.mad,DNA.b.q005,DNA.b.q05,DNA.m.cy,DNA.m.majoraxis,DNA.m.eccentricity,DNA.m.theta,dist.10.nn,dist.30.nn,nuclear.displacement
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OaJHcDs2kh,1,P1,1,C10,1,adrenoceptor,-0.5967,-0.68155,0.0853,-0.86212,...,-0.35385,-1.00865,-1.17014,0.66734,0.18642,0.00252,-0.50109,0.99585,0.4897,1.35557
nwLFF4l070,2,P1,1,C10,1,adrenoceptor,0.05542,0.15232,0.17621,0.38119,...,-0.18462,-0.77666,-1.02431,0.20105,-0.1314,-0.98666,-0.64425,-0.05066,0.32791,-0.8074
JU4SIplWZ7,3,P1,1,C10,1,adrenoceptor,-0.41156,-0.51218,-0.68223,0.08644,...,-0.18462,0.42147,-0.04861,0.70795,-0.3394,-0.12678,0.43913,-0.38199,-0.28796,0.55922
pqkTwaHa2L,4,P1,1,C10,1,adrenoceptor,-0.55896,-0.6206,-0.18738,-0.58986,...,1.21538,0.64265,1.05208,-0.00902,-0.39337,0.074,0.31612,0.11273,0.03682,0.32464
ng7c7qtodJ,6,P1,1,C10,1,adrenoceptor,0.99175,0.84967,0.4151,1.16244,...,0.23077,-0.32061,-0.24653,0.58247,2.44783,0.96697,-0.97043,0.47909,0.60464,-0.27565


In [3]:
# Fit PCA on the training features only; `pca_fit` is reused by later cells
# to project the other data splits without refitting.

# Single source of truth for the number of retained components.
N_COMPONENTS = 15

# `random_state` is fixed so that, when scikit-learn's automatic solver choice
# picks a randomized SVD (depends on library version and data shape), rerunning
# the notebook reproduces the same components and the same output CSV.
# It has no effect on the deterministic solvers.
pca_fit = decomposition.PCA(n_components=N_COMPONENTS, random_state=0)
pca_scores = pca_fit.fit_transform(df.loc[:, eb_features])

# Label columns from the width of the transformed array so the names can never
# drift out of sync with the number of components actually produced.
colnames = ['pca_{}'.format(x) for x in range(pca_scores.shape[1])]

pca_df = pd.DataFrame(pca_scores,
                      index=df.index,
                      columns=colnames)

# Swap the original feature columns for their PCA scores, aligning on the index
# and keeping every non-feature column as it was.
pca_final_df = df.drop(eb_features, axis="columns").merge(pca_df, left_index=True, right_index=True)

# Persist with five decimal places.
output_file = os.path.join("data", "pca_train.csv")
pca_final_df.to_csv(output_file, float_format="%.5f")

pca_final_df.head()

Unnamed: 0_level_0,cell_id,plate,replicate,well,field,target,pca_0,pca_1,pca_2,pca_3,...,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OaJHcDs2kh,1,P1,1,C10,1,adrenoceptor,-1.817608,2.668399,0.598139,-0.924694,...,0.812588,-0.368145,-0.22672,-0.78569,-0.900064,-1.347657,-0.139622,0.195779,-0.900752,-0.022522
nwLFF4l070,2,P1,1,C10,1,adrenoceptor,-1.577657,1.743385,-0.674194,0.495618,...,-1.173733,0.531189,-0.630432,-1.063752,-1.270119,-0.448406,0.011268,0.099486,0.310657,-0.091042
JU4SIplWZ7,3,P1,1,C10,1,adrenoceptor,-2.039359,1.039083,-0.666614,-0.435052,...,0.001676,0.337543,0.723472,0.576248,0.610254,-0.311478,0.1215,0.364209,-0.361732,-0.059575
pqkTwaHa2L,4,P1,1,C10,1,adrenoceptor,-1.298555,-2.797526,-1.146916,0.81367,...,0.47046,-1.734189,-0.316328,0.54665,0.093386,0.204931,-0.139654,0.37807,-0.094724,0.134315
ng7c7qtodJ,6,P1,1,C10,1,adrenoceptor,3.094821,1.12003,-0.110399,-1.905335,...,-1.705397,-0.262349,1.302634,-0.226103,-0.519817,-0.372849,0.137574,0.324442,0.299011,1.149425


## Apply the Training-Fitted PCA to the Testing and Validation Sets

In [4]:
# Read the testing table; the first CSV column becomes the row index.
file = os.path.join("data", "testing_feature_select_robust.csv")
test_df = pd.read_csv(file, index_col=0)

# Project the test features with the already-fitted `pca_fit` (transform only,
# no refit), using the same feature list and component labels as before.
test_scores = pd.DataFrame(
    pca_fit.transform(test_df.loc[:, eb_features]),
    index=test_df.index,
    columns=colnames,
)

# Keep the non-feature columns and attach the PCA scores by index.
test_non_features = test_df.drop(columns=eb_features)
pca_test_df = test_non_features.merge(
    test_scores,
    left_index=True,
    right_index=True,
)

# Persist with five decimal places.
output_file = os.path.join("data", "pca_test.csv")
pca_test_df.to_csv(output_file, float_format="%.5f")

pca_test_df.head()

Unnamed: 0_level_0,cell_id,plate,replicate,well,field,target,pca_0,pca_1,pca_2,pca_3,...,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pB2BlQoW94,5,P1,1,C10,1,adrenoceptor,-2.169833,0.566833,-0.140243,0.31968,...,-0.363381,-0.595835,-0.56362,-0.931901,-1.03233,-0.229103,-0.202998,-0.441061,-0.172056,-0.004392
X9f5EAZwSK,95,P1,1,C10,1,adrenoceptor,-2.027952,0.902694,-0.095052,-1.092949,...,0.406752,-0.832799,0.117361,0.08192,-0.24065,-0.269305,-0.186414,0.911838,-0.038208,0.103521
VDmGHDKms7,110,P1,1,C10,1,adrenoceptor,1.410425,0.69559,-0.580646,-0.599862,...,0.806514,-1.415967,0.849543,0.71021,-0.506309,-0.281182,0.009737,0.222729,-1.021626,0.05681
fJZPphpLGW,132,P1,1,C10,1,adrenoceptor,-1.231871,1.603903,0.025352,0.06231,...,0.43943,-0.54024,-0.035095,-1.072998,-0.725132,-0.961485,0.162815,0.13146,-0.57512,-0.054043
XXzcyKRY89,1,P1,1,G12,4,AMPA,2.802743,2.372537,-1.257186,3.104671,...,-0.770131,0.276345,-1.274382,0.177184,0.512512,0.074058,-0.600262,-0.008873,-0.479769,-0.045113


In [5]:
# Read the validation table. Unlike the other splits, the row index is taken
# from the SECOND CSV column (index_col=1); the first column stays a regular
# column and is carried through to the output.
file = os.path.join("data", "validation_feature_select_robust.csv")
val_df = pd.read_csv(file, index_col=1)

# Project the validation features with the already-fitted `pca_fit`
# (transform only, no refit).
val_scores = pd.DataFrame(
    pca_fit.transform(val_df.loc[:, eb_features]),
    index=val_df.index,
    columns=colnames,
)

# Keep the non-feature columns and attach the PCA scores by index.
val_non_features = val_df.drop(columns=eb_features)
pca_val_df = val_non_features.merge(
    val_scores,
    left_index=True,
    right_index=True,
)

# Persist with five decimal places.
output_file = os.path.join("data", "pca_validation.csv")
pca_val_df.to_csv(output_file, float_format="%.5f")

pca_val_df.head()

Unnamed: 0_level_0,well_code,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
cell_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Hmv43N53kD,Ros6NxC0kA,0.481584,0.064508,-0.172455,-1.809265,0.434023,-0.582712,0.084632,0.446478,-0.034903,-0.200489,-0.136876,-0.623003,0.539264,-0.203863,0.056111
B8kmJi71Zt,Ros6NxC0kA,-1.115156,-0.58422,-1.456184,0.870037,1.395533,-0.684877,0.883873,0.205581,-0.559826,-0.495978,0.246687,-0.39533,-0.350918,0.125778,0.253799
lcvffs6oty,Ros6NxC0kA,-1.80796,1.767832,-1.429516,-0.437676,-0.80377,-0.765423,1.008527,0.42756,0.336292,0.301095,-0.178798,-0.867856,0.469347,0.235425,-0.178891
jVHzZ1tdUT,Ros6NxC0kA,-1.161946,0.749557,-1.115783,1.08876,-0.410991,-0.83184,0.067309,-0.161118,-0.944022,0.090092,-0.429363,-0.38017,0.431266,0.091133,-0.003084
Bcx0AaJQ49,Ros6NxC0kA,-1.729369,0.575342,-0.526729,-0.17705,-0.181925,-0.830085,-0.041979,-0.367983,0.821112,-1.247638,0.354467,0.14296,0.10065,-0.304262,-0.042199
