### Set parameters and names

In [14]:
from CustomObjects import *

In [15]:
np.random.seed(42)
tf.random.set_seed(42)
%matplotlib inline

In [16]:
import os

# --- Input selection ---------------------------------------------------------
# The three parts below are joined into INPUT_NAME, which selects the CSV files to load.
INPUT_GENES = 'ALL'
INPUT_FEATURES = 'X_FC'
INPUT_NORM = '_z'
INPUT_NAME = f'{INPUT_GENES}_{INPUT_FEATURES}{INPUT_NORM}'

# One CSV file per data split.
INPUT_TRAIN = f'../data/training/{INPUT_NAME}_training.csv'
INPUT_VAL = f'../data/validation/{INPUT_NAME}_validation.csv'
INPUT_TEST = f'../data/test/{INPUT_NAME}_test.csv'

# --- Model -------------------------------------------------------------------
# Size of the latent code; passed as n_components when the PCA is built.
CODINGS_SIZE = 6

# Identifier of this run; names the output folders below.
MODEL_ID = f'PCA_{CODINGS_SIZE}D_{INPUT_NAME}'

# --- Output folders ----------------------------------------------------------
# Both paths keep their trailing slash because later cells append file names
# directly to them (e.g. f'{DIR_DATA}metrics.csv').
DIR_FIG = f'../figures/{MODEL_ID}/'
DIR_DATA = f'../data/{MODEL_ID}/'

# Create the folders from Python instead of shelling out to `mkdir -p`: this works
# without a POSIX shell and raises immediately if a folder cannot be created,
# rather than failing later when the first file is written.
for _out_dir in (DIR_FIG, DIR_DATA):
    os.makedirs(_out_dir, exist_ok=True)


### Import ChIP and RNA data

In [17]:
# Read the three splits; each CSV is indexed by its 'GENE' column.
X_train, X_val, X_test = (
    pd.read_csv(csv_path).set_index('GENE')
    for csv_path in (INPUT_TRAIN, INPUT_VAL, INPUT_TEST)
)


In [18]:

# Stack the three splits into one frame, ordered by index label.
X = pd.concat([X_test, X_train, X_val]).sort_index()
# Sanity check: no column may contain a missing value.
assert not X.isna().any().any()

In [19]:
# Report (rows, columns) for the training, validation and test splits, in that order.
for split_frame in (X_train, X_val, X_test):
    print(split_frame.shape)

(10796, 38)
(2700, 38)
(1500, 38)


# PCA

In [20]:
def _summarise_metric(values):
    """Return 'mean [p5-p95]' for a 1-D array, every number shown with two decimals."""
    mean = np.round(float(np.mean(values)), 2)
    low, high = np.round(np.percentile(values, [5, 95]), 2)
    # Fixed two-decimal formatting keeps trailing zeros ('0.80', not '0.8'),
    # so all rows of the metrics table line up.
    return f'{mean:.2f} [{low:.2f}-{high:.2f}]'


def DR_transform(X, model, set_label, METHOD):
    """Project X through a fitted dimensionality-reduction model and score the reconstruction.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples in rows, features in columns. Its index and column names are
        reused to label the returned frame.
    model : object
        Fitted estimator exposing ``transform`` and ``inverse_transform``
        (for example a scikit-learn ``PCA``).
    set_label : str
        Name of the data split; written to the ``SET`` column and to the summary.
    METHOD : str
        Prefix of the latent-code columns (``f'{METHOD}1'``, ``f'{METHOD}2'``, ...).

    Returns
    -------
    R : pandas.DataFrame
        Indexed like ``X``. Columns, in order: the reconstructed features, the
        latent code, per-row ``RMSE``, per-row ``Sc`` (cosine similarity between
        an input row and its reconstruction) and ``SET``.
    results : dict
        Keys ``'Set'``, ``'Sc'`` and ``'RMSE'``. The two metrics are strings of
        the form ``'mean [5th percentile-95th percentile]'`` with two decimals.
    """
    # Encode X into the latent space, then decode back to feature space.
    embedding = model.transform(X)
    reconstruction = model.inverse_transform(embedding)

    # Per-row cosine similarity. The Keras loss yields the negated similarity,
    # hence the sign flip. Inputs are cast to float32 as in the loss call below.
    cosine_loss = tf.keras.losses.CosineSimilarity(reduction='none', axis=1)
    Sc = -np.array(
        cosine_loss(
            np.array(X, dtype='float32'),
            np.array(reconstruction, dtype='float32'),
        )
    )

    # Per-row root-mean-square error (square root of the per-row MSE).
    RMSE = np.sqrt(np.array(tf.keras.losses.MSE(X, reconstruction)))

    errors = pd.DataFrame({'RMSE': RMSE, 'Sc': Sc})

    # Latent code, one column per component, numbered from 1.
    code_columns = [f'{METHOD}{i}' for i in range(1, embedding.shape[1] + 1)]
    codes = pd.DataFrame(embedding, columns=code_columns)

    # Re-attach feature names, then the row labels of X. The three frames all
    # carry a default positional index, so the column-wise concat pairs rows by position.
    R = pd.DataFrame(reconstruction, columns=X.columns.to_list())
    R = pd.concat((R, codes, errors), axis=1)
    R = R.set_index(X.index)
    R['SET'] = set_label

    # Summary of both metrics for this split, returned as a dict.
    results = {
        'Set': set_label,
        'Sc': _summarise_metric(Sc),
        'RMSE': _summarise_metric(RMSE),
    }

    return R, results

In [21]:
# Fit a PCA with CODINGS_SIZE components on the training split only.
from sklearn.decomposition import PCA

# random_state is fixed so that, if scikit-learn selects a randomized SVD solver,
# the fitted components do not depend on how much of NumPy's global random stream
# earlier cells have consumed (i.e. on the order in which cells were executed).
pca_model = PCA(n_components=CODINGS_SIZE, random_state=42)
pca_model.fit(X_train)


In [22]:
# Encode and reconstruct every split with the PCA fitted on the training data.
R_train, metrics_train = DR_transform(X_train, pca_model, set_label='training', METHOD='PCA')
R_val, metrics_val = DR_transform(X_val, pca_model, set_label='validation', METHOD='PCA')
R_test, metrics_test = DR_transform(X_test, pca_model, set_label='test', METHOD='PCA')

# Single frame holding all splits, ordered by index label.
R_X = pd.concat([R_train, R_val, R_test]).sort_index()

# One row of summary metrics per split: written to disk, then shown as the cell output.
metrics_df = pd.DataFrame([metrics_train, metrics_val, metrics_test])
metrics_df.to_csv(f'{DIR_DATA}metrics.csv', index=False)
metrics_df


Unnamed: 0,Set,Sc,RMSE
0,training,0.93 [0.79-0.99],0.29 [0.16-0.48]
1,validation,0.93 [0.8-0.99],0.28 [0.16-0.48]
2,test,0.93 [0.8-0.99],0.29 [0.16-0.48]


In [23]:

# Input and reconstruction must list the same rows in the same order.
assert (X.index == R_X.index).all()

# Work on copies; DF additionally receives the split label of every row.
R_DF = R_X.copy()
DF = X.assign(SET=R_DF['SET'])

# Persist the reconstruction first, then the labelled input.
R_DF.to_csv(f'{DIR_DATA}R_DF.csv')
DF.to_csv(f'{DIR_DATA}DF.csv')

In [None]:
# Plot both reconstruction-error columns of R_DF; each figure is saved under DIR_FIG.
# violins_error and SET_COL_DICT are not defined in this notebook (presumably they
# come from the CustomObjects star import).
violins_error(
    R_DF,
    ERROR_COL='RMSE',
    SET_COL_DICT=SET_COL_DICT,
    SAVEFIG=f'{DIR_FIG}/RMSE_violins.png',
)
violins_error(
    R_DF,
    ERROR_COL='Sc',
    SET_COL_DICT=SET_COL_DICT,
    SAVEFIG=f'{DIR_FIG}/Sc_violins.png',
)

In [None]:
# Flatten the per-key gene lists of MARKER_GENES into one list (order and repeats kept).
SEL = []
for genes_list in MARKER_GENES.values():
    SEL.extend(genes_list)

# Heatmaps of input vs. reconstruction for the selected genes.
X_heatmaps(
    DF, R_DF, SEL, HM_LIST, SET_COL_DICT, HM_COL_DICT, CT_COL_DICT, DIR_FIG,
    cmap='viridis', TITLE='SELECTED', relative_range=False, vmin=-2.5, vmax=2.5,
)

# Error summaries for the same genes, once per error column.
for error_col in ('Sc', 'RMSE'):
    Sc_selected(R_DF, SEL, error_col, 'SEL', DIR_FIG)

FC_heatmaps(
    DF, R_DF, SEL, HM_LIST, HM_COL_DICT, DIR_FIG,
    fc_cmap='RdBu_r', TITLE='SELECTED', vmax=3, vmin=-3,
)

In [None]:
# For every entry of MARKER_GENES_EXT: error summaries for both error columns,
# followed by the input-vs-reconstruction heatmaps, titled with the entry's key.
for CT, MARKERS in MARKER_GENES_EXT.items():
    for error_col in ('Sc', 'RMSE'):
        Sc_selected(R_DF, MARKERS, error_col, CT, DIR_FIG)
    X_heatmaps(
        DF, R_DF, MARKERS, HM_LIST, SET_COL_DICT, HM_COL_DICT, CT_COL_DICT, DIR_FIG,
        cmap='viridis', TITLE=CT, vmax=2.5, vmin=-2.5,
    )

In [None]:
# One FC heatmap call per entry of MARKER_GENES_EXT, titled with the entry's key.
for CT, MARKERS in MARKER_GENES_EXT.items():
    FC_heatmaps(
        DF, R_DF, MARKERS, HM_LIST, HM_COL_DICT, DIR_FIG,
        fc_cmap='RdBu_r', TITLE=CT, vmax=3, vmin=-3,
    )

# Re-train on the whole dataset

In [28]:
# Refit the PCA on all rows (training + validation + test); this rebinds pca_model.
# random_state is fixed so that, if scikit-learn selects a randomized SVD solver,
# the refit does not depend on how much of NumPy's global random stream earlier
# cells (including the first fit) have already consumed.
pca_model = PCA(n_components=CODINGS_SIZE, random_state=42)
pca_model.fit(X)

In [29]:
# Score the retrained model on the full dataset. The label stays 'training'
# because DR_transform also writes it to the SET column of R_DF_WHOLE.
# NOTE(review): the metrics table therefore has two rows whose Set is 'training'
# (training split vs. whole dataset); consider a distinct label if the plotting
# helpers and SET_COL_DICT accept one.
R_DF_WHOLE, metrics_whole = DR_transform(X, pca_model, 'training', 'PCA')
metrics_w = pd.DataFrame([metrics_whole])

# Append the whole-dataset row to the per-split metrics. ignore_index renumbers the
# rows 0..n-1; without it the appended row would reuse index label 0.
metrics_df_final = pd.concat([metrics_df, metrics_w], ignore_index=True)

# Save (this overwrites the per-split metrics.csv written earlier) and display.
metrics_df_final.to_csv(f'{DIR_DATA}metrics.csv', index=False)
metrics_df_final


Unnamed: 0,Set,Sc,RMSE
0,training,0.93 [0.79-0.99],0.29 [0.16-0.48]
1,validation,0.93 [0.8-0.99],0.28 [0.16-0.48]
2,test,0.93 [0.8-0.99],0.29 [0.16-0.48]
0,training,0.93 [0.79-0.99],0.29 [0.16-0.48]


In [None]:

# Persist the whole-dataset reconstruction, then plot its 'Sc' column.
R_DF_WHOLE.to_csv(f'{DIR_DATA}R_DF_WHOLE.csv')
violins_error(
    R_DF_WHOLE,
    ERROR_COL='Sc',
    SET_COL_DICT=SET_COL_DICT,
    SAVEFIG=f'{DIR_FIG}/Sc_violins_retrain.png',
)