### Set parameters and names

In [None]:
from CustomObjects import *

In [2]:
np.random.seed(42)
tf.random.set_seed(42)
%matplotlib inline

In [3]:
INPUT_GENES    = 'ALL'
INPUT_FEATURES = 'X_FC'
INPUT_NORM     = '_z'
INPUT_NAME     = f'{INPUT_GENES}_{INPUT_FEATURES}{INPUT_NORM}'
INPUT_TRAIN   =f'../data/training/{INPUT_NAME}_training.csv'
INPUT_VAL   = f'../data/validation/{INPUT_NAME}_validation.csv'
INPUT_TEST    =f'../data/test/{INPUT_NAME}_test.csv'
#
CODINGS_SIZE = 6
#
#MODEL_ID=f'VAE1_{CODINGS_SIZE}D_{CHIP_NORM}_{INPUT_NAME}'
MODEL_ID=f'UMAP_{CODINGS_SIZE}D_{INPUT_NAME}'

DIR_FIG=f'../figures/{MODEL_ID}/'
DIR_DATA=f'../data/{MODEL_ID}/'
! mkdir -p {DIR_FIG}
! mkdir -p {DIR_DATA}


### Import ChIP and RNA data

In [4]:
# Import input data 
X_train = pd.read_csv( INPUT_TRAIN ).set_index('GENE')
X_val = pd.read_csv( INPUT_VAL ).set_index('GENE')
X_test = pd.read_csv( INPUT_TEST ).set_index('GENE')


In [5]:

X = pd.concat((X_test,X_train,X_val)).sort_index()
assert X.isna().any().sum() == 0

In [6]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(10796, 38)
(2700, 38)
(1500, 38)


# UMAP

In [7]:
def DR_transform(X, model, set_label, METHOD):
    # Reconstruct X -> R
    embedding = model.transform(X)
    R = model.inverse_transform(embedding)
    
    # Get errors
    Sc = -np.array(tf.keras.losses.CosineSimilarity(reduction='none', axis=1)(np.array(X, dtype='float32'), np.array(R, dtype='float32')))
    Sc_mean = np.round(float(Sc.mean()), 2)
    Sc_perc = np.round(np.percentile(Sc, [5, 95]), 2)
    #print(f"Sc (mean, [5th-95th]) {set_label}: {Sc_mean} [{Sc_perc[0]}-{Sc_perc[1]}]")
    
    RMSE = np.sqrt(np.array(tf.keras.losses.MSE(X, R)))
    RMSE_mean = np.round(float(RMSE.mean()), 2)
    RMSE_perc = np.round(np.percentile(RMSE, [5, 95]), 2)
    #print(f"MSE (mean, [5th-95th]) {set_label}: {MSE_mean} [{MSE_perc[0]}-{MSE_perc[1]}]")

    Err = pd.DataFrame({'RMSE': RMSE, 'Sc': Sc})

    # Get latent code
    C = pd.DataFrame(embedding, columns=[f'{METHOD}{x}' for x in range(1, embedding.shape[1] + 1)])

    # Re-annotate Rec. with feature names and index
    FEATURE_NAMES = X.columns.to_list()
    R = pd.DataFrame(R, columns=FEATURE_NAMES)
    R = pd.concat((R, C, Err), axis=1)
    R = R.set_index(X.index)
    R['SET'] = set_label

    # Return results as a list
    results = {
        'Set': set_label,
        'Sc':       f'{Sc_mean} [{Sc_perc[0]}-{Sc_perc[1]}]',
        'RMSE': f'{RMSE_mean} [{RMSE_perc[0]}-{RMSE_perc[1]}]',
    }
    
    return R, results




In [9]:
# train a UMAP model on X_train
umap_model = umap.UMAP(n_components= CODINGS_SIZE, 
                        random_state=42,
                        #metric='correlation',
                        min_dist=0.0,
                        n_neighbors=30,
                        )
umap_model.fit(X_train)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [10]:
# Transform X_train
R_train,metrics_train = DR_transform(X_train, umap_model, 'training', 'UMAP')
R_val,metrics_val =     DR_transform(X_val, umap_model, 'validation', 'UMAP')
R_test,metrics_test =   DR_transform(X_test, umap_model, 'test', 'UMAP')

# Combine results
R_X = pd.concat((R_train, R_val, R_test)).sort_index()

# Combine metrics
metrics_df = pd.DataFrame([metrics_train, metrics_val, metrics_test])
# Display metrics
metrics_df.to_csv(f'{DIR_DATA}metrics.csv', index=False)
metrics_df


2024-07-04 19:21:31.029201: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-04 19:21:31.029690: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



Unnamed: 0,Set,Sc,RMSE
0,training,0.89 [0.69-0.98],0.38 [0.19-0.7]
1,validation,0.71 [-0.48-0.98],0.51 [0.2-1.26]
2,test,0.68 [-0.56-0.98],0.53 [0.2-1.3]


In [11]:

assert (X.index==R_X.index).all() 

DF = X.copy()
R_DF = R_X.copy()
DF['SET'] = R_DF['SET']

R_DF.to_csv(f'{DIR_DATA}R_DF.csv')
DF.to_csv(f'{DIR_DATA}DF.csv')

In [None]:
violins_error(R_DF,ERROR_COL='RMSE',SET_COL_DICT=SET_COL_DICT,SAVEFIG=f'{DIR_FIG}/RMSE_violins.png')
violins_error(R_DF,ERROR_COL='Sc',SET_COL_DICT=SET_COL_DICT,SAVEFIG=f'{DIR_FIG}/Sc_violins.png')

In [None]:
SEL = [gene for genes_list in MARKER_GENES.values() for gene in genes_list]
X_heatmaps(DF,R_DF,SEL,HM_LIST,SET_COL_DICT,HM_COL_DICT,CT_COL_DICT, DIR_FIG, cmap='viridis',TITLE='SELECTED',
            relative_range=False, vmin=-2.5,vmax=2.5 )
Sc_selected(R_DF,SEL,'Sc','SEL', DIR_FIG)
Sc_selected(R_DF,SEL,'RMSE','SEL', DIR_FIG)
FC_heatmaps(DF,R_DF,SEL,HM_LIST,HM_COL_DICT, DIR_FIG,fc_cmap='RdBu_r',TITLE='SELECTED',vmax=3,vmin=-3)

In [None]:
for CT, MARKERS in MARKER_GENES_EXT.items():
    Sc_selected(R_DF,MARKERS,'Sc',CT, DIR_FIG)
    Sc_selected(R_DF,MARKERS,'RMSE',CT, DIR_FIG)
    X_heatmaps(DF,R_DF,MARKERS,HM_LIST,SET_COL_DICT,HM_COL_DICT,CT_COL_DICT, DIR_FIG, cmap='viridis',TITLE=CT,vmax=2.5,vmin=-2.5)

In [None]:
for CT, MARKERS in MARKER_GENES_EXT.items():
    FC_heatmaps(DF,R_DF,MARKERS,HM_LIST,HM_COL_DICT, DIR_FIG,fc_cmap='RdBu_r',TITLE=CT,vmax=3,vmin=-3)

# Re-train on the whole dataset

In [16]:
# train a UMAP model on X_train
umap_model = umap.UMAP(n_components=CODINGS_SIZE, 
                        random_state=42,
                        #metric='correlation',
                        min_dist=0.0,
                        n_neighbors=30,
                        )
umap_model.fit(X)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [17]:
R_DF_WHOLE,metrics_w = DR_transform(X, umap_model, 'training','UMAP')
metrics_w = pd.DataFrame([metrics_w])
# Combine metrics
metrics_df_final = pd.concat([metrics_df, metrics_w])
# Save and Display metrics
metrics_df_final.to_csv(f'{DIR_DATA}metrics.csv', index=False)
metrics_df_final


Unnamed: 0,Set,Sc,RMSE
0,training,0.89 [0.69-0.98],0.38 [0.19-0.7]
1,validation,0.71 [-0.48-0.98],0.51 [0.2-1.26]
2,test,0.68 [-0.56-0.98],0.53 [0.2-1.3]
0,training,0.89 [0.7-0.98],0.37 [0.19-0.68]


In [None]:

R_DF_WHOLE.to_csv(f'{DIR_DATA}R_DF_WHOLE.csv')
violins_error(R_DF_WHOLE,ERROR_COL='Sc',SET_COL_DICT=SET_COL_DICT,SAVEFIG=f'{DIR_FIG}/Sc_violins_retrain.png')