In [89]:
# Label lists used to build the per-column prefixes below.
# NOTE(review): CT / HM presumably stand for cell type / histone mark
# (the last HM entry is RNA) -- confirm.
CT_LIST = ['ESC', 'MES', 'CP', 'CM']
HM_LIST = ['H3K4me3', 'H3K27ac', 'H3K27me3', 'RNA']

# Every '<HM>_<CT>' combination; the HM label varies slowest, CT fastest.
PREFIXES = [f'{mark}_{cell}' for mark in HM_LIST for cell in CT_LIST]


# Twelve gene symbols per label; the keys are the same labels as in CT_LIST.
MARKER_GENES_EXT = {
    'ESC': [
        'Nanog', 'Pou5f1', 'Sox2', 'L1td1', 'Dppa5a', 'Tdh',
        'Esrrb', 'Lefty1', 'Zfp42', 'Sfn', 'Lncenc1', 'Utf1',
    ],
    'MES': [
        'Mesp1', 'Mesp2', 'T', 'Vrtn', 'Dll3', 'Dll1',
        'Evx1', 'Cxcr4', 'Pcdh8', 'Pcdh19', 'Robo3', 'Slit1',
    ],
    'CP': [
        'Sfrp5', 'Gata5', 'Tek', 'Hbb-bh1', 'Hba-x', 'Pyy',
        'Sox18', 'Lyl1', 'Rgs4', 'Igsf11', 'Tlx1', 'Ctse',
    ],
    'CM': [
        'Nppa', 'Gipr', 'Actn2', 'Coro6', 'Col3a1', 'Bgn',
        'Myh6', 'Myh7', 'Tnni3', 'Hspb7', 'Igfbp7', 'Ndrg2',
    ],
}



# Hex colour lookups, one dictionary per kind of label.
HM_COL_DICT = {
    'H3K4me3': '#f37654',
    'H3K27ac': '#b62a77',
    'H3K27me3': '#39A8AC',
    'RNA': '#ED455C',
}
CT_COL_DICT = {
    'ESC': '#405074',
    'MES': '#7d5185',
    'CP': '#c36171',
    'CM': '#eea98d',
}
SET_COL_DICT = {
    'training': '#97DA58',
    'validation': '#9b58da',
    'test': '#DA5A58',
}
GONZALEZ_COL_DICT = {
    'Active': '#E5AA44',
    'Bivalent': '#7442BE',
}

In [None]:
import plotly.express as px
import pandas as pd
import pickle



# Load the gene-cluster dictionary: cluster ID -> record holding a 'gene_list'.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted pipeline.
with open('./data/gene_clusters_dict.pkl', 'rb') as f:
    GENE_CLUSTERS = pickle.load(f)

# Load the CODE and LOG matrices, both indexed by the GENE column.
# CODE keeps the columns from position 18 onward, minus the AE/PCA/UMAP
# score columns. The chained form returns a new frame instead of mutating
# the iloc slice in place.
CODE = (
    pd.read_csv('./data/CODE.csv', index_col='GENE')
    .iloc[:, 18:]
    .drop(columns=['AE_RMSE', 'AE_Sc', 'PCA_RMSE', 'PCA_Sc', 'UMAP_RMSE', 'UMAP_Sc'])
)
LOG = pd.read_csv('./data/ALL_X_FC.csv').set_index('GENE')

# Build gene -> cluster ID. A gene listed in several clusters keeps the last
# cluster seen. Only CODE is annotated below; LOG is loaded but left as-is.
gene_to_cluster = {}
for cluster_id, cluster_info in GENE_CLUSTERS.items():
    for gene in cluster_info['gene_list']:
        gene_to_cluster[gene] = cluster_id

# Genes without a cluster map to NaN, which cannot be cast to int. Fail with
# a message naming the offending genes rather than an opaque casting error.
cluster_ids = CODE.index.map(gene_to_cluster)
unmapped_genes = CODE.index[cluster_ids.isna()]
if len(unmapped_genes) > 0:
    raise ValueError(
        f"{len(unmapped_genes)} gene(s) in CODE have no cluster in GENE_CLUSTERS, "
        f"e.g. {list(unmapped_genes[:5])}"
    )
CODE["Cluster"] = cluster_ids.astype(int)


In [85]:
N_TOP = 4000

# Category -> gene list, read from the TOP<N_TOP> directory.
name = 'TOP'
with open(f'./data/RNA_CV/{name}{N_TOP}/dict.pkl', 'rb') as f:
    CV = pickle.load(f)

# The same kind of dictionary from the BOTTOM<N_TOP> directory.
name = 'BOTTOM'
with open(f'./data/RNA_CV/{name}{N_TOP}/dict.pkl', 'rb') as f:
    BOTTOM_CV = pickle.load(f)

# Pool every BOTTOM gene list into one extra 'STABLE' category inside CV.
STABLE = []
for bottom_genes in BOTTOM_CV.values():
    STABLE.extend(bottom_genes)
CV['STABLE'] = STABLE

# gene -> category. A gene present in several lists keeps the last category
# seen, so 'STABLE' (inserted last) overrides earlier ones.
CV_MAP = {
    gene: category
    for category, genes in CV.items()
    for gene in genes
}

# Label each gene in CODE; genes found in no list become 'other'.
CODE['CV_Category'] = CODE.index.map(CV_MAP)
CODE['CV_Category'] = CODE['CV_Category'].fillna('other')

In [86]:
# Category -> gene list, read from the Gonzalez pickle.
with open('./data/gonzalez_dict.pkl', 'rb') as f:
    GONZALEZ = pickle.load(f)

# gene -> category; a gene present in several lists keeps the last one seen.
GONZALEZ_MAP = {
    gene: category
    for category, genes in GONZALEZ.items()
    for gene in genes
}

# Label each gene in CODE ('other' when it appears in no list), then show
# how many genes fall into each category.
CODE['ESC_ChromState_Gonzalez2021'] = CODE.index.map(GONZALEZ_MAP)
CODE['ESC_ChromState_Gonzalez2021'] = CODE['ESC_ChromState_Gonzalez2021'].fillna('other')
CODE['ESC_ChromState_Gonzalez2021'].value_counts()

ESC_ChromState_Gonzalez2021
Active      9186
other       3495
Bivalent    2315
Name: count, dtype: int64

In [88]:
# Write the annotated CODE table to disk, then display its column labels.
CODE.to_csv('./data/DATA.csv')
CODE.columns


Index(['VAE1', 'VAE2', 'VAE3', 'VAE4', 'VAE5', 'VAE6', 'H3K4me3_ESC',
       'H3K4me3_MES', 'H3K4me3_CP', 'H3K4me3_CM', 'H3K27ac_ESC', 'H3K27ac_MES',
       'H3K27ac_CP', 'H3K27ac_CM', 'H3K27me3_ESC', 'H3K27me3_MES',
       'H3K27me3_CP', 'H3K27me3_CM', 'RNA_ESC', 'RNA_MES', 'RNA_CP', 'RNA_CM',
       'RNA_CM_CP_FC', 'RNA_CM_MES_FC', 'RNA_CM_ESC_FC', 'RNA_CP_MES_FC',
       'RNA_CP_ESC_FC', 'RNA_MES_ESC_FC', 'VAE_RMSE', 'VAE_Sc', 'RNA_CV',
       'VAE_UMAP1', 'VAE_UMAP2', 'Cluster', 'CV_Category',
       'ESC_ChromState_Gonzalez2021'],
      dtype='object')

In [41]:
# The six VAE columns (VAE1..VAE6), selected by label instead of by position.
# If CODE's column layout changes, this raises a KeyError rather than
# silently returning different columns.
VAE_CODE = CODE.loc[:, [f'VAE{i}' for i in range(1, 7)]]
VAE_CODE

Unnamed: 0_level_0,VAE1,VAE2,VAE3,VAE4,VAE5,VAE6
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610005C13Rik,-10.469733,349.438200,-161.42465,-90.162674,-202.413000,49.895590
0610010F05Rik,46.057980,192.496380,-229.76756,13.015251,-27.973946,50.193714
0610010K14Rik,135.278000,101.991010,-273.49344,5.428470,-208.242810,-15.913120
0610012G03Rik,104.669460,102.431780,-294.82028,-33.677746,-171.845900,25.567083
0610030E20Rik,-1.907172,54.007175,-178.82239,-18.223375,-114.121640,75.195870
...,...,...,...,...,...,...
Zyg11b,43.561104,128.600080,-250.67589,29.285270,-111.688960,49.730457
Zyx,51.048225,95.455280,-347.88043,59.476078,-137.262040,12.057181
Zzef1,125.321970,108.019190,-290.48760,18.553220,-45.898750,130.741070
Zzz3,131.600080,88.010120,-256.74036,50.287690,-147.886280,97.206310


In [42]:
# The 16 '<HM>_<CT>' columns, selected through PREFIXES (defined with the
# other constants) instead of the positional slice 6:22. The selection stays
# tied to the shared label list and raises a KeyError if a column is missing.
Z_AVG = CODE.loc[:, PREFIXES]
Z_AVG

Unnamed: 0_level_0,H3K4me3_ESC,H3K4me3_MES,H3K4me3_CP,H3K4me3_CM,H3K27ac_ESC,H3K27ac_MES,H3K27ac_CP,H3K27ac_CM,H3K27me3_ESC,H3K27me3_MES,H3K27me3_CP,H3K27me3_CM,RNA_ESC,RNA_MES,RNA_CP,RNA_CM
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0610005C13Rik,-1.341827,-2.274993,-2.001007,-2.347703,0.078994,-0.778873,-0.512341,-0.845164,-1.665598,-0.743799,-0.499390,-0.950986,-0.562474,-0.824653,-0.619771,-0.878567
0610010F05Rik,0.349394,0.285977,0.350528,0.312501,-0.083182,0.277252,-0.300362,-0.199833,0.453954,-0.223176,0.146651,-0.043748,-0.427455,-0.457987,-0.488086,-0.677222
0610010K14Rik,0.766511,0.641317,1.026444,0.957559,0.908110,0.752422,1.449594,1.153821,-1.520016,-1.498119,-0.916976,-0.864998,1.380881,1.556845,1.347083,1.115949
0610012G03Rik,0.932623,1.047206,1.029020,0.975399,0.552571,0.887095,1.036941,0.908532,-1.530310,-0.897432,-0.910333,-0.999972,0.649590,0.908545,0.869985,1.313180
0610030E20Rik,0.320979,0.397582,0.187307,0.229276,0.320144,0.726681,0.575277,0.907163,-0.356712,-1.758459,-0.985048,-0.998849,-0.461541,-0.011409,-0.110904,0.053040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zyg11b,0.152709,0.896787,0.668738,0.394286,0.150801,1.091068,0.509513,0.387461,-0.376836,-0.315916,-0.824109,-0.783861,-0.221866,0.352127,0.034911,0.048833
Zyx,0.362277,0.558916,0.962010,0.765624,-0.058876,1.043048,1.551018,1.250911,1.265505,-1.159680,-0.642142,-0.478176,0.503472,0.989409,1.200964,1.188980
Zzef1,1.161443,1.027133,1.053159,1.172360,0.979192,1.116591,1.006632,1.326793,0.013620,-0.196379,-0.292896,-0.378813,-0.241298,-0.385902,-0.258751,-0.037254
Zzz3,0.913719,0.778364,0.884672,0.747808,1.539586,1.482397,1.711849,1.382595,-0.996441,-1.010890,-0.735883,-1.264653,0.270440,0.418342,0.435115,0.094259


In [43]:
# Display the labels of every CODE column from position 22 onward.
CODE.columns[22:]

Index(['RNA_CM_CP_FC', 'RNA_CM_MES_FC', 'RNA_CM_ESC_FC', 'RNA_CP_MES_FC',
       'RNA_CP_ESC_FC', 'RNA_MES_ESC_FC', 'VAE_RMSE', 'VAE_Sc', 'RNA_CV',
       'VAE_UMAP1', 'VAE_UMAP2', 'GMM_VAE_80'],
      dtype='object')

In [39]:
# Read the RNA FPKM table, using its GENE column as the index.
RNA_FPKM = pd.read_csv('./data/RNA_FPKMs.csv', index_col='GENE')