### Import libraries

In [None]:
from CustomObjects import *

In [None]:
# Create the output folders used by this notebook (-p: no error if they already exist).
! mkdir -p ../data/
! mkdir -p ../data/matrices
! mkdir -p ../data/removed_genes
! mkdir -p ../figures/preprocessing
! mkdir -p ../figures/preprocessing/PCA
! mkdir -p ../figures/RNA_CV
! mkdir -p ../figures/gene_trend
# Show matplotlib figures in the notebook output.
%matplotlib inline

In [None]:
# Export one stand-alone legend PDF per colour dictionary.
export_legend(CT_COL_DICT, filename='../figures/legend_CT.pdf', MK_SIZE=9)
export_legend(SET_COL_DICT, filename='../figures/legend_SET.pdf', MK_SIZE=9)
export_legend(HM_COL_DICT, filename='../figures/legend_HM.pdf', MK_SIZE=9)
# Variant of the histone-mark legend that also lists WCE.
HM_COL_DICT['WCE'] = '#B1BBBA'
export_legend(HM_COL_DICT, filename='../figures/legend_HM2.pdf', MK_SIZE=9)
# Remove WCE by key. popitem() drops whichever entry happens to be last, which is
# WCE only when the assignment above appended a new key rather than updating one.
HM_COL_DICT.pop('WCE')
# Feature legend: histone marks plus RNA, without WCE.
HM_COL_DICT['RNA'] = '#ED455C'
export_legend(HM_COL_DICT, filename='../figures/legend_FEAT.pdf', MK_SIZE=9)
# Put WCE back so the colour is available to the plots further down.
HM_COL_DICT['WCE'] = '#B1BBBA'


In [None]:
# DISTANCE: half-width of the promoter window (the ChIP cell checks END-START == 2*DISTANCE).
# PSEUDOCOUNT: constant added to ChIP signal before ratios / log scales.
DISTANCE, PSEUDOCOUNT = 2500, 1

METADATA = pd.read_csv("../../01_Mapping/data/ChIP_NARROW.csv")
# Sanity checks: no missing value anywhere, and both sample identifiers are unique.
assert not METADATA.isna().any().any()
assert METADATA['SAMPLE_ID'].is_unique
assert METADATA['SAMPLE_ID_SHORT'].is_unique
# Cell types in order of first appearance; targets in sorted order.
CT_LIST = list(METADATA['CELL_TYPE'].unique())
HM_LIST = list(METADATA['TARGET'].sort_values().unique())


In [None]:
# Histone-mark targets passed to the ChIP PCA plots and used to build the ChIP prefixes below.
HM_LIST2 = ['H3K27ac', 'H3K27me3',  'H3K4me3']

# RNA-seq

### Import FPKMs

In [None]:
from itertools import combinations

# Every unordered pair of cell types, and the two pairs whose FPKM tables are read here.
COMB = list(combinations(CT_LIST, 2))
COMB2 = (('ESC', 'MES'), ('CP', 'CM'))

# Read one FPKM table per pair, join them column-wise and tag every column as RNA.
RNA = pd.concat(
    [pd.read_table(f"../../02_DESeq/Results/fpkm/{FIRST}_{SECOND}.txt") for FIRST, SECOND in COMB2],
    axis="columns",
).add_prefix('RNA_')

RNA.to_csv(f'../data/matrices/RNA_FPKM{DISTANCE}.csv', index_label='GENE')
# Keep an untouched copy: RNA itself is filtered in the following cells.
RNA_MAIN = RNA.copy()

RNA

### Filtering (Replicates AVG FPKMs > 0.5 in at least 1 CT)

In [None]:
# One column prefix per cell type ('RNA_' + cell type).
RNA_PREFIXES = ['RNA_' + CT for CT in CT_LIST]
# Per-prefix averages from the CustomObjects helper (presumably the replicate mean per cell type).
RNA_AVG = calculate_mean_features(RNA, RNA_PREFIXES)
# Keep a gene when at least one averaged column exceeds 0.5.
EXPRESSED = (RNA_AVG > 0.5).any(axis=1)
RNA_AVG = RNA_AVG[EXPRESSED]
RNA = RNA.loc[RNA_AVG.index]
RNA

### Compute Coeff. of Variation
##### Only computed for genes whose FPKM, averaged across CTs and replicates, is > 0.5

In [None]:
from scipy.stats import gaussian_kde

# Filter the RNA_AVG for mean expression > 0.5
# (RNA_AVG is overwritten, so later cells see only these genes)
RNA_AVG = RNA_AVG[(RNA_AVG.mean(axis=1) > 0.5)]

# Compute coefficient of variation (CV) per gene across the averaged columns
RNA_CV = RNA_AVG.std(axis=1) / RNA_AVG.mean(axis=1)

# Sort CV values ascending and take the N_TOP lowest / N_TOP highest
# (the two slices overlap if fewer than 2*N_TOP genes remain)
N_TOP = 4000
sorted_CV = RNA_CV.sort_values(ascending=True)
BOTTOM_CV = sorted_CV[:N_TOP]
TOP_CV = sorted_CV[-N_TOP:]

# Create KDE data manually using scipy gaussian_kde, evaluated on 1000 points over the CV range
kde = gaussian_kde(RNA_CV)
x_data = np.linspace(RNA_CV.min(), RNA_CV.max(), 1000)
y_data = kde(x_data)

# Create the plot
plt.figure(figsize=(3, 2))

# Plot the full KDE
plt.plot(x_data, y_data, color='silver', label='All genes')

# Shade the bottom-N_TOP CV range (silver, alpha 0.8)
plt.fill_between(x_data, y_data, where=(x_data <= BOTTOM_CV.max()), color='silver', alpha=0.8, )

# Shade the top-N_TOP CV range (silver, alpha 0.8)
plt.fill_between(x_data, y_data, where=(x_data >= TOP_CV.min()), color='silver', alpha=0.8, )

# Shade the middle range more lightly (silver, alpha 0.2)
plt.fill_between(x_data, y_data, where=((x_data > BOTTOM_CV.max()) & (x_data < TOP_CV.min())), color='silver', alpha=0.2)

# Axis labels
plt.xlabel('CV')
plt.ylabel('Density')

# Title, grid and spines
plt.title(f'CV on gene expression (n={len(RNA_CV)})')
plt.gca().xaxis.grid(False)  # Disable x-axis grid
sns.despine(left=True,bottom=True)  # Remove left and bottom spines
plt.savefig('../figures/RNA_CV/CV_distribution.pdf', format="pdf", bbox_inches="tight");


### Define the BOTTOM4000 and TOP4000 genes by CV as the STABLE and VARIABLE genes, respectively

In [None]:
N_TOP = 4000
# Rank the genes once by ascending CV; the two ends of the ranking give the two sets.
CV_RANKED_GENES = RNA_CV.sort_values(ascending=True).index

# STABLE genes: the N_TOP lowest CVs.
BOTTOM_CV = RNA_AVG.loc[CV_RANKED_GENES[:N_TOP]]
violins(
    np.log10(BOTTOM_CV + 1),
    COL_DICT=CT_COL_DICT,
    SAVEFIG=f'../figures/RNA_CV/BOTTOM{N_TOP}.png',
    X_LAB=f"log{subscript_get('10')}(FPKM+1)",
    TITLE=f'STABLE genes (Bottom{N_TOP} CV)',
)

# VARIABLE genes: the N_TOP highest CVs.
TOP_CV = RNA_AVG.loc[CV_RANKED_GENES[-N_TOP:]]
violins(
    np.log10(TOP_CV + 1),
    COL_DICT=CT_COL_DICT,
    SAVEFIG=f'../figures/RNA_CV/TOP{N_TOP}.png',
    X_LAB=f"log{subscript_get('10')}(FPKM+1)",
    TITLE=f'VARIABLE genes (Top{N_TOP} CV)',
)

### Extract the CT with maximum expression for each gene in VARIABLE (STABLE only as control)

In [None]:
# Per the titles: how often each cell type is the per-gene maximum.
# The STABLE set is plotted only as a control for the VARIABLE set.
freq_ct_max(
    BOTTOM_CV,
    CT_COL_DICT,
    SAVEFIG=f'../figures/RNA_CV/CT_max_BOTTOM{N_TOP}.pdf',
    TITLE=f"Freq. of each CT{subscript_get('max')} in STABLE genes",
)
freq_ct_max(
    TOP_CV,
    CT_COL_DICT,
    SAVEFIG=f'../figures/RNA_CV/CT_max_TOP{N_TOP}.pdf',
    TITLE=f"Freq. of each CT{subscript_get('max')} in VARIABLE genes",
)

### Gene lists for TOP/BOTTOM for each CTmax (Term enrichment analysis)

In [None]:
# Output folders for the per-cell-type gene lists; {N_TOP} is filled in from the notebook variable.
! mkdir -p ../data/RNA_CV/BOTTOM{N_TOP}
! mkdir -p ../data/RNA_CV/TOP{N_TOP}

In [None]:
import pickle

# For each gene set, group the genes by the column holding their maximum value and
# write one plain-text list per column plus a pickled {column: genes} dictionary.
for name, df in (('BOTTOM', BOTTOM_CV), ('TOP', TOP_CV)):
    CT_MAX = df.idxmax(axis=1)
    DICT = {}
    for CT in RNA_PREFIXES:
        SELECTED = CT_MAX[CT_MAX == CT]
        print(CT, len(SELECTED))
        gene_list = SELECTED.index.to_list()
        DICT[CT] = gene_list
        pd.Series(gene_list).to_csv(f'../data/RNA_CV/{name}{N_TOP}/{CT}.list', index=False, header=False)
    with open(f'../data/RNA_CV/{name}{N_TOP}/dict.pkl', 'wb') as f:
        pickle.dump(DICT, f)
################################### ---> Term enrichment in ORA_CV.ipynb

# RNA DESeq2 output (FCs)

In [None]:
## ALL_FC
from itertools import combinations

# Every unordered pair of cell types, taken from CT_LIST in reverse order.
COMB = list(combinations(reversed(CT_LIST), 2))
COMB

In [None]:
# Load log2FoldChange, pvalue and padj for every cell-type pair in COMB and
# join them column-wise, one '<CT1>_<CT2>_' prefixed triplet per pair.
RNA_STATS = []
for PAIR in COMB:
    NAME = f'{PAIR[0]}_{PAIR[1]}'
    # The file on disk is named with the two cell types in the opposite order.
    # NOTE(review): columns are labelled NAME but read from the INV_NAME file —
    # presumably the file name order is the reverse of the contrast direction; confirm.
    INV_NAME = f'{PAIR[1]}_{PAIR[0]}'                                #correct one
    DIR = f"../../02_DESeq/Results/stats/{INV_NAME}.txt"
    TMP = pd.read_table(DIR,usecols=['log2FoldChange','pvalue','padj'])
    TMP = TMP.add_prefix(f'{NAME}_')
    RNA_STATS.append(TMP)
RNA_STATS = pd.concat(RNA_STATS,axis="columns")
## Filter
#RNA_STATS = RNA_STATS.dropna()
# Restrict to (and order as) the genes kept by the FPKM filter.
RNA_STATS = RNA_STATS.loc[RNA.index]
assert (RNA_STATS.index == RNA.index).all()
RNA_STATS.to_csv(f'../data/matrices/RNAFC.csv',index_label='GENE')

# Cell type Marker genes
#### Selection made by knowledge in literature and FC/p-val

In [None]:
# Short marker-gene panel per cell type.
MARKER_GENES = {
    'ESC': ['Nanog', 'Pou5f1', 'Sox2', 'Dppa5a'],
    'MES': ['Mesp1', 'T', 'Vrtn', 'Dll3'],
    'CP': ['Gata5', 'Tek', 'Sox18', 'Lyl1'],
    'CM': ['Actn2', 'Coro6', 'Myh6', 'Myh7'],
}

# Extended marker-gene panel per cell type.
MARKER_GENES_EXT = {
    'ESC': ['Nanog', 'Pou5f1', 'Sox2', 'L1td1', 'Dppa5a', 'Tdh', 'Esrrb', 'Lefty1', 'Zfp42', 'Sfn', 'Lncenc1', 'Utf1'],
    'MES': ['Mesp1', 'Mesp2', 'T', 'Vrtn', 'Dll3', 'Dll1', 'Evx1', 'Cxcr4', 'Pcdh8', 'Pcdh19', 'Robo3', 'Slit1'],
    'CP': ['Sfrp5', 'Gata5', 'Tek', 'Hbb-bh1', 'Hba-x', 'Pyy', 'Sox18', 'Lyl1', 'Rgs4', 'Igsf11', 'Tlx1', 'Ctse'],
    'CM': ['Nppa', 'Gipr', 'Actn2', 'Coro6', 'Col3a1', 'Bgn', 'Myh6', 'Myh7', 'Tnni3', 'Hspb7', 'Igfbp7', 'Ndrg2'],
}

# Run RNA_gene_CT for every cell type: first the short panel, then the extended one
# (whose outputs carry the 'EXT_' prefix).
for PANEL_PREFIX, PANEL in (('RNA', MARKER_GENES), ('EXT_RNA', MARKER_GENES_EXT)):
    for CT, GENE_LIST in PANEL.items():
        RNA_gene_CT(
            RNA,
            GENE_LIST=GENE_LIST,
            CT_LIST=CT_LIST,
            CT_COL_DICT=CT_COL_DICT,
            SAVE_PREFIX=f'{PANEL_PREFIX}_{CT}_GENES',
        )



# CHIP-seq

In [None]:
# Input files format (tab-separated, no header)
# CHR TSS-DISTANCE      TSS+DISTANCE    GENE            AVG     MAX     TOT
# chr7	45570176	45580176	0610005C13Rik	0.36	5.42	355.45

# Build one signal column per ChIP sample, indexed by the GENE field.
CHIP = []

for index, row in METADATA.iterrows():
    NAME = row['SAMPLE_ID']
    DIR = f"../../03_RecoverSignal/recoverChIPlevels_promoters_{DISTANCE}/{NAME}_recoverChIPlevels/PEAKsignal_{NAME}.bed"
    SHORT_NAME = row['SAMPLE_ID_SHORT']
    # Column names must follow TARGET_CELLTYPE_REP; later regex filters rely on it.
    assert SHORT_NAME == f"{row['TARGET']}_{row['CELL_TYPE']}_{row['REP']}"
    #
    # NOTE(review): usecols=[3,6] reads GENE and the 7th field, which the format
    # comment above calls TOT, although the variable is named AVG_TMP — confirm
    # which statistic is intended.
    AVG_TMP = pd.read_table(DIR,usecols=[3,6],header=None,names=['GENE',SHORT_NAME]).set_index('GENE')
    CHIP.append(AVG_TMP)
CHIP = pd.concat(CHIP,axis="columns")
#
#
# DIR still points at the last sample's file: its coordinates are used to check
# the window width and that the gene order matches the signal matrix.
COORD = pd.read_table(DIR,usecols=[1,2,3],header=None,names=['START','END','GENE']).set_index('GENE')
assert (COORD['END'] - COORD['START'] == DISTANCE*2).all()
assert (CHIP.index == COORD.index).all()
CHIP.to_csv(f'../data/matrices/ChIP_TSS{DISTANCE}_RAW.csv',index_label='GENE:PROMOTER')
# Turn GENE back into a regular column for the splitting step in the next cell.
CHIP.reset_index(inplace=True)


### Compute the average across alternative promoters for each gene

In [None]:
# Split the GENE column into GENE and ALT_PROM
CHIP[['GENE', 'ALT_PROM']] = CHIP['GENE'].str.split(':', expand=True)
CHIP.drop(columns=['ALT_PROM'], inplace=True)
# Group by the GENE column and calculate the mean for each group
CHIP = CHIP.groupby('GENE').mean()


### Keep the intersection between the RNA and ChIP filterings
- ChIP filtering -> blacklist regions, sncRNA ...
- RNA filtering -> avg FPKMs

In [None]:
# Gene counts of the two tables before intersecting them.
len_RNA, len_CHIP = len(RNA.index), len(CHIP.index)
# Genes present in both tables.
comm_index = CHIP.index.intersection(RNA.index, sort=True)
len_comm = len(comm_index)
print(f'# common: {len_comm}')

In [None]:
# Report how many RNA genes are dropped, then keep only the shared genes and save.
N_RNA_DROPPED = len_RNA - len_comm
print(f"# removed in RNA-seq: {N_RNA_DROPPED}/{len_RNA}")
RNA = RNA.loc[comm_index]
RNA.to_csv(f'../data/matrices/RNA_FPKM_TSS_{DISTANCE}_FILT.csv', index_label='GENE')


In [None]:
# Genes present in the ChIP table but missing from the filtered RNA table.
CHIP_ONLY = CHIP.index[~CHIP.index.isin(comm_index)]
N_CHIP_DROPPED = len_CHIP - len_comm
print(f'# removed in ChIP-seq: {N_CHIP_DROPPED}/{len_CHIP}')
# Keep the shared genes, in the same order as RNA, and save.
CHIP = CHIP.loc[comm_index]
assert (CHIP.index == RNA.index).all()
CHIP.to_csv(f'../data/matrices/ChIP_TSS{DISTANCE}_RAW_FILT.csv', index_label='GENE')


In [None]:
from matplotlib_venn import venn2, venn2_circles
# Venn diagram from three counts: RNA-only, ChIP-only and shared genes.
plt.figure(figsize=(4,4))
venn2(subsets=(len_RNA-len_comm, len_CHIP-len_comm, len_comm), set_labels=('RNA-seq\nLow count', 'ChIP-seq\n !encode_blacklist'))
# Same subset sizes again, drawn as circle outlines.
venn2_circles(subsets=(len_RNA-len_comm, len_CHIP-len_comm, len_comm))
plt.title("Genes overlap after 2 indipendent filterings");
plt.savefig('../figures/preprocessing/Venn_filtering.pdf', format="pdf", bbox_inches="tight")

### Check the intersection with bivalent genes in ESC (how many bivalent genes are we discarding?)

In [None]:
print(f'# removed (in ChIP-seq) because of RNA filtering: {len(CHIP_ONLY)}')
# Gene names from the 4th spreadsheet column (header 'gene'), as a set.
BIVALENT_ESC = set(
    pd.read_excel('../../00_RegionAnnotation/Gonzalez/bivalent_promoters.xlsx', usecols=[3])
    .set_index('gene')
    .index
)
CHIP_ONLY = set(CHIP_ONLY)
# Bivalent genes among those dropped by the RNA filter.
BIV_REM = CHIP_ONLY & BIVALENT_ESC

In [None]:
# Venn diagram of the bivalent gene set against the genes dropped by the RNA filter.
plt.figure(figsize=(7,7))
venn2(subsets=[BIVALENT_ESC,CHIP_ONLY], set_labels=('Bivalent in ES,\nGonzalez 2021', 'Removed because of RNA filtering'))
# Same two sets again, drawn as circle outlines.
venn2_circles(subsets=[BIVALENT_ESC,CHIP_ONLY])
plt.title("Genes overlap after filtering");
plt.savefig('../figures/preprocessing/Venn_bivalent.pdf', format="pdf", bbox_inches="tight")

In [None]:
# Report how many bivalent genes were dropped. A bare `len(...)` in the middle of
# a cell is evaluated and discarded, so print it explicitly.
print(f'# bivalent genes removed: {len(BIV_REM)}')
# Their FPKMs from the unfiltered RNA table, highest RNA_ESC_1 first.
RNA_BIV_REM = RNA_MAIN.loc[list(BIV_REM)]
RNA_BIV_REM.sort_values('RNA_ESC_1',ascending=0)

## QC before data transformation

### RNA-seq

In [None]:
# Distribution of the raw FPKMs, shifted by +1 and drawn with LOG_SCALE on.
violins(
    RNA + 1,
    COL_DICT=CT_COL_DICT,
    SAVEFIG='../figures/preprocessing/RNA_FPKM_boxplots.png',
    X_LAB='FPKMs+1',
    LOG_SCALE=1,
    TITLE='RAW',
)

In [None]:

# Regex naming the cell types, and one marker shape per entry.
CT_REG = r'(ESC|MES|CP|CM)'
MARKER_LIST = ['o', 'X', '^', '*']
# PCA on the raw FPKMs with samples as rows.
RNA_PCA(RNA.T, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, SAVE_PREFIX='RAW')

### ChIP-seq

In [None]:
# Regexes naming the ChIP targets (including WCE) and the cell types,
# plus one marker shape per entry of the four-item list.
HM_REG =r'(H3K4me3|H3K27ac|H3K27me3|WCE)'
CT_REG = r'(ESC|MES|CP|CM)'
MARKER_LIST=['o','X','^','*']


In [None]:
# PCA of the raw ChIP signal for the marks in HM_LIST2.
CHIP_PCA_HM(
    CHIP,
    HM_LIST=HM_LIST2,
    MARKER_LIST=MARKER_LIST,
    CT_REG=CT_REG,
    CT_COL_DICT=CT_COL_DICT,
    SAVE_PREFIX='CHIP_RAW',
)

In [None]:
# Raw ChIP signal (+PSEUDOCOUNT, LOG_SCALE on): distribution plot per column.
violins(CHIP+PSEUDOCOUNT, CT_COL_DICT,f'../figures/preprocessing/ChIP_RAW_boxplots_{DISTANCE}.png',TITLE='RAW',LOG_SCALE=1,X_LAB='x')
# Clustered sample-correlation map (Kendall correlation, average linkage).
corr_clustering(CHIP,HM_COL_DICT,CT_COL_DICT,SAVEFIG=f'../figures/preprocessing/ChIP_RAW_corr_clustermap_{DISTANCE}.pdf',
                TITLE=f'Samples correlation RAW signal',
                CORR_METHOD='kendall',LINK_METHOD='average')


# ChIP-seq: Divide HM signal over WCE 

HM_(ct)_(n) /    WCE_(ct)_1

In [None]:
# Divide every histone-mark column by the WCE replicate 1 of the same cell type:
#   (HM_(ct)_(n) + PSEUDOCOUNT) / (WCE_(ct)_1 + PSEUDOCOUNT)
CHIP = CHIP.filter(regex='^(?!.*WCE.*3)', axis=1) # drop WCE_3 replicates
# Largest WCE value overall, used as a common upper y-limit for the panels below.
max_max = CHIP.filter(regex='WCE').max().max()
#
PSEUDOCOUNT=1
# Histone-mark columns only (names not starting with WCE), shifted by the pseudocount.
CHIP_N = CHIP.copy().filter(regex=('^(?!WCE)'))                     + PSEUDOCOUNT

for i,CT in enumerate(CT_LIST):
    WCE_NAME = f'WCE_{CT}_1'
    # Columns are named TARGET_CELLTYPE_REP, so match the cell type as a whole
    # '_'-delimited field; a bare substring match could also hit other names.
    CT_COLS = CHIP_N.filter(regex=f'_{CT}_').columns
    print(CT,list(CT_COLS))
    WCE_COL = CHIP[WCE_NAME]                                        + PSEUDOCOUNT
    CHIP_N[CT_COLS] = CHIP_N[CT_COLS].div(WCE_COL, axis=0)
    # Report the smallest WCE value with and without zeros.
    not_0_min = CHIP[CHIP[WCE_NAME] != 0][WCE_NAME].min()
    min_val = CHIP[WCE_NAME].min()
    print(WCE_NAME)
    print(f'!0 Minimum = {not_0_min}')
    print(f'Minimum = {min_val}\n')
    # One panel per cell type; the panel count follows CT_LIST instead of a fixed 4.
    plt.subplot(1,len(CT_LIST),i+1)
    plt.title(WCE_NAME)
    sns.boxenplot(CHIP[WCE_NAME],color='grey').set_ylim(-0.05,max_max)
    plt.ylabel('')
    sns.despine(bottom=1,right=0)
# Lay out once, after all panels exist.
plt.tight_layout()

In [None]:
# Same three QC plots as for the raw signal, now on the HM/WCE ratios.
violins(CHIP_N, CT_COL_DICT,SAVEFIG=f'../figures/preprocessing/ChIP_RATIO_boxplots_{DISTANCE}.png',TITLE='HM/WCE',LOG_SCALE=1,X_LAB='x')

# Clustered sample-correlation map (Kendall correlation, average linkage).
corr_clustering(CHIP_N,HM_COL_DICT,CT_COL_DICT,SAVEFIG=f'../figures/preprocessing/ChIP_RATIO_corr_clustermap_{DISTANCE}.pdf',
                TITLE=f'Samples correlation HM/WCE signal',
                CORR_METHOD='kendall',LINK_METHOD='average')

# PCA for the marks in HM_LIST2.
CHIP_PCA_HM(CHIP_N, HM_LIST=HM_LIST2,MARKER_LIST=MARKER_LIST, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, 
                SAVE_PREFIX=f'CHIP_RATIO')

In [None]:
# Save the HM/WCE ratio matrix.
CHIP_N.to_csv(f'../data/matrices/ChIP_TSS{DISTANCE}_RATIO.csv',index_label='GENE')

# RNA-seq transformation (log and Z-score)

In [None]:
# Shift the FPKMs by a pseudocount, then build the transformed versions.
# PREPROCESS_DATA (from CustomObjects) returns a dict; the keys read below are
# 'original', 'log', 'log_StdScaler' and 'StdScaler'.
PSEUDOCOUNT_RNA = 1 
RNA_DICT = PREPROCESS_DATA(RNA+PSEUDOCOUNT_RNA)

In [None]:
SUBSET_KEYS = ['original', 'log', 'log_StdScaler', 'StdScaler']
# Distribution plot and PCA for each transformed version of the RNA matrix.
# NOTE(review): X_LAB is 'log(FPKMs+1)' for every key, including the non-log ones — confirm.
for key in SUBSET_KEYS:
    print(key)
    violins(
        RNA_DICT[key],
        COL_DICT=CT_COL_DICT,
        SAVEFIG=f'../figures/preprocessing/RNA_boxplots_{key}.png',
        X_LAB='log(FPKMs+1)',
        LOG_SCALE=0,
        TITLE=key,
    )
    # PCA with samples as rows.
    X = RNA_DICT[key].T
    RNA_PCA(X, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, SAVE_PREFIX=f'{key}')

## RNA-seq UMAP and t-SNE

In [None]:
# UMAP and t-SNE on the 'log_StdScaler' version of the RNA matrix.
RNA_UMAP(RNA_DICT['log_StdScaler'], CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT,SAVE_PREFIX='log_StdScaler')
RNA_tSNE(RNA_DICT['log_StdScaler'], CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT,SAVE_PREFIX='log_StdScaler')

# ChIP transformation (log and Z-score)

In [None]:
# log10 of the HM/WCE ratios followed by a per-column StandardScaler,
# keeping the gene index and the column labels of CHIP_N.
X = pd.DataFrame(
    StandardScaler(with_mean=True).fit_transform(np.log10(CHIP_N)),
    index=CHIP_N.index,
    columns=CHIP_N.columns,
)


In [None]:
# X is the log10 + StandardScaler version of the HM/WCE ratios, so name the outputs
# explicitly. Reusing `key` here would pick up whatever an earlier loop left behind
# (its last value is 'StdScaler', which mislabels this data) and fails with a
# NameError when that loop has not been run.
X_KEY = 'log_StdScaler'
violins(X,COL_DICT=CT_COL_DICT,SAVEFIG=f'../figures/preprocessing/CHIP_boxplots_{X_KEY}.png',X_LAB='x',LOG_SCALE=0,TITLE=X_KEY,SAT=1)
CHIP_PCA_HM(X, HM_LIST=HM_LIST2,MARKER_LIST=MARKER_LIST, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, 
        SAVE_PREFIX=f'CHIP_{X_KEY}')
# Clustered sample-correlation map (Kendall correlation, average linkage).
corr_clustering(X,HM_COL_DICT,CT_COL_DICT,SAVEFIG=f'../figures/preprocessing/ChIP_log_stdscaler_corr_clustermap_{DISTANCE}.pdf',
                TITLE=f'Samples correlation log_stdscaler',
                CORR_METHOD='kendall',LINK_METHOD='average')

In [None]:
# Build the transformed versions of the HM/WCE ratios with the same helper used for RNA,
# then show which keys it returned.
CHIP_DICT = PREPROCESS_DATA(CHIP_N)
CHIP_DICT.keys()

In [None]:
# Keys of the transformed matrices plotted in the next cell.
SUBSET_KEYS=['original', 'log','log_StdScaler','StdScaler']


In [None]:
# Plot settings without WCE: three targets, hence three marker shapes.
HM_REG = r'(H3K4me3|H3K27ac|H3K27me3)'
CT_REG = r'(ESC|MES|CP|CM)'
MARKER_LIST = ['o', 'X', '^']

# Distribution plot, PCA and correlation clustermap for each transformed ChIP matrix.
# NOTE(review): X_LAB is 'Z-score(log(x))' for every key, including the non-scaled ones — confirm.
for key in SUBSET_KEYS:
    print(key)
    CHIP_KEY = CHIP_DICT[key]
    violins(
        CHIP_KEY,
        COL_DICT=CT_COL_DICT,
        SAVEFIG=f'../figures/preprocessing/CHIP_boxplots_{key}.png',
        X_LAB='Z-score(log(x))',
        LOG_SCALE=0,
        TITLE=key,
        SAT=1,
    )
    CHIP_PCA_HM(
        CHIP_KEY,
        HM_LIST=HM_LIST2,
        MARKER_LIST=MARKER_LIST,
        CT_REG=CT_REG,
        CT_COL_DICT=CT_COL_DICT,
        SAVE_PREFIX=f'CHIP_{key}',
    )
    corr_clustering(
        CHIP_KEY,
        HM_COL_DICT,
        CT_COL_DICT,
        SAVEFIG=f'../figures/preprocessing/CHIP_{key}_corr_clustermap_{DISTANCE}.pdf',
        TITLE=f'Samples correlation {key}',
        CORR_METHOD='kendall',
        LINK_METHOD='average',
    )

## ChIP-seq UMAP and t-SNE

In [None]:
# UMAP and t-SNE on the 'log_StdScaler' version of the ChIP ratios.
CHIP_UMAP(CHIP_DICT['log_StdScaler'], HM_REG=HM_REG,MARKER_LIST=MARKER_LIST, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, 
        SAVE_PREFIX='log_StdScaler')
CHIP_tSNE(CHIP_DICT['log_StdScaler'], HM_REG=HM_REG,MARKER_LIST=MARKER_LIST, CT_REG=CT_REG, CT_COL_DICT=CT_COL_DICT, 
        SAVE_PREFIX='log_StdScaler')

# ALL (genes)

Genes->     ALL / DE 

Features->  X / FC / X_FC

## ALL_X

In [None]:
DATA_DIR=f'../data/matrices/ALL/'
! mkdir -p {DATA_DIR}
RNA_log = RNA_DICT['log'].copy()
CHIP_log = CHIP_DICT['log'].copy()
assert (RNA_log.index == CHIP_log.index).all()
ALL_X = pd.concat((RNA_log, CHIP_log), axis=1)
#violins(ALL_X,COL_DICT=HM_COL_DICT,SAVEFIG='./figures/preprocessing/ALL_X.pdf',X_LAB='log(x)',TITLE='ALL_X',SAT=0.75)
ALL_X.to_csv(f'{DATA_DIR}ALL_X.csv', index_label='GENE')
# Z-score
ALL_X_z = ALL_X.copy()
ALL_X_z.iloc[:,:] = StandardScaler().fit_transform(ALL_X_z.iloc[:,:])
#violins(ALL_X_z,COL_DICT=HM_COL_DICT,SAVEFIG='./figures/preprocessing/ALL_X_z.pdf',X_LAB='Z-score(log(x))',TITLE='ALL_X_z',SAT=0.75)
ALL_X_z.to_csv(f'{DATA_DIR}ALL_X_z.csv', index_label='GENE')


## ALL_FC

### RNA FC

In [None]:
# Fold-change columns only, prefixed with 'RNA_', 'log2FoldChange' shortened to 'FC',
# restricted to the genes of ALL_X.
RNA_FC = (
    RNA_STATS.filter(regex='FoldChange')
    .add_prefix('RNA_')
    .rename(columns=lambda col: col.replace('log2FoldChange', 'FC'))
    .loc[ALL_X.index]  # filter
)
# Number of genes with at least one missing fold change, before replacing them with 0.
print(RNA_FC.isna().any(axis=1).sum())
RNA_FC = RNA_FC.fillna(0)
RNA_FC.isna().sum().sum()

In [None]:

#
assert (RNA_FC.index == ALL_X.index).all()
# Scaled copy of the fold changes. with_mean=False scales each column without
# centring it, so a fold change of 0 stays 0.
RNA_FC_z = RNA_FC.copy()
RNA_FC_z.iloc[:,:] = StandardScaler(with_mean=False).fit_transform(RNA_FC_z.iloc[:,:])
#violins(RNA_FC_STD, COL_DICT=CT_COL_DICT,SAVEFIG='./figures/preprocessing/RNA_FC_STD_boxplots.pdf',X_LAB='log2FC')

### CHIP FC

In [None]:
# One 'MARK_CELLTYPE' prefix per histone mark and cell type (marks vary slowest).
PREFIXES = [HM + '_' + CT for HM in HM_LIST2 for CT in CT_LIST]
#
# Per-prefix averages of the HM/WCE ratios (same helper as for the RNA replicates).
CHIP_AVG = calculate_mean_features(CHIP_N, PREFIXES)

In [None]:
# For every histone mark and every cell-type pair in COMB, add a column
# '<HM>_<CT1>_<CT2>_FC' holding the ratio CT1 / CT2 of the averaged signal.
CHIP_FC = CHIP_AVG.copy()
for HM in HM_LIST2:
    TMP = CHIP_AVG.filter(regex=HM)
    print(TMP.columns)
    for PAIRS in COMB:
        print(PAIRS)
        CT1, CT2 = PAIRS
        CHIP_FC[f'{HM}_{CT1}_{CT2}_FC'] = TMP[f'{HM}_{CT1}'] / TMP[f'{HM}_{CT2}']
# Keep only the new ratio columns and turn them into log2 fold changes.
CHIP_FC = CHIP_FC.filter(regex='FC')
CHIP_FC.iloc[:,:] = np.log2(CHIP_FC.iloc[:,:])
# Scaled copy; with_mean=False scales each column without centring it.
CHIP_FC_z = CHIP_FC.copy()
CHIP_FC_z.iloc[:,:] = StandardScaler(with_mean=False).fit_transform(CHIP_FC_z.iloc[:,:])


In [None]:
# All three tables must list the same genes in the same order.
assert (ALL_X.index == RNA_FC.index).all() and (ALL_X.index == CHIP_FC.index).all()
# ALL_FC currently holds the RNA fold changes only; the RNA+ChIP concatenation is disabled.
#ALL_FC = pd.concat((RNA_FC, CHIP_FC), axis=1)
ALL_FC = RNA_FC.copy()
ALL_FC.to_csv(f'{DATA_DIR}ALL_FC.csv', index_label='GENE')
#violins(ALL_FC,COL_DICT=HM_COL_DICT,SAVEFIG='./figures/preprocessing/ALL_FC.pdf',X_LAB='log(x)',TITLE='ALL_FC')

In [None]:
# Scaled counterpart of ALL_FC: same gene-order check, RNA fold changes only.
assert (ALL_X_z.index == RNA_FC_z.index).all() and (ALL_X_z.index == CHIP_FC_z.index).all()
#ALL_FC_z = pd.concat((RNA_FC_z, CHIP_FC_z), axis=1)
ALL_FC_z = RNA_FC_z.copy()
ALL_FC_z.to_csv(f'{DATA_DIR}ALL_FC_z.csv', index_label='GENE')
#violins(ALL_FC_z,COL_DICT=HM_COL_DICT,SAVEFIG='./figures/preprocessing/ALL_FC_z.png',X_LAB='Z-score(log(x))',TITLE='ALL_FC_z')

## ALL_X_FC

In [None]:
# Signal features (ALL_X) and fold-change features (ALL_FC) side by side, saved and plotted.
assert (ALL_X.index == ALL_FC.index).all() 
ALL_X_FC = pd.concat((ALL_X,ALL_FC),axis=1)
ALL_X_FC.to_csv(f'{DATA_DIR}ALL_X_FC.csv', index_label='GENE')
violins(ALL_X_FC,COL_DICT=HM_COL_DICT,SAVEFIG='../figures/preprocessing/ALL_X_FC.png',X_LAB='log(x)',TITLE='ALL_X_FC')

In [None]:
# Scaled counterpart: ALL_X_z and ALL_FC_z side by side, saved and plotted.
assert (ALL_X_z.index == ALL_FC_z.index).all() 
ALL_X_FC_z = pd.concat((ALL_X_z,ALL_FC_z),axis=1)
ALL_X_FC_z.to_csv(f'{DATA_DIR}ALL_X_FC_z.csv', index_label='GENE')
violins(ALL_X_FC_z,COL_DICT=HM_COL_DICT,SAVEFIG='../figures/preprocessing/ALL_X_FC_z.png',X_LAB='Z-score(log(x))',TITLE='ALL_X_FC_z')

# DE_ and DE_FC Datasets

### DE_RNA_FC

In [None]:
# From here on DATA_DIR points at the differentially-expressed (DE) output folder.
DATA_DIR=f'../data/matrices/DE/'
! mkdir -p {DATA_DIR}
# Fold-change and p-value columns, with 'log2FoldChange' shortened to 'FC'.
DE_RNA_FC = RNA_STATS.filter(regex='FoldChange|pvalue')
new_column_names = {col: col.replace('log2FoldChange', 'FC') for col in DE_RNA_FC.columns}
DE_RNA_FC = DE_RNA_FC.rename(columns=new_column_names)
DE_RNA_FC = DE_RNA_FC.loc[ALL_X.index] #filter out ChIP high signal
#
# A gene is kept when, in at least one pair, pvalue < 0.05 and |FC| > 1.
# NOTE(review): the test uses the raw 'pvalue' column; 'padj' is loaded into
# RNA_STATS but never used — confirm this is intended.
FILTER = np.zeros(DE_RNA_FC.shape[0], dtype=bool)
for comb in COMB:
    comb = f'{comb[0]}_{comb[1]}'
    FILTER2 = ((DE_RNA_FC[f'{comb}_pvalue'] < 0.05) & (DE_RNA_FC[f'{comb}_FC'].abs() > 1))
    print(f'{comb}: {FILTER2.sum()}')
    FILTER = FILTER | FILTER2
print(FILTER.sum())
# Keep the selected genes, only their FC columns, prefixed with 'RNA_'.
DE_RNA_FC = DE_RNA_FC[FILTER].filter(regex='FC').add_prefix('RNA_')      
print(f'NaN -> 0: #{DE_RNA_FC.isna().sum().sum()}')   
DE_RNA_FC = DE_RNA_FC.fillna(0)              

### DE_ChIP_FC

In [None]:
# ChIP fold changes for the DE genes; only used for the index check below.
DE_CHIP_FC = CHIP_FC.loc[DE_RNA_FC.index]
assert (DE_CHIP_FC.index == DE_RNA_FC.index).all()
# DE_FC holds the RNA fold changes only; the RNA+ChIP concatenation is disabled.
#DE_FC = pd.concat((DE_RNA_FC, DE_CHIP_FC), axis=1)
DE_FC = DE_RNA_FC.copy()
DE_FC.to_csv(f'{DATA_DIR}DE_FC.csv', index_label='GENE')
# Per-column StandardScaler, labels preserved.
DE_FC_z = pd.DataFrame(
    StandardScaler().fit_transform(DE_FC),
    index=DE_FC.index,
    columns=DE_FC.columns,
)
DE_FC_z.to_csv(f'{DATA_DIR}DE_FC_z.csv', index_label='GENE')

### DE_X and DE_X_z

In [None]:
# Signal features restricted to the DE genes.
DE_X = ALL_X.loc[DE_FC.index]
DE_X.to_csv(f'{DATA_DIR}DE_X.csv', index_label='GENE')
# Per-column StandardScaler fitted on the DE genes only, labels preserved.
DE_X_z = pd.DataFrame(
    StandardScaler().fit_transform(DE_X),
    index=DE_X.index,
    columns=DE_X.columns,
)
DE_X_z.to_csv(f'{DATA_DIR}DE_X_z.csv', index_label='GENE')

### DE_X_FC and DE_X_FC_z

In [None]:
# Signal and fold-change features of the DE genes side by side: plain version first...
assert (DE_X.index == DE_FC.index).all()
DE_X_FC = pd.concat((DE_X, DE_FC), axis=1)
DE_X_FC.to_csv(f'{DATA_DIR}DE_X_FC.csv', index_label='GENE')
#violins(DE_X_FC,COL_DICT=HM_COL_DICT,SAVEFIG='./figures/preprocessing/DE_X_FC.pdf',X_LAB='log(x)',TITLE='DE_X_FC')
# ...then the scaled version, which is also plotted.
assert (DE_X_z.index == DE_FC_z.index).all()
DE_X_FC_z = pd.concat((DE_X_z, DE_FC_z), axis=1)
DE_X_FC_z.to_csv(f'{DATA_DIR}DE_X_FC_z.csv', index_label='GENE')
violins(DE_X_FC_z,COL_DICT=HM_COL_DICT,SAVEFIG='../figures/preprocessing/DE_X_FC_z.png',X_LAB='Z-score(log(x))',TITLE='DE_X_FC_z')