In [None]:
#Trying differential compartment analysis https://github.com/ay-lab/dcHiC

In [None]:
import pandas as pd
import bioframe as bf
from scipy import stats
import biomart

In [None]:
# Sample keys and their full library names, assembled from name fragments.
# Each short key is <genotype>_<treatment>_<replicate>; the long name swaps in
# the longer fragment used in the data file names.
_GENOTYPE_TAGS = {'WT': 'WT', 'BKO': 'BKO98'}
_TREATMENT_TAGS = {'Ctrl': 'Ctrl', 'ATRA': 'ATRA-5days'}
_REPLICATE_TAGS = {'R1': '4-51-R1', 'R2': '4-52-R2'}

# Ordered replicate-major: all R1 samples first, then all R2 samples.
conditions = [
    f'{genotype}_{treatment}_{replicate}'
    for replicate in _REPLICATE_TAGS
    for genotype in _GENOTYPE_TAGS
    for treatment in _TREATMENT_TAGS
]

# short sample key -> library name used as the file-name stem under data/
long_names = {
    f'{genotype}_{treatment}_{replicate}':
        f'CA-HiC-Dpn-SH-SY5Y-{genotype_tag}-{treatment_tag}-{replicate_tag}-T1'
    for replicate, replicate_tag in _REPLICATE_TAGS.items()
    for genotype, genotype_tag in _GENOTYPE_TAGS.items()
    for treatment, treatment_tag in _TREATMENT_TAGS.items()
}

In [None]:
# Plot styling per sample: colour encodes the treatment group, line style the replicate.
_GROUP_COLORS = {
    'WT_Ctrl': '#a6cee3',
    'WT_ATRA': '#1f78b4',
    'BKO_Ctrl': '#b2df8a',
    'BKO_ATRA': '#33a02c',
}

# Every replicate key (R1, R2, and the R1R2 key) shares its group's colour.
sampleColors = {
    f'{group}_{suffix}': color
    for suffix in ('R1', 'R2', 'R1R2')
    for group, color in _GROUP_COLORS.items()
}

# R1 samples are dashed, R2 samples dotted.
sampleLineStyles = {
    f'{group}_{replicate}': line_style
    for replicate, line_style in (('R1', '--'), ('R2', ':'))
    for group in _GROUP_COLORS
}

In [None]:
# Parallel lists: SepCtrlConds[i] is the reference sample and SepTreatConds[i]
# the sample compared against it, within a single replicate.
_CTRL_VS_TREAT_GROUPS = [
    ('WT_Ctrl', 'WT_ATRA'),
    ('WT_Ctrl', 'BKO_Ctrl'),
    ('WT_ATRA', 'BKO_ATRA'),
    ('BKO_Ctrl', 'BKO_ATRA'),
]

SepCtrlConds = [
    f'{ctrl_group}_{replicate}'
    for replicate in ('R1', 'R2')
    for ctrl_group, _ in _CTRL_VS_TREAT_GROUPS
]

SepTreatConds = [
    f'{treat_group}_{replicate}'
    for replicate in ('R1', 'R2')
    for _, treat_group in _CTRL_VS_TREAT_GROUPS
]

In [None]:
# Replicate-level sample key -> treatment group (the key without its _R1/_R2 suffix).
Treatment_Dict = {
    f'{group}_{replicate}': group
    for replicate in ('R1', 'R2')
    for group in ('WT_Ctrl', 'WT_ATRA', 'BKO_Ctrl', 'BKO_ATRA')
}

In [None]:
# Pairwise comparisons as (first group, second group) tuples; the tuple itself
# is used as a dictionary key by later cells.
comparison_pairs = [
    tuple(label.split(' vs '))
    for label in (
        'WT_ATRA vs WT_Ctrl',
        'BKO_ATRA vs BKO_Ctrl',
        'BKO_Ctrl vs WT_Ctrl',
        'BKO_ATRA vs WT_ATRA',
    )
]

# Treatment group -> its two replicate sample keys.
rep_dict = {
    group: [f'{group}_R1', f'{group}_R2']
    for group in ('WT_Ctrl', 'WT_ATRA', 'BKO_Ctrl', 'BKO_ATRA')
}

In [None]:
# Root directory for inputs and outputs, relative to the notebook.
outDataDir = '..'
# TSV with chromosome name and size in bp; sits directly under the root.
chromsizes = f'{outDataDir}/hg38_chromsizes_trimmed.txt'

In [None]:
# Bin size (bp) used for the whole analysis: 250 kb.
binsize = 250000

# Sample key -> path of its multi-resolution cooler under data/.
clr_paths = {
    cond: f'{outDataDir}/data/{long_names[cond]}.sampled.hg38.mapq_30.1000.mcool'
    for cond in conditions
}

In [None]:
#First need to convert coolers into hicpro files for this analysis
# For every sample, dcHiC's preprocess.py is run on the cooler at `binsize`
# resolution. Later cells read `<prefix>_<binsize>.matrix` and
# `<prefix>_<binsize>_abs.bed` from the same data/ directory.
# NOTE(review): the script path here is relative (./bin/git/dcHiC/...), while
# later cells call dcHiC through absolute /home/... paths — confirm which
# install location is correct.
for cond in conditions:
    coolfile = clr_paths[cond]
    # Output prefix: the sample's long name under data/.
    prefix_name = f'{outDataDir}/data/{long_names[cond]}'
    # IPython shell magic; $name is interpolated from the Python namespace.
    !python ./bin/git/dcHiC/utility/preprocess.py -input cool -file $coolfile -genomeFile $chromsizes -res $binsize -prefix $prefix_name


In [None]:
# Write one dcHiC input table per pairwise comparison (rather than one table
# holding every sample). Each table is a headerless TSV with one row per
# replicate: matrix path, bed path, replicate key, treatment group.

# comparison tuple -> path of the input table written for it
dchic_input_files_pairwise = {}
dchic_input_columns = ['Matrix', 'Bed', 'Rep_Prefix', 'Treatment_Prefix']

for pair in comparison_pairs:
    # Replicates of the first group, followed by replicates of the second.
    pair_conds = rep_dict[pair[0]] + rep_dict[pair[1]]
    # Build the whole table in one go instead of concatenating row by row onto
    # an empty frame (quadratic, and pandas is deprecating how empty entries
    # are handled in concat).
    pair_input_table = pd.DataFrame(
        [
            {
                'Matrix': f'{outDataDir}/data/{long_names[cond]}_{binsize}.matrix',
                'Bed': f'{outDataDir}/data/{long_names[cond]}_{binsize}_abs.bed',
                'Rep_Prefix': cond,
                'Treatment_Prefix': Treatment_Dict[cond],
            }
            for cond in pair_conds
        ],
        columns=dchic_input_columns,
    )
    dchic_input_files_pairwise[pair] = f'{outDataDir}/data/dchic_input_file_{pair[0]}vs{pair[1]}.txt'
    pair_input_table.to_csv(dchic_input_files_pairwise[pair], sep='\t', header=False, index=False)

In [None]:
# Show the comparison -> input-table path mapping that was just written.
dchic_input_files_pairwise

In [None]:
# dcHiC's own PCs tracked chromosome arms rather than compartments, so the E1
# eigenvector tracks from cooltools are used instead.
# NOTE(review): the "_FillNA" output name suggests NA values are replaced with 0,
# but the code drops every row containing an NA — confirm which behaviour
# dcHiC needs.
eig_track_suffix = '.250kb.mapq30.byarm.eigs.cis.vecs.E1.bedGraph'

for cond in conditions:
    sample_stem = f'{outDataDir}/data/{long_names[cond]}'
    # Headerless bedGraph in, headerless bedGraph out, minus the NA rows.
    (
        pd.read_csv(f'{sample_stem}{eig_track_suffix}', header=None, sep='\t')
        .dropna()
        .to_csv(f'{sample_stem}_FillNA{eig_track_suffix}', sep='\t', header=False, index=False)
    )

In [None]:
#make input file for cooltools eigens
# One row per sample: path of the NA-filtered E1 bedGraph, eigenvector type,
# replicate key, treatment group.
# The table is built in a single constructor call: DataFrame.append was removed
# in pandas 2.0, so the previous append-in-a-loop form raises AttributeError there.
input_cooltools_eigens = pd.DataFrame(
    [
        {
            'EigPath': f'{outDataDir}/data/{long_names[cond]}_FillNA.250kb.mapq30.byarm.eigs.cis.vecs.E1.bedGraph',
            'EigType': 'intra',
            'Rep_Prefix': cond,
            'Treatment_Prefix': Treatment_Dict[cond],
        }
        for cond in conditions
    ],
    columns=['EigPath', 'EigType', 'Rep_Prefix', 'Treatment_Prefix'],
)

In [None]:
# Write the eigenvector table as a headerless, index-free TSV.
input_cooltools_eigens.to_csv(f'{outDataDir}/data/input_cooltools_eigens.txt', sep = '\t', header = False, index = False)

In [None]:
# Path of the eigenvector table; must stay in sync with the path the table was
# written to (the literal is duplicated in the to_csv call).
input_cooltools_eigens_filename = f'{outDataDir}/data/input_cooltools_eigens.txt'

In [None]:
# Show the eigenvector input table (one row per sample).
input_cooltools_eigens

In [None]:
# Hand the cooltools eigenvector table to dcHiC's getcHiCinputfromExistingPCs.r.
# NOTE(review): hard-coded, user-specific absolute path (/home/eh37w/...); the
# other dcHiC calls in this notebook use /home/bin/... and ./bin/... — confirm
# and unify the install location.
!Rscript /home/eh37w/bin/git/dcHiC/utility/getcHiCinputfromExistingPCs.r --input $input_cooltools_eigens_filename 

In [None]:
#Analyze eigens (from cooltools) for differential regions between pairs of samples
# Runs dchicf.r with `--pcatype analyze` once per comparison, using that
# comparison's input table and a per-comparison --diffdir name. Later cells read
# the results from DifferentialResult/<diffdir>.
# NOTE(review): the --file argument is the Matrix/Bed input table, not the
# cooltools eigenvector table — presumably the imported PCs are picked up from
# disk; confirm.

for pair in comparison_pairs:
    dchic_input_file = dchic_input_files_pairwise[pair]
    diff_dir_name = f'{pair[0]}vs{pair[1]}_dcHiC'
    # IPython shell magic; $name is interpolated from the Python namespace.
    !Rscript /home/bin/git/dcHiC/dchicf.r --pcatype analyze --dirovwt T --genome hg38 --file $dchic_input_file --diffdir $diff_dir_name


In [None]:
#call subcompartments using dchic for each pair
# Runs dchicf.r with `--pcatype subcomp` once per comparison, reusing the same
# input table and --diffdir name as the analyze step.

for pair in comparison_pairs:
    dchic_input_file = dchic_input_files_pairwise[pair]
    diff_dir_name = f'{pair[0]}vs{pair[1]}_dcHiC'    
    # IPython shell magic; $name is interpolated from the Python namespace.
    !Rscript /home/bin/git/dcHiC/dchicf.r --file $dchic_input_file --pcatype subcomp --dirovwt T --diffdir $diff_dir_name
    

In [None]:
#GSEA for pairwise comparisons
# Runs dchicf.r with `--pcatype enrich` once per comparison; the two group
# names are passed as `--cells <first>,<second>`. Later cells read gene lists
# and differential-A bedGraphs from the geneEnrichment/ directory of each
# comparison.
for pair in comparison_pairs:
    dchic_input_file = dchic_input_files_pairwise[pair]
    diff_dir_name = f'{pair[0]}vs{pair[1]}_dcHiC'  
    # Separate names so each can be interpolated into the shell command.
    cells1 = pair[0]
    cells2 = pair[1]
    !Rscript /home/bin/git/dcHiC/dchicf.r --file $dchic_input_file --pcatype enrich --genome hg38 --diffdir $diff_dir_name --region anchor --exclA F --interaction intra --pcscore T --compare T --cells $cells1,$cells2

In [None]:
# Runs dchicf.r with `--pcatype viz` once per comparison, reusing the same
# input table and --diffdir name as the earlier steps.
for pair in comparison_pairs:
    dchic_input_file = dchic_input_files_pairwise[pair]
    diff_dir_name = f'{pair[0]}vs{pair[1]}_dcHiC'  
    # IPython shell magic; $name is interpolated from the Python namespace.
    !Rscript /home/bin/git/dcHiC/dchicf.r --file $dchic_input_file --pcatype viz --diffdir $diff_dir_name --genome hg38 

In [None]:
#For each pairwise comparison - what fraction of the genome has differential compartments?

In [None]:
# Bin size (bp) of the eigenvector tracks; converts row counts into base pairs.
eig_binsize = 250000

# For each comparison, count rows in the filtered (differential) and unfiltered
# dcHiC result tables and report their ratio.
# assumes each row of the result tables is one bin of eig_binsize bp — TODO confirm
diff_frac_records = []
for pair in comparison_pairs:
    diff_dir_name = f'DifferentialResult/{pair[0]}vs{pair[1]}_dcHiC'
    diff_result_filtered = pd.read_csv(f'{diff_dir_name}/fdr_result/differential.intra_sample_group.Filtered.pcQnm.bedGraph', sep='\t')
    diff_result_unfiltered = pd.read_csv(f'{diff_dir_name}/fdr_result/differential.intra_sample_group.pcQnm.bedGraph', sep='\t')

    n_diff_bins = len(diff_result_filtered['start'])
    n_total_bins = len(diff_result_unfiltered['start'])

    diff_frac_records.append({
        # Newlines keep the x tick labels narrow in the bar plot.
        'comparison': f'{pair[0]} \nvs \n{pair[1]}',
        'cond_1': pair[0],
        'cond_2': pair[1],
        'diff_frac': n_diff_bins / n_total_bins,
        # Use eig_binsize rather than a repeated literal so the bin size is
        # defined in exactly one place.
        'diff_size': n_diff_bins * eig_binsize,
        'total_size': n_total_bins * eig_binsize,
    })

# Build the frame once; growing it with concat onto an empty frame is quadratic
# and leaves the numeric columns as object dtype.
diff_frac_df = pd.DataFrame(
    diff_frac_records,
    columns=['comparison', 'cond_1', 'cond_2', 'diff_frac', 'diff_size', 'total_size'],
)

In [None]:
# Save the per-comparison differential-compartment fractions as a TSV with header.
diff_frac_df.to_csv(f'{outDataDir}/DifferentialCompartmentFractions_250kb.txt', sep = '\t', index = False)

In [None]:
# Show the differential-compartment fraction table (one row per comparison).
diff_frac_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Bar plot of the fraction of bins called differential, one bar per comparison.
sns.set_style("ticks")
sns.set_context("paper")

# One colour per comparison bar.
cmap_bar = sns.color_palette(['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c'])

# Explicit figure/axes instead of the pyplot state machine. The previous legend
# call was dropped: the bar plot has no `hue`, so its handle list is empty and
# slicing handles[4:8] always produced an empty legend. The unused GridSpec
# object was dropped as well.
fig, ax1 = plt.subplots(figsize=(5, 3))
sns.barplot(data=diff_frac_df, x='comparison', y='diff_frac', palette=cmap_bar, ax=ax1)

ax1.set_title('Differential Compartment Fraction')
ax1.set_ylabel('Fraction Differential')
ax1.set_xlabel('Comparison')

fig.savefig(f'{outDataDir}/figures/R1R2_HiC_diff_comp_fraction_250kb.png', dpi=300, bbox_inches="tight")

In [None]:
#For each pairwise comparison - how many genes are in differential compartments?
#Differential A compartments (higher A score than other sample) for each

In [None]:
# Reminder of the (first group, second group) comparisons analysed below.
comparison_pairs

In [None]:
# Count the genes dcHiC lists for each side of every comparison. Each count is
# the number of rows in that group's headerless geneList.anchor.txt file.
diff_gene_records = []
for pair in comparison_pairs:
    diff_dir_name = f'DifferentialResult/{pair[0]}vs{pair[1]}_dcHiC'
    enrichment_dir = f'{diff_dir_name}/geneEnrichment/comparison_{pair[0]}_vs_{pair[1]}'
    cond1_diff_genes = pd.read_csv(f'{enrichment_dir}/{pair[0]}_geneEnrichment/{pair[0]}_geneList.anchor.txt', header=None)
    cond2_diff_genes = pd.read_csv(f'{enrichment_dir}/{pair[1]}_geneEnrichment/{pair[1]}_geneList.anchor.txt', header=None)
    diff_gene_records.append({
        'cond_1': pair[0],
        'cond_2': pair[1],
        'cond_1_diff_A_genes': len(cond1_diff_genes),
        'cond_2_diff_A_genes': len(cond2_diff_genes),
    })

# Build the frame once so the count columns are integers, rather than growing
# it with concat onto an empty frame (which leaves them as object dtype).
diff_gene_df = pd.DataFrame(
    diff_gene_records,
    columns=['cond_1', 'cond_2', 'cond_1_diff_A_genes', 'cond_2_diff_A_genes'],
)

In [None]:
# Show the per-comparison gene counts.
diff_gene_df

In [None]:
# Save the per-comparison gene counts as a TSV with header.
diff_gene_df.to_csv(f'{outDataDir}/Genes_Differential_Comps_Count.txt', sep = '\t', index = False)

In [None]:
# Is there concordance/discordance between differential compartment locations
# across comparisons? (WT vs Top2BKO ATRA differentiation)
# Load, for every comparison and each of its two groups, the differential-A
# compartment intervals written by the dcHiC enrich step.
# Layout: diff_A_comps['<first>vs<second>']['<group>'] -> DataFrame(chrom, start, end)
diff_A_comps = {}
for pair in comparison_pairs:
    comparison_key = f'{pair[0]}vs{pair[1]}'
    enrichment_dir = f'DifferentialResult/{comparison_key}_dcHiC/geneEnrichment/comparison_{pair[0]}_vs_{pair[1]}'
    diff_A_comps[comparison_key] = {}
    for sample_group in pair:
        regions = pd.read_csv(
            f'{enrichment_dir}/{sample_group}_geneEnrichment/{sample_group}_Diff_A_compartments.bedGraph',
            sep='\t',
            header=None,
        )
        # assumes the file has exactly three columns — the assignment raises otherwise
        regions.columns = ['chrom', 'start', 'end']
        diff_A_comps[comparison_key][f'{sample_group}'] = regions

In [None]:
# What is different in the ATRA vs Ctrl comparison between WT and Top2BKO?
# The differential-A intervals of the WT comparison are outer-joined with those
# of the BKO comparison, separately for the ATRA side and the Ctrl side, and
# then split into "Both", "WT only" and "BKO only".
# NOTE(review): "Both" requires identical chrom/start/end on the two sides, so
# a WT/BKO pair that overlaps with different coordinates lands in no category —
# fine if every interval is a fixed-size bin; confirm.


def _identical_intervals(olap):
    """Rows of an outer overlap whose WT and BKO intervals have the same coordinates."""
    return olap[
        (olap['chrom_WT'] == olap['chrom_BKO']) &
        (olap['start_WT'] == olap['start_BKO']) &
        (olap['end_WT'] == olap['end_BKO'])
    ]


def _unmatched_intervals(olap, side):
    """Intervals on `side` ('WT' or 'BKO') taken from rows whose chrom values differ."""
    unmatched = olap[(olap['chrom_WT'] != olap['chrom_BKO'])]
    # dropna removes rows where this side is the missing half of the join.
    return unmatched[[f'chrom_{side}', f'start_{side}', f'end_{side}']].dropna()


wt_A_ATRA = diff_A_comps['WT_ATRAvsWT_Ctrl']['WT_ATRA']
wt_A_Ctrl = diff_A_comps['WT_ATRAvsWT_Ctrl']['WT_Ctrl']
top2bko_A_ATRA = diff_A_comps['BKO_ATRAvsBKO_Ctrl']['BKO_ATRA']
top2bko_A_Ctrl = diff_A_comps['BKO_ATRAvsBKO_Ctrl']['BKO_Ctrl']

olap_ATRA = bf.overlap(wt_A_ATRA, top2bko_A_ATRA, how='outer', suffixes=('_WT','_BKO'))
olap_Ctrl = bf.overlap(wt_A_Ctrl, top2bko_A_Ctrl, how='outer', suffixes=('_WT','_BKO'))

_overlap_by_treatment = {'ATRA': olap_ATRA, 'Ctrl': olap_Ctrl}

# Key order is kept as Both (ATRA, Ctrl), WT_Only (ATRA, Ctrl), BKO_Only
# (ATRA, Ctrl), because the saved table follows this order.
wt_vs_top2b_diff_Acomps = {}
for treatment, treatment_overlap in _overlap_by_treatment.items():
    wt_vs_top2b_diff_Acomps[f'{treatment}_diffA_Both'] = _identical_intervals(treatment_overlap)
for side in ('WT', 'BKO'):
    for treatment, treatment_overlap in _overlap_by_treatment.items():
        wt_vs_top2b_diff_Acomps[f'{treatment}_diffA_{side}_Only'] = _unmatched_intervals(treatment_overlap, side)

In [None]:
# Show the six category tables (Both / WT only / BKO only, for ATRA and Ctrl).
wt_vs_top2b_diff_Acomps

In [None]:
# Stack the category tables into one frame for saving, tagging every row with
# the name of the category it came from.
# `.assign` returns a copy, so the filtered slices stored in
# wt_vs_top2b_diff_Acomps are not modified in place (no SettingWithCopyWarning,
# and the cell can be re-run safely). All frames are concatenated in one call
# instead of one at a time.
wt_vs_top2b_diff_Acomps_df = pd.concat(
    [
        category_regions.assign(Category=category)
        for category, category_regions in wt_vs_top2b_diff_Acomps.items()
    ],
    ignore_index=True,
)

In [None]:
# Save the stacked category table as a TSV with header.
wt_vs_top2b_diff_Acomps_df.to_csv(f'{outDataDir}/250kb_Differential_A_Comps_WT_vs_Top2BKO_Ctrl_ATRA.txt', sep = '\t', index = False)

In [None]:
#For each pairwise comparison - overlap de genes at 24hrs with genes in differential compartments

In [None]:
# Reminder of the comparisons used for the DE-gene overlap below.
comparison_pairs

In [None]:
import biomart

def get_ensembl_mappings():
    """Download an identifier lookup table from Ensembl BioMart.

    Queries the ``hsapiens_gene_ensembl`` dataset for transcript, gene and
    peptide IDs plus Entrez accession/ID, and folds every row into a single
    dictionary.

    Returns:
        dict: maps Ensembl transcript ID, Ensembl gene ID, Ensembl peptide ID
        and Entrez gene accession to the HGNC symbol (which may be an empty
        string), and maps Entrez gene ID to the Ensembl gene ID. Empty
        identifiers are never used as keys.

    Raises:
        ValueError: if a non-blank response line has fewer tab-separated
            fields than requested attributes (for example, when the server
            returns an error message instead of data).

    Requires network access to the BioMart server.
    """
    # Set up connection to server
    server = biomart.BiomartServer('http://useast.ensembl.org/biomart')
    mart = server.datasets['hsapiens_gene_ensembl']

    # List the types of data we want
    attributes = ['ensembl_transcript_id', 'hgnc_symbol',
                  'ensembl_gene_id', 'ensembl_peptide_id',
                  'entrezgene_accession', 'entrezgene_id']

    # Get the mapping between the attributes
    response = mart.search({'attributes': attributes})
    # UTF-8 is a superset of ASCII, so an ASCII payload decodes identically and
    # a stray non-ASCII byte no longer aborts the download.
    data = response.raw.data.decode('utf-8')

    ensembl_to_genesymbol = {}
    for line_number, line in enumerate(data.splitlines(), start=1):
        if not line.strip():
            # Blank lines carry no identifiers.
            continue

        fields = line.split('\t')
        if len(fields) < len(attributes):
            raise ValueError(
                f'Unexpected BioMart response at line {line_number}: expected '
                f'{len(attributes)} tab-separated fields, got {len(fields)}: {line!r}'
            )

        # The entries are in the same order as in the `attributes` variable
        (transcript_id, gene_symbol, ensembl_gene,
         ensembl_peptide, entrezgene_accession, entrezgene_id) = fields[:len(attributes)]

        # Skip empty identifiers so the table never gets a '' key.
        for identifier in (transcript_id, ensembl_gene, ensembl_peptide, entrezgene_accession):
            if identifier:
                ensembl_to_genesymbol[identifier] = gene_symbol
        # Entrez IDs resolve to the Ensembl gene ID, not to the symbol.
        if entrezgene_id:
            ensembl_to_genesymbol[entrezgene_id] = ensembl_gene

    return ensembl_to_genesymbol

In [None]:
# Query BioMart once and keep the identifier lookup table for the cells below.
ensembl_mappings = get_ensembl_mappings()

In [None]:
# Read the gene lists the differential compartment analysis produced for each
# comparison (headerless files, one identifier per row in column 0).
# NOTE(review): the identifiers were described as Ensembl IDs without the ENSG
# prefix and leading zeros, yet the next cell looks them up as keys of the
# BioMart table, where bare numbers are Entrez gene IDs — confirm the ID type.
# Layout: diff_A_gene_lists['<first>vs<second>'][<group>] -> DataFrame
diff_A_gene_lists = {}
for pair in comparison_pairs:
    comparison_key = f'{pair[0]}vs{pair[1]}'
    enrichment_dir = f'DifferentialResult/{comparison_key}_dcHiC/geneEnrichment/comparison_{pair[0]}_vs_{pair[1]}'
    diff_A_gene_lists[comparison_key] = {
        sample_group: pd.read_csv(
            f'{enrichment_dir}/{sample_group}_geneEnrichment/{sample_group}_geneList.anchor.txt',
            header=None,
        )
        for sample_group in pair
    }


In [None]:
# Convert each gene list through the BioMart lookup so it can be matched
# against the DE gene table. Identifiers without a (non-empty) mapping are
# dropped.
# Layout: diff_A_gene_lists_ensembl['<first>vs<second>'][<group>] -> list of mapped IDs
diff_A_gene_lists_ensembl = {}
for pair in comparison_pairs:
    comparison_key = f'{pair[0]}vs{pair[1]}'
    diff_A_gene_lists_ensembl[comparison_key] = {}
    for sample_group in pair:
        raw_identifiers = diff_A_gene_lists[comparison_key][sample_group][0]
        # Keys of the lookup are strings, so every identifier is formatted first.
        lookups = (ensembl_mappings.get(f'{identifier}') for identifier in raw_identifiers)
        diff_A_gene_lists_ensembl[comparison_key][sample_group] = [hit for hit in lookups if hit]


In [None]:
# Directory holding the 24 hr differential-expression table.
gene_data_dir = '..'
# Later cells read '<comparison>_log2FC', '<comparison>_padj' and
# 'ensembl_geneid' columns from this table.
deg_genes_24hr = pd.read_csv(f'{gene_data_dir}/SHSY2019_ATRA_24hr_union_table_test_all_log2.csv')

In [None]:
# Display the DE gene table.
# NOTE(review): splitting the Ensembl gene ID down to its trailing digits was
# planned here but is not done; the table is only displayed and the
# 'ensembl_geneid' column is used unchanged downstream.
deg_genes_24hr

In [None]:
# Compartment comparison (first group, second group) -> column prefix of the
# matching contrast in the DE gene table.
DEG_Comparisons = dict([
    (('WT_ATRA', 'WT_Ctrl'), 'WT_C_WT_R'),
    (('BKO_ATRA', 'BKO_Ctrl'), 'BKO_C_BKO_R'),
    (('BKO_Ctrl', 'WT_Ctrl'), 'WT_C_BKO_C'),
    (('BKO_ATRA', 'WT_ATRA'), 'WT_R_BKO_R'),
])

In [None]:
# Significant DE genes per comparison (padj < 0.05 and |log2FC| > 1.5).
# Genes with log2FC > 1.5 are filed under the first group of the pair, genes
# with log2FC < -1.5 under the second.
# assumes positive log2FC means higher expression in the first group — TODO
# confirm the sign convention of the DE table
deg_lists_24hrs = {}
for pair in comparison_pairs:
    contrast = DEG_Comparisons[pair]
    log2fc = deg_genes_24hr[f'{contrast}_log2FC']
    is_significant = deg_genes_24hr[f'{contrast}_padj'] < 0.05

    up_in_first = deg_genes_24hr[(log2fc > 1.5) & is_significant]
    up_in_second = deg_genes_24hr[(log2fc < -1.5) & is_significant]

    deg_lists_24hrs[f'{pair[0]}vs{pair[1]}'] = {
        pair[0]: list(up_in_first['ensembl_geneid']),
        pair[1]: list(up_in_second['ensembl_geneid']),
    }
        

In [None]:
#overlap the lists for each comparison
# Two rows per comparison (first group, then second group). Each row holds the
# number of DE genes filed under that group, the number of mapped genes in its
# differential-A compartments, and the number of distinct genes in both.
overlap_count_records = []
for pair in comparison_pairs:
    comparison_key = f'{pair[0]}vs{pair[1]}'
    for upreg_cond in pair:
        de_genes = deg_lists_24hrs[comparison_key][upreg_cond]
        diff_comp_genes = diff_A_gene_lists_ensembl[comparison_key][upreg_cond]
        overlap_count_records.append({
            'comparison': comparison_key,
            'upreg_cond': upreg_cond,
            'de_gene_count': len(de_genes),
            'diffAcomp_gene_count': len(diff_comp_genes),
            'both_gene_count': len(set(diff_comp_genes) & set(de_genes)),
        })

# Build the frame once so the count columns are integers, rather than growing
# it with concat onto an empty frame (which leaves them as object dtype).
deg_vs_dchic_overlap_counts = pd.DataFrame(
    overlap_count_records,
    columns=['comparison', 'upreg_cond', 'de_gene_count',
             'diffAcomp_gene_count', 'both_gene_count'],
)

In [None]:
# Show the overlap counts (two rows per comparison).
deg_vs_dchic_overlap_counts

In [None]:
#add p value from hypergeometric distribution, using 25000 as estimate of total genes

In [None]:
# Hypergeometric enrichment p-value for every row of the overlap table:
#   M = total number of genes in genome (estimate)
#   n = total number of differentially expressed genes
#   N = total number of genes overlapping diff compartments
#   k = number of genes overlapping diff comps that are also differentially expressed
# The p-value is P(X >= k), i.e. the survival function evaluated at k - 1.
GENOME_GENE_COUNT = 25000

# Computed row-wise and vectorised, so each p-value is tied to its own row
# instead of relying on list order matching table order. Explicit integer
# arrays also avoid int() on a one-element Series, which pandas deprecates.
de_gene_counts = deg_vs_dchic_overlap_counts['de_gene_count'].astype(int).to_numpy()
diff_comp_gene_counts = deg_vs_dchic_overlap_counts['diffAcomp_gene_count'].astype(int).to_numpy()
shared_gene_counts = deg_vs_dchic_overlap_counts['both_gene_count'].astype(int).to_numpy()

deg_vs_dchic_overlap_counts['pvalue_hyper'] = stats.hypergeom.sf(
    shared_gene_counts - 1,   # k - 1
    GENOME_GENE_COUNT,        # M
    de_gene_counts,           # n
    diff_comp_gene_counts,    # N
)

In [None]:
# Long format for plotting: one row per (comparison, group, count type).
melted = pd.melt(deg_vs_dchic_overlap_counts, id_vars=['comparison', 'upreg_cond'])

In [None]:
# x-axis label combining the comparison with the group it describes.
melted = melted.assign(xaxis=melted['comparison'] + ': Up ' + melted['upreg_cond'])

In [None]:
# Keep only the gene-count rows; p-values are on a different scale and are not plotted.
melted = melted.loc[melted['variable'].ne('pvalue_hyper')]

In [None]:
# Grouped bar plot of the three gene counts for each comparison/group.
overlap_ax = sns.barplot(data=melted, x='xaxis', hue='variable', y='value')
overlap_ax.set_title('Overlap')
overlap_ax.set_ylabel('Gene Count')
overlap_ax.set_xlabel('Comparison')
# Long tick labels: rotate and right-align so they do not collide.
plt.xticks(rotation=45, ha='right')
plt.savefig(f'{outDataDir}/figures/degenes_vs_dchic_overlap_barplot.png', dpi=300, bbox_inches='tight')

In [None]:
# Show the long-format table behind the bar plot.
melted

In [None]:
# Save the overlap counts, including the hypergeometric p-values, as a TSV with header.
deg_vs_dchic_overlap_counts.to_csv(f'{outDataDir}/220930_250kbeigs_deg24hr_vs_dchic_overlap_counts_phyper.txt', sep = '\t', index = False)

In [None]:
# Show the overlap counts again, now with the pvalue_hyper column.
deg_vs_dchic_overlap_counts

In [None]:
#List of interesting genes - concordant changes

In [None]:
# Genes that are both DE (filed under a group) and located in that group's
# differential-A compartments, per comparison.
# Layout: deg_vs_dchic_overlap_genes['<first>vs<second>']['<group>_Up_A'] -> set of gene IDs
deg_vs_dchic_overlap_genes = {}
for pair in comparison_pairs:
    comparison_key = f'{pair[0]}vs{pair[1]}'
    deg_vs_dchic_overlap_genes[comparison_key] = {
        f'{sample_group}_Up_A': (
            set(diff_A_gene_lists_ensembl[comparison_key][sample_group])
            & set(deg_lists_24hrs[comparison_key][sample_group])
        )
        for sample_group in pair
    }


In [None]:
# Flatten the nested comparison -> category -> gene-set mapping into one
# long table (one row per gene) for saving.
# All rows are collected first and the frame is built once, instead of
# concatenating a small frame per category onto an initially empty frame.
overlap_gene_rows = [
    (gene, comparison_key, category)
    for comparison_key, genes_by_category in deg_vs_dchic_overlap_genes.items()
    for category, genes in genes_by_category.items()
    for gene in genes
]
wt_vs_top2b_degene_Acomps_df = pd.DataFrame(
    overlap_gene_rows,
    columns=['Genes', 'Comparison', 'DEG_Up_diffA'],
)

In [None]:
# Save the gene-level overlap table as a TSV with header.
wt_vs_top2b_degene_Acomps_df.to_csv(f'{outDataDir}/data/Differential_Genes_In_Diff_A_Comps_WT_vs_Top2BKO_Ctrl_ATRA.txt', sep = '\t', index = False)

In [None]:
# Show the gene-level overlap table.
wt_vs_top2b_degene_Acomps_df