In [1]:
import pandas as pd
import os

### Import Epigenetics and RNA-seq data

In [2]:
master_folder_path = 'Multiome'

dfs_epigenetics = {}
dfs_transcriptomics = {}

# Iterate through all subdirectories and files in the master folder
for root, dirs, files in os.walk(master_folder_path):
    # If we are at the third level of directories get the histone modification assay data
    if len(root.split("/")) == 3:
        cell_line = root.split("/")[1]
        dfs_epigenetics[cell_line] = []
        for file in files:
            if file.endswith('.csv'):
                # Construct the full path to the CSV file
                csv_file_path = os.path.join(root, file)
                histone_mark = csv_file_path.split("_")[-1].strip(".csv")
        
                # Read the CSV file using pandas
                df = pd.read_csv(csv_file_path)
                df['annotation'] = df['annotation'].apply(lambda x: x.split(" (")[0])
                df['annotation'] = df['annotation'] + f"_{histone_mark}"
                dfs_epigenetics[cell_line].append(df)
    # If we are at the second level of directories get RNA seq data
    if len(root.split("/")) == 2:
        cell_line = root.split("/")[1]
        for file in files:
            if file.endswith('.tsv'):
                tsv_file_path = os.path.join(root, file)
                rna_seq = pd.read_csv(tsv_file_path, sep = '\t')
                dfs_transcriptomics[cell_line] = rna_seq

### Lets use Mammary Epithelial Cell as an example for now and create the feature matrix

In [3]:
# Take this as an example for now
df_aggregated = pd.concat(dfs_epigenetics['Mammary Epithelial Cell'])

In [4]:
df_aggregated.head()

Unnamed: 0.1,Unnamed: 0,seqnames,start,end,width,strand,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,transcriptId,distanceToTSS
0,1,chr1,100000388,100000681,294,*,Exon_H3K36me3,1,99993546,100023411,29866,1,23443,ENST00000639148.1,6842
1,2,chr1,100008333,100009036,704,*,Exon_H3K36me3,1,99993546,100023411,29866,1,23443,ENST00000639148.1,14787
2,3,chr1,100013273,100013497,225,*,Intron_H3K36me3,1,99993546,100023411,29866,1,23443,ENST00000639148.1,19727
3,4,chr1,100013752,100014072,321,*,Intron_H3K36me3,1,99993546,100023411,29866,1,23443,ENST00000639148.1,20206
4,5,chr1,100015168,100015392,225,*,3' UTR_H3K36me3,1,99993546,100023411,29866,1,23443,ENST00000639148.1,21622


In [5]:
# Get the number of peaks per gene region
peak_count_df = df_aggregated.groupby(["geneId","annotation"],as_index=False).agg(
    peak_counts = ('annotation','size')
)

In [7]:
feature_matrix = peak_count_df.pivot_table(columns='annotation',index=['geneId'])['peak_counts']

In [8]:
feature_matrix.shape

(11382, 7)

In [9]:
feature_matrix.head()

annotation,3' UTR_H3K36me3,5' UTR_H3K36me3,Distal Intergenic_H3K36me3,Downstream_H3K36me3,Exon_H3K36me3,Intron_H3K36me3,Promoter_H3K36me3
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.0,,,,,,
9,,,,,,,1.0
14,1.0,,,1.0,,,
16,5.0,,,,,12.0,17.0
19,,,,,6.0,6.0,


### Import RNA-seq data

In [10]:
rna_seq = dfs_transcriptomics['Mammary Epithelial Cell']

In [21]:
rna_seq.head()

Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation,gene_name
0,10904,10904,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLCAP
223,22883,22883,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CLSTN1
225,23355,23355,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,VPS8
226,23461,23461,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ABCA5
286,25809,25809,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TTLL1


### Lets harmonizome the gene identifiers in the epigenetics data and RNA-seq data to gene names 

In [12]:
mapping = pd.read_table('mart_export_grch38.txt')

In [13]:
mapping.head()

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Gene name,NCBI gene (formerly Entrezgene) ID
0,ENSG00000210049,ENSG00000210049.1,ENST00000387314,ENST00000387314.1,MT-TF,
1,ENSG00000211459,ENSG00000211459.2,ENST00000389680,ENST00000389680.2,MT-RNR1,
2,ENSG00000210077,ENSG00000210077.1,ENST00000387342,ENST00000387342.1,MT-TV,
3,ENSG00000210082,ENSG00000210082.2,ENST00000387347,ENST00000387347.2,MT-RNR2,
4,ENSG00000209082,ENSG00000209082.1,ENST00000386347,ENST00000386347.1,MT-TL1,


In [14]:
ensembl_gene_id_map = mapping.set_index('Gene stable ID version').to_dict()['Gene name']
ensembl_transcript_id_map = mapping.set_index('Transcript stable ID version').to_dict()['Gene name']

# NCBI column has NaNs which turns the integers into floats, so drop NaNs, change floats --> int --> str, and create dict
ncbi_gene_id_map = mapping.dropna(subset='NCBI gene (formerly Entrezgene) ID')
ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'] = ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'].astype(int).astype(str)
ncbi_gene_id_map = ncbi_gene_id_map.set_index('NCBI gene (formerly Entrezgene) ID').to_dict()['Gene name']

# Combine the transcript IDs, gene IDs, and NCBI IDs dictionaries for all possible mappings
mapping_dict = ensembl_transcript_id_map | ensembl_gene_id_map | ncbi_gene_id_map 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'] = ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'].astype(int).astype(str)


In [16]:
rna_seq['gene_name'] = rna_seq['gene_id'].map(mapping_dict)
feature_matrix['gene_name'] = feature_matrix.index.astype(str).map(mapping_dict)


In [17]:
rna_seq.dropna(subset='gene_name', inplace = True)

### The below seems like a lot of genes aren't mapped between the epigenetics and RNA-seq datasets... this limits our training data quite a bit

In [20]:
pd.merge(feature_matrix,rna_seq,on='gene_name')

Unnamed: 0,3' UTR_H3K36me3,5' UTR_H3K36me3,Distal Intergenic_H3K36me3,Downstream_H3K36me3,Exon_H3K36me3,Intron_H3K36me3,Promoter_H3K36me3,gene_name,gene_id,transcript_id(s),...,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation
0,2.0,,,,4.0,12.0,13.0,JAG1,ENSG00000101384.12,"ENST00000254958.10,ENST00000423891.6,ENST00000...",...,11444.00,0.00,99.75,105.99,96.428500,103.203000,0.011759,102.481000,109.678000,0.011786
1,3.0,,,,7.0,3.0,2.0,ALCAM,ENSG00000170017.12,"ENST00000306107.9,ENST00000460954.1,ENST000004...",...,3032.00,0.00,27.99,29.74,26.401500,29.539100,0.019641,28.076200,31.414700,0.019630
2,8.0,,,,1.0,4.0,3.0,C1QBP,ENSG00000108561.8,"ENST00000225698.8,ENST00000570805.1,ENST000005...",...,2415.00,0.00,90.66,96.33,86.624700,94.748200,0.015599,92.040600,100.678000,0.015580
3,1.0,,,,,,,GAS8-AS1,ENSG00000221819.6,"ENST00000408886.4,ENST00000623094.2",...,0.00,0.00,0.15,0.16,0.003490,0.390104,0.499912,0.003707,0.414650,0.499905
4,,,,,,2.0,,S100G,ENSG00000169906.5,ENST00000380200.3,...,0.00,0.00,0.11,0.11,0.000003,0.317267,0.657416,0.000003,0.337123,0.657527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132,,,,,,1.0,,ZNF710-AS1,ENSG00000259291.2,"ENST00000558334.1,ENST00000620791.1",...,12.00,0.00,0.36,0.38,0.180287,0.560744,0.189055,0.191685,0.596078,0.188964
1133,1.0,,,,1.0,,2.0,LAMC1-AS1,ENSG00000224468.3,ENST00000457852.3,...,0.00,0.00,0.10,0.11,0.000002,0.295451,0.657439,0.000003,0.313975,0.657322
1134,,,,,,,2.0,MVP-DT,ENSG00000238045.9,"ENST00000563806.1,ENST00000564980.1,ENST000005...",...,22.00,0.00,1.14,1.22,0.616980,1.708690,0.167849,0.653801,1.814020,0.167858
1135,,,,,4.0,3.0,3.0,ABCF2-H2BK1,ENSG00000285292.1,ENST00000222388.6,...,297.64,49.22,5.39,5.72,3.601470,7.188580,0.122905,3.828690,7.642080,0.122929
