In [1]:
import pandas as pd
import os

### Import Epigenetics and RNA-seq data

In [33]:
master_folder_path = 'Multiome'

dfs_epigenetics = {}
dfs_transcriptomics = {}

# Iterate through all subdirectories and files in the master folder
for root, dirs, files in os.walk(master_folder_path):
    # If we are at the third level of directories get the histone modification assay data
    if len(root.split("/")) == 3:
        cell_line = root.split("/")[1]
        if cell_line not in dfs_epigenetics:
            dfs_epigenetics[cell_line] = []
        for file in files:
            if file.endswith('.csv'):
                # Construct the full path to the CSV file
                csv_file_path = os.path.join(root, file)
                histone_mark = csv_file_path.split("_")[-1].strip(".csv")
        
                # Read the CSV file using pandas
                df = pd.read_csv(csv_file_path)
                df['annotation'] = df['annotation'].apply(lambda x: x.split(" (")[0])
                df['annotation'] = df['annotation'] + f"_{histone_mark}"
                dfs_epigenetics[cell_line].append(df)
    # If we are at the second level of directories get RNA seq data
    if len(root.split("/")) == 2:
        cell_line = root.split("/")[1]
        for file in files:
            if file.endswith('.tsv'):
                tsv_file_path = os.path.join(root, file)
                rna_seq = pd.read_csv(tsv_file_path, sep = '\t')
                dfs_transcriptomics[cell_line] = rna_seq

### Lets use Mammary Epithelial Cell as an example for now and create the feature matrix

In [34]:
# Take this as an example for now
df_aggregated = pd.concat(dfs_epigenetics['Mammary Epithelial Cell'])

In [35]:
df_aggregated.head()

Unnamed: 0.1,Unnamed: 0,seqnames,start,end,width,strand,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,transcriptId,distanceToTSS
0,1,chr1,100035284,100036150,867,*,3' UTR_H3K4me2,1,100038095,100083377,45283,1,64645,ENST00000370152.8,-1945
1,2,chr1,100036465,100038061,1597,*,Promoter_H3K4me2,1,100038095,100083377,45283,1,64645,ENST00000370152.8,-34
2,3,chr1,100038100,100040871,2772,*,Promoter_H3K4me2,1,100038095,100083377,45283,1,64645,ENST00000370152.8,5
3,4,chr1,100042106,100042601,496,*,Intron_H3K4me2,1,100038095,100083377,45283,1,64645,ENST00000370152.8,4011
4,5,chr1,100042648,100042938,291,*,Intron_H3K4me2,1,100038095,100083377,45283,1,64645,ENST00000370152.8,4553


In [36]:
# Get the number of peaks per gene region
peak_count_df = df_aggregated.groupby(["geneId","annotation"],as_index=False).agg(
    peak_counts = ('annotation','size')
)

In [37]:
feature_matrix = peak_count_df.pivot_table(columns='annotation',index=['geneId'])['peak_counts']

In [38]:
feature_matrix.shape

(28763, 77)

In [39]:
feature_matrix.head()

annotation,3' UTR_H2AFZ,3' UTR_H3K27a,3' UTR_H3K27me3,3' UTR_H3K36me3,3' UTR_H3K4me1,3' UTR_H3K4me2,3' UTR_H3K4me3,3' UTR_H3K79me2,3' UTR_H3K9a,3' UTR_H3K9me3,...,Promoter_H3K27a,Promoter_H3K27me3,Promoter_H3K36me3,Promoter_H3K4me1,Promoter_H3K4me2,Promoter_H3K4me3,Promoter_H3K79me2,Promoter_H3K9a,Promoter_H3K9me3,Promoter_H4K20me1
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,1.0,,,,,,,...,1.0,,,1.0,2.0,2.0,,1.0,2.0,
2,,,,,,,,,,,...,,,,1.0,2.0,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,1.0,,1.0,,1.0,1.0,,,,
10,,,,,,,,,,,...,,,,1.0,1.0,,,,,


### Import RNA-seq data

In [41]:
rna_seq = dfs_transcriptomics['Mammary Epithelial Cell']

In [42]:
rna_seq.head()

Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation
0,10904,10904,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12954,12954,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12956,12956,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12958,12958,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12960,12960,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Lets harmonizome the gene identifiers in the epigenetics data and RNA-seq data to gene names 

In [43]:
mapping = pd.read_table('mart_export_grch38.txt')

In [44]:
mapping.head()

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Gene name,NCBI gene (formerly Entrezgene) ID
0,ENSG00000210049,ENSG00000210049.1,ENST00000387314,ENST00000387314.1,MT-TF,
1,ENSG00000211459,ENSG00000211459.2,ENST00000389680,ENST00000389680.2,MT-RNR1,
2,ENSG00000210077,ENSG00000210077.1,ENST00000387342,ENST00000387342.1,MT-TV,
3,ENSG00000210082,ENSG00000210082.2,ENST00000387347,ENST00000387347.2,MT-RNR2,
4,ENSG00000209082,ENSG00000209082.1,ENST00000386347,ENST00000386347.1,MT-TL1,


In [45]:
ensembl_gene_id_map = mapping.set_index('Gene stable ID version').to_dict()['Gene name']
ensembl_transcript_id_map = mapping.set_index('Transcript stable ID version').to_dict()['Gene name']

# NCBI column has NaNs which turns the integers into floats, so drop NaNs, change floats --> int --> str, and create dict
ncbi_gene_id_map = mapping.dropna(subset='NCBI gene (formerly Entrezgene) ID')
ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'] = ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'].astype(int).astype(str)
ncbi_gene_id_map = ncbi_gene_id_map.set_index('NCBI gene (formerly Entrezgene) ID').to_dict()['Gene name']

# Combine the transcript IDs, gene IDs, and NCBI IDs dictionaries for all possible mappings
mapping_dict = ensembl_transcript_id_map | ensembl_gene_id_map | ncbi_gene_id_map 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'] = ncbi_gene_id_map['NCBI gene (formerly Entrezgene) ID'].astype(int).astype(str)


In [46]:
rna_seq['gene_name'] = rna_seq['gene_id'].map(mapping_dict)
feature_matrix['gene_name'] = feature_matrix.index.astype(str).map(mapping_dict)


In [47]:
rna_seq.dropna(subset='gene_name', inplace = True)

### The below seems like a lot of genes aren't mapped between the epigenetics and RNA-seq datasets... this limits our training data quite a bit

In [48]:
pd.merge(feature_matrix,rna_seq,on='gene_name')

Unnamed: 0,3' UTR_H2AFZ,3' UTR_H3K27a,3' UTR_H3K27me3,3' UTR_H3K36me3,3' UTR_H3K4me1,3' UTR_H3K4me2,3' UTR_H3K4me3,3' UTR_H3K79me2,3' UTR_H3K9a,3' UTR_H3K9me3,...,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation
0,,,,,,,,,,,...,236.0,0.0,8.73,9.28,7.589790,9.902190,0.045273,8.060280,10.516200,0.045239
1,,,,,,,,,,,...,0.0,0.0,0.37,0.39,0.110687,0.676154,0.274713,0.117439,0.718343,0.274743
2,1.0,1.0,,2.0,3.0,2.0,,1.0,,,...,11444.0,0.0,99.75,105.99,96.428500,103.203000,0.011759,102.481000,109.678000,0.011786
3,,,,3.0,,,,,,,...,3032.0,0.0,27.99,29.74,26.401500,29.539100,0.019641,28.076200,31.414700,0.019630
4,,,,,,,,,,,...,0.0,0.0,0.16,0.17,0.016836,0.352125,0.389764,0.017881,0.374190,0.389839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3997,,,,,,,,,,,...,0.0,0.0,0.03,0.04,0.000592,0.078898,0.474567,0.000847,0.084061,0.474802
3998,,,,,,,,,,,...,0.0,0.0,0.15,0.16,0.003261,0.359758,0.480782,0.001904,0.380769,0.480732
3999,,,,,,,,,,,...,1.0,0.0,0.09,0.09,0.002408,0.211425,0.471208,0.002557,0.224624,0.470914
4000,,,,,,,,,,,...,4.0,0.0,0.63,0.67,0.177834,1.164040,0.286296,0.188051,1.236190,0.286235
