In [1]:
import pandas as pd
import muon as mu
import scanpy as sc
from muon import atac as ac
import numpy as np
import warnings
import pychromvar as pc
import sys
import os
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_dir = os.path.abspath('')
data_path = os.path.join(file_dir, '..', '..', 're_design', '10x_data')

h5_file_path = os.path.join(data_path, 'pbmc3k_multi.h5mu')
save_path = os.path.join(file_dir, 'generated_data')

In [3]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mdata = mu.read_h5mu(h5_file_path)
mdata

In [4]:
mdata['cre'].var

Unnamed: 0,directionality,orientation,bias_strand,fwd_count,rev_count,chrom,summit_center,abs_summit_diff,oritn_summit_diff,start,...,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,highly_variable,means,dispersions,dispersions_norm,frag_length,gc_bias
chr1_29236_29737_-,0.000000,unidirectional,-,0,68,chr1,29337,-1,0,29236,...,68,0.017951,98.204857,68.0,False,0.035708,0.799343,-0.712017,501,0.738523
chr1_199774_200275_-,0.021505,divergent,-,2,184,chr1,200006,262,262,199774,...,177,0.048046,95.327350,182.0,False,0.098490,0.913435,-0.315133,501,0.742515
chr1_629102_629950_+,-0.012048,convergent,+,6765,41,chr1,629793,307,-307,629102,...,2715,1.748944,28.326294,6625.0,False,1.550868,1.426006,-0.121210,848,0.417453
chr1_630597_631171_+,0.000383,divergent,+,192997,37,chr1,630948,246,246,630597,...,3766,49.658924,0.580781,188108.0,False,4.682851,3.148479,0.070629,574,0.451220
chr1_632356_633841_+,0.100447,divergent,+,7224,382,chr1,633051,963,963,632356,...,3024,2.062830,20.168955,7814.0,False,1.718540,1.464683,-0.212451,1485,0.426936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY_21028687_21029188_-,0.000000,unidirectional,-,0,22,chrY,21028788,-1,0,21028687,...,15,0.005016,99.604013,19.0,False,0.010252,1.222718,0.760744,501,0.415170
chrY_21138232_21138733_+,0.043478,divergent,+,90,2,chrY,21138531,204,204,21138232,...,68,0.023231,98.204857,88.0,False,0.040800,0.979056,-0.086863,501,0.379242
chrY_21254335_21254836_+,0.089552,divergent,+,64,3,chrY,21254594,284,284,21254335,...,57,0.017423,98.495248,66.0,False,0.032475,0.960442,-0.151615,501,0.317365
chrY_57067464_57067965_+,0.000000,unidirectional,+,406,0,chrY,57067865,-1,0,57067464,...,356,0.104805,90.601901,397.0,False,0.196913,1.011132,0.024714,501,0.590818


# Differentially accessible peaks based on Leiden clustering on tCRE

In [5]:
# rank the differentially expressed genes  
sc.tl.rank_genes_groups(mdata.mod['cre'], 'leiden_euclidean', method='t-test')

In [6]:
# transform into format peak,cluster and filter by p-value and take only first n
from collections import defaultdict 
rank_peak_groups = mdata.mod['cre'].uns['rank_genes_groups']
filter_by_pvalue = pd.DataFrame(rank_peak_groups['pvals_adj'])<0.05
de_cre = pd.DataFrame(rank_peak_groups['names'])[filter_by_pvalue].iloc[1:10000,:]
cre_cluster  = defaultdict(list)
# flatten into list of unique genes in format gene:cluster 
for _, row in de_cre.iterrows():
    for cluster, peak in enumerate(row):
        if cluster is not None and cluster not in cre_cluster[peak]:
            cre_cluster[peak].append(cluster)

In [7]:
print("Total cre:", len(cre_cluster), "; more than 1 cluster cre:",sum([len(x)>1 for x in cre_cluster.values()]))

Total cre: 12409 ; more than 1 cluster cre: 2402


# Final dataset

In [33]:
mdata['cre'].var['peak_seq'] = mdata.mod['cre'].uns['peak_seq']
full_data = mdata['cre'].var.reset_index(names='interval')
# subset only required columns
full_data = full_data[['interval','chrom','start','end','summit_center','peak_seq']]

# new column cluster by mapping genes to cluster with help of gene_cluster dict
full_data['cell_type'] = full_data['interval'].map(cre_cluster)

full_data = full_data.explode('cell_type')
full_data = full_data[full_data.cell_type.notna()]

full_data['cell_type'] = "ct"+(full_data.cell_type.astype(int)).astype(str)
full_data.reset_index(drop=True,inplace=True)
# for data constitencty with legacy code
full_data.rename(columns={'peak_seq':'sequence', 'interval':'peak'}, inplace=True)

full_data = full_data.drop(full_data[full_data.sequence.str.contains("N")].index).reset_index(drop=True)
full_data

Unnamed: 0,peak,chrom,start,end,summit_center,sequence,cell_type
0,chr1_29236_29737_-,chr1,29236,29737,29337,CTCCCTCCAGCCCCTCCGGGTCCCCTACTTCGCCCCGCCAGGCCCC...,ct2
1,chr1_199774_200275_-,chr1,199774,200275,200006,CCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGCGGCC...,ct1
2,chr1_199774_200275_-,chr1,199774,200275,200006,CCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGCGGCC...,ct2
3,chr1_629102_629950_+,chr1,629102,629950,629793,AGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTC...,ct2
4,chr1_630597_631171_+,chr1,630597,631171,630948,CAAAACCCACCCCATTCCTCCCCACACTCATCGCCCTTACCACACT...,ct1
...,...,...,...,...,...,...,...
14800,chrY_20575222_20575876_+,chrY,20575222,20575876,20575706,GTCAGGGGTTTGAGAGCCTGGCCAACATGGTGAAACCCCATCTCTA...,ct2
14801,chrY_21028687_21029188_-,chrY,21028687,21029188,21028788,TGTGGTGAGTGTTATAGCTCATAATGTTGGCACGGACCCAAACAGT...,ct2
14802,chrY_21138232_21138733_+,chrY,21138232,21138733,21138531,ACATGACTTGCATATTTAGCATGTTAACTGCTTCATTTGGGGAGCT...,ct0
14803,chrY_21254335_21254836_+,chrY,21254335,21254836,21254594,AAATAATAAAGTGTATTATTTATCTGTTTTACATACTGTTGGTTTT...,ct0


In [34]:
# check for nan values
full_data.isna().any()

peak             False
chrom            False
start            False
end              False
summit_center    False
sequence         False
cell_type        False
dtype: bool

In [35]:
print("Total peaks:", full_data.shape[0],"; more than 1 cluster:" ,full_data[full_data['peak'].duplicated()].shape[0])

Total peaks: 14805 ; more than 1 cluster: 2400


In [36]:
print("File statistics")
print(full_data['cell_type'].value_counts())
print(full_data['chrom'].value_counts())

File statistics
cell_type
ct2    5968
ct1    5443
ct0    3394
Name: count, dtype: int64
chrom
chr1     1484
chr19    1061
chr2      997
chr17     969
chr11     812
chr12     801
chr6      794
chr7      777
chr3      770
chr5      673
chr16     658
chr14     580
chr9      557
chr10     541
chr4      513
chr8      491
chr15     459
chrX      421
chr20     387
chr22     352
chr13     259
chr21     201
chr18     194
chrY       30
chrM       24
Name: count, dtype: int64


In [37]:
full_data['sequence'].str.len().describe()

count    14805.000000
mean       567.715907
std        159.502815
min        501.000000
25%        501.000000
50%        501.000000
75%        501.000000
max       3654.000000
Name: sequence, dtype: float64

In [38]:
full_data[full_data['chrom']=='chr1']['cell_type'].value_counts()

cell_type
ct2    579
ct1    569
ct0    336
Name: count, dtype: int64

In [23]:
file_path = os.path.join(save_path, 'tcre_seq_leiden_cluster.csv')

full_data.to_csv(file_path, index=False)

# Explore

In [47]:
full_data = pd.read_csv(f'{save_path}/tcre_seq_leiden_cluster.csv')
full_data

Unnamed: 0,peak,chrom,start,end,summit_center,sequence,cell_type
0,chr1_29236_29737_-,chr1,29236,29737,29337,CTCCCTCCAGCCCCTCCGGGTCCCCTACTTCGCCCCGCCAGGCCCC...,ct2
1,chr1_199774_200275_-,chr1,199774,200275,200006,CCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGCGGCC...,ct1
2,chr1_199774_200275_-,chr1,199774,200275,200006,CCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGCGGCC...,ct2
3,chr1_629102_629950_+,chr1,629102,629950,629793,AGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTC...,ct2
4,chr1_630597_631171_+,chr1,630597,631171,630948,CAAAACCCACCCCATTCCTCCCCACACTCATCGCCCTTACCACACT...,ct1
...,...,...,...,...,...,...,...
14800,chrY_20575222_20575876_+,chrY,20575222,20575876,20575706,GTCAGGGGTTTGAGAGCCTGGCCAACATGGTGAAACCCCATCTCTA...,ct2
14801,chrY_21028687_21029188_-,chrY,21028687,21029188,21028788,TGTGGTGAGTGTTATAGCTCATAATGTTGGCACGGACCCAAACAGT...,ct2
14802,chrY_21138232_21138733_+,chrY,21138232,21138733,21138531,ACATGACTTGCATATTTAGCATGTTAACTGCTTCATTTGGGGAGCT...,ct0
14803,chrY_21254335_21254836_+,chrY,21254335,21254836,21254594,AAATAATAAAGTGTATTATTTATCTGTTTTACATACTGTTGGTTTT...,ct0


In [48]:
data = full_data.drop(full_data[full_data.sequence.str.contains("N")].index).reset_index(drop=True)
data['sequence'] = data['sequence'].str[-200:]
data

Unnamed: 0,peak,chrom,start,end,summit_center,sequence,cell_type
0,chr1_29236_29737_-,chr1,29236,29737,29337,ACTCCGAGCTCCCGACGTGCACACGGCTCCCATGCGTTGTCTTCCG...,ct2
1,chr1_199774_200275_-,chr1,199774,200275,200006,ACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCCTACCCG...,ct1
2,chr1_199774_200275_-,chr1,199774,200275,200006,ACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCCTACCCG...,ct2
3,chr1_629102_629950_+,chr1,629102,629950,629793,CTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTCGTTCCA...,ct2
4,chr1_630597_631171_+,chr1,630597,631171,630948,TCAATATGAAAATCACCTCAGAGCTGGTAAAAAGAGGCTTAACCCC...,ct1
...,...,...,...,...,...,...,...
14800,chrY_20575222_20575876_+,chrY,20575222,20575876,20575706,AGGCGGGGAAAAGCATCGTAATCAGCTGCGTCGCCTTTTGGTGACG...,ct2
14801,chrY_21028687_21029188_-,chrY,21028687,21029188,21028788,GATAGATAGAAAAGTTATCCCAGTCCCCACCCAAACCAGAAGCCCA...,ct2
14802,chrY_21138232_21138733_+,chrY,21138232,21138733,21138531,GGAGTTGCACACACAGGTTTACTGATAAGAGAAGTTACTCAAACTG...,ct0
14803,chrY_21254335_21254836_+,chrY,21254335,21254836,21254594,TTTTTGTCGGGTGGAAGCATGAATACTTGTTATTCAAGTGTTCAGG...,ct0


In [49]:
train_data = data[(data["chrom"]!= "chr1") & (data["chrom"] != "chr2")].reset_index(drop=True)
train_data['cell_type'].value_counts()

cell_type
ct2    5007
ct1    4494
ct0    2823
Name: count, dtype: int64

In [50]:
test_data = data[data['chrom'] == "chr1"].reset_index(drop=True)
test_data['cell_type'].value_counts()

cell_type
ct2    579
ct1    569
ct0    336
Name: count, dtype: int64