In [1]:
import pandas as pd
import cassiopeia as cas

In [2]:
input_files = [
    "/media/chang/HDD-1/chang/weissman/fastq/A22_S1_L001_R1_001.fastq.gz", "/media/chang/HDD-1/chang/weissman/fastq/A22_S1_L001_R2_001.fastq.gz"
]

In [3]:
bam_fp = cas.pp.convert_fastqs_to_unmapped_bam(
    input_files,
    chemistry='10xv3',
    output_directory="/media/chang/HDD-1/chang/weissman/",
    name="Brain",
    n_threads=16
)

[2022-02-22 00:00:54,030]    INFO [convert] Starting...
[2022-02-22 00:00:54,031]    INFO [convert] Using 10xv3 chemistry.
Writing BAM: 3673393it [04:19, 14172.34it/s]
[2022-02-22 00:05:14,162]    INFO [convert] Finished in 260.13217997550964 s.


In [4]:
bam_fp = cas.pp.filter_bam(
    bam_fp,
    output_directory="/media/chang/HDD-1/chang/weissman/",
    quality_threshold=10,
    n_threads=16,
)

[2022-02-22 00:05:35,366]    INFO [filter_bam] Starting...
Applying BAM: 3673393it [01:07, 54238.63it/s]
[2022-02-22 00:06:43,617]    INFO [filter_bam] Filtered 1129 reads that didn't pass the filter.
[2022-02-22 00:06:43,618]    INFO [filter_bam] Finished in 68.2519052028656 s.


In [None]:
bam_fp = cas.pp.error_correct_cellbcs_to_whitelist(
    bam_fp,
    whitelist='/home/chang/3M-february-2018.txt',
    output_directory="/media/chang/HDD-1/chang/weissman/",
    n_threads=16,
)

[2022-02-22 00:06:43,625]    INFO [error_correct_cellbcs_to_whitelist] Starting...
[2022-02-22 00:06:54,924]    INFO [error_correct_cellbcs_to_whitelist] Detected 80243 raw barcodes.
[1/4] Finding exact matches: 100%|##########| 80243/80243 [00:00<00:00, 501426.58it/s]
[2/4] Constructing masks: 100%|##########| 6794880/6794880 [03:15<00:00, 34733.46it/s]
[3/4] Finding mismatches:   3%|3         | 1102/34524 [02:43<1:22:28,  6.75it/s]

In [188]:
umi_table = cas.pp.collapse_umis(
    bam_fp,
    output_directory="/media/chang/HDD-1/chang/weissman/",
    max_hq_mismatches=3,
    max_indels=2,
    method='likelihood',
    n_threads=16,
)

[2022-02-22 20:03:49,444]    INFO [collapse] Starting...
[2022-02-22 20:03:49,447]    INFO [collapse] Using BAM tag `CB` as cell barcodes


Merging sorted chunks:   0%|          | 0/3640746 [00:00<?, ?it/s]

[2022-02-22 20:07:02,627]    INFO [collapse] Sorted bam directory saved to /media/chang/HDD-1/chang/weissman/Brain_unmapped_filtered_corrected_sorted.bam
[2022-02-22 20:07:02,628]    INFO [collapse] Max read length of 301
[2022-02-22 20:07:02,629]    INFO [collapse] Total reads: 3640746
Collapsing UMIs: 100%|##########| 532555/532555 [03:50<00:00, 2310.38it/s]


Writing collapsed UMIs:   0%|          | 0/532555 [00:00<?, ?it/s]

[2022-02-22 20:11:45,658]    INFO [collapse] Collapsed bam directory saved to /media/chang/HDD-1/chang/weissman/Brain_unmapped_filtered_corrected_sorted.collapsed.bam
[2022-02-22 20:11:45,660]    INFO [collapse] Converted dataframe saved to /media/chang/HDD-1/chang/weissman/Brain_unmapped_filtered_corrected_sorted.collapsed.txt
[2022-02-22 20:11:45,661]    INFO [collapse] Finished in 476.2165410518646 s.


In [72]:
umi_table2 = cas.pp.resolve_umi_sequence(
    umi_table,
    output_directory="/media/chang/HDD-1/chang/weissman/",
    min_umi_per_cell=1,
    min_avg_reads_per_umi=1.0,
    plot=False)

[2022-02-22 17:22:48,332]    INFO [resolve] Starting...


Resolving UMI sequences:   0%|          | 0/532555 [00:00<?, ?it/s]

[2022-02-22 17:24:19,893]    INFO [resolve] Filtered out 170292 reads.
[2022-02-22 17:24:20,660]    INFO [resolve] Filtered out 0 cells with too few UMIs or too few average number of reads per UMI.
[2022-02-22 17:24:20,662]    INFO [resolve] Filtered out 0 UMIs as a result.
[2022-02-22 17:24:22,132]    INFO [resolve] Finished in 93.7993597984314 s.


In [None]:
umi_table2.to_csv("/media/chang/HDD-1/chang/weissman/raw_umi_table_min1.csv")

In [189]:
umi_table = cas.pp.resolve_umi_sequence(
    umi_table,
    output_directory="/media/chang/HDD-1/chang/weissman/",
    min_umi_per_cell=5,
    min_avg_reads_per_umi=2.0,
    plot=False)

[2022-02-22 20:11:45,668]    INFO [resolve] Starting...


Resolving UMI sequences:   0%|          | 0/532555 [00:00<?, ?it/s]

[2022-02-22 20:13:17,651]    INFO [resolve] Filtered out 170292 reads.
[2022-02-22 20:13:18,472]    INFO [resolve] Filtered out 45917 cells with too few UMIs or too few average number of reads per UMI.
[2022-02-22 20:13:18,489]    INFO [resolve] Filtered out 70860 UMIs as a result.
[2022-02-22 20:13:19,630]    INFO [resolve] Finished in 93.96177959442139 s.


In [190]:
umi_table = cas.pp.align_sequences(
    umi_table,
    ref_filepath='/media/chang/HDD-1/chang/weissman/PCT48-long.ref.fa',
    gap_open_penalty=6,
    gap_extend_penalty=1,
    n_threads=16,
)

[2022-02-22 20:13:19,636]    INFO [align] Starting...
Aligning sequences to reference: 100%|##########| 190371/190371 [06:44<00:00, 470.25it/s]
[2022-02-22 20:20:05,561]    INFO [align] Finished in 405.9250557422638 s.


In [73]:
umi_table2 = cas.pp.align_sequences(
    umi_table2,
    ref_filepath='/media/chang/HDD-1/chang/weissman/PCT48-long.ref.fa',
    gap_open_penalty=7,
    gap_extend_penalty=1,
    n_threads=16,
)

[2022-02-22 17:24:22,138]    INFO [align] Starting...
Aligning sequences to reference: 100%|##########| 235583/235583 [07:59<00:00, 491.53it/s]
[2022-02-22 17:32:22,759]    INFO [align] Finished in 480.62117528915405 s.


In [79]:
umi_table2.to_csv("/media/chang/HDD-1/chang/weissman/raw_umi_table_min1.csv")

In [191]:
umi_table = cas.pp.call_alleles(
    umi_table,
    ref_filepath='/media/chang/HDD-1/chang/weissman/PCT48-long.ref.fa',
    barcode_interval=(20, 34),
    cutsite_locations=[112, 166, 220],
    cutsite_width=12,
    context=True,
    context_size=5,
)

[2022-02-22 20:20:05,609]    INFO [call_alleles] Starting...


Parsing CIGAR strings into indels:   0%|          | 0/461695 [00:00<?, ?it/s]

[2022-02-22 20:21:16,637]    INFO [call_alleles] Finished in 71.02811765670776 s.


In [74]:
umi_table2 = cas.pp.call_alleles(
    umi_table2,
    ref_filepath='/media/chang/HDD-1/chang/weissman/PCT48-long.ref.fa',
    barcode_interval=(20, 34),
    cutsite_locations=[112, 166, 220],
    cutsite_width=12,
    context=True,
    context_size=5,
)

[2022-02-22 17:32:22,815]    INFO [call_alleles] Starting...


Parsing CIGAR strings into indels:   0%|          | 0/532555 [00:00<?, ?it/s]

[2022-02-22 17:33:41,278]    INFO [call_alleles] Finished in 78.46285104751587 s.


In [192]:
umi_table = cas.pp.error_correct_umis(
    umi_table,
    max_umi_distance=2,
    allow_allele_conflicts=False,
    n_threads=16,
)

[2022-02-22 20:21:16,678]    INFO [error_correct_umis] Starting...
Error-correcting UMIs: 100%|##########| 116784/116784 [01:52<00:00, 1039.00it/s]
[2022-02-22 20:24:28,495]    INFO [error_correct_umis] 22506 UMIs Corrected of 439189(5.124%)
[2022-02-22 20:24:39,963]    INFO [error_correct_umis] Finished in 203.28502321243286 s.


In [80]:
umi_table2 = cas.pp.error_correct_umis(
    umi_table2,
    max_umi_distance=2,
    allow_allele_conflicts=False,
    n_threads=16,
)

[2022-02-22 17:49:34,919]    INFO [error_correct_umis] Starting...
Error-correcting UMIs: 100%|##########| 180771/180771 [03:02<00:00, 992.30it/s] 
[2022-02-22 17:54:38,546]    INFO [error_correct_umis] 24377 UMIs Corrected of 508178(4.797%)
[2022-02-22 17:54:52,376]    INFO [error_correct_umis] Finished in 317.4563179016113 s.


In [82]:
umi_table2.to_csv("/media/chang/HDD-1/chang/weissman/raw_umi_table_min1_ec.csv")

In [193]:
umi_table = cas.pp.filter_molecule_table(
    umi_table,
    output_directory='/media/chang/HDD-1/chang/weissman/',
    min_umi_per_cell=5,
    min_avg_reads_per_umi=2.0,
    min_reads_per_umi=-1,
    intbc_prop_thresh=0.5,
    intbc_umi_thresh=5,
    intbc_dist_thresh=1,
    doublet_threshold=0.35,
    allow_allele_conflicts=False,
    plot=False,
)

[2022-02-22 20:24:40,259]    INFO [filter_molecule_table] Starting...
[2022-02-22 20:24:40,527]    INFO [filter_molecule_table] Logging initial stats...
[2022-02-22 20:24:40,568]    INFO [filter_molecule_table] Filtering UMIs with less than 2.0 reads...
[2022-02-22 20:24:41,068]    INFO [filter_molecule_table] Filtering out cellBCs with fewer than 5 UMIs andless than 2.0 average reads per UMI...
[2022-02-22 20:24:41,367]    INFO [filter_molecule_table] Filtered out 3202 cells with too few UMIs or too few average number of reads per UMI.
[2022-02-22 20:24:41,374]    INFO [filter_molecule_table] Filtered out 8987 UMIs as a result.
[2022-02-22 20:24:41,732]    INFO [filter_molecule_table] Error correcting intBCs...


Error Correcting intBCs:   0%|          | 0/47228 [00:00<?, ?it/s]

[2022-02-22 20:25:45,259]    INFO [filter_molecule_table] Filtering out intra-lineage group doublets with proportion 0.35...
[2022-02-22 20:25:45,888]    INFO [filter_molecule_table] Mapping remaining intBC conflicts...
[2022-02-22 20:25:49,079]    INFO [filter_molecule_table] Overall, filtered 9247 cells, with 228595 UMIs.
[2022-02-22 20:25:49,122]    INFO [filter_molecule_table] Finished in 68.86279153823853 s.


In [83]:
umi_table2 = cas.pp.filter_molecule_table(
    umi_table2,
    output_directory='/media/chang/HDD-1/chang/weissman/raw',
    min_umi_per_cell=1,
    min_avg_reads_per_umi=1.0,
    min_reads_per_umi=1,
    intbc_prop_thresh=0.5,
    intbc_umi_thresh=1,
    intbc_dist_thresh=1,
    doublet_threshold=None,
    allow_allele_conflicts=False,
    plot=False,
)

[2022-02-22 17:55:07,534]    INFO [filter_molecule_table] Starting...
[2022-02-22 17:55:07,879]    INFO [filter_molecule_table] Logging initial stats...
[2022-02-22 17:55:07,880]    INFO [filter_molecule_table] Filtering UMIs with less than 1 reads...
[2022-02-22 17:55:08,656]    INFO [filter_molecule_table] Filtering out cellBCs with fewer than 1 UMIs andless than 1.0 average reads per UMI...
[2022-02-22 17:55:09,314]    INFO [filter_molecule_table] Filtered out 0 cells with too few UMIs or too few average number of reads per UMI.
[2022-02-22 17:55:09,315]    INFO [filter_molecule_table] Filtered out 0 UMIs as a result.
[2022-02-22 17:55:09,988]    INFO [filter_molecule_table] Error correcting intBCs...


Error Correcting intBCs:   0%|          | 0/197239 [00:00<?, ?it/s]

[2022-02-22 18:00:29,071]    INFO [filter_molecule_table] Mapping remaining intBC conflicts...
[2022-02-22 18:00:37,237]    INFO [filter_molecule_table] Overall, filtered 59731 cells, with 430189 UMIs.
[2022-02-22 18:00:37,332]    INFO [filter_molecule_table] Finished in 329.79862427711487 s.


In [84]:
umi_table2

Unnamed: 0,readName,AlignmentScore,CIGAR,QueryBegin,ReferenceBegin,Seq,UMI,allele,cellBC,intBC,r1,r2,r3,readCount,status
0,ACAAGCTTCCTTAGTC_GCTAATAGGTGA_67,1269,115M1I51M9D1M1D103M15D4M5D7M1D3M5I5M,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,GCTAATAGGTGA,AAAAA[116:1I]ATGGCCGATAT[167:9D]ACCGTATATA[177...,ACAAGCTTCCTTAGTC,CATCCGGTCCTTCA,AAAAA[116:1I]ATGGCC,GATAT[167:9D]ACCGTATATA[177:1D]CCGTG,ATTCG[None]CGGAG,67,good
1,TCTCAGCTCGGCTGTG_ACCGGGGCTAGC_66,1009,113M12I40M19D48M2D25M,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,ACCGGGGCTAGC,CGAAA[114:12I]GTGTACGCCGAAAATGGGTTGG[154:19D]T...,TCTCAGCTCGGCTGTG,CATCCGGTCCTTCA,CGAAA[114:12I]GTGTACGCCGAAAATGG,GTTGG[154:19D]TTCAT,ATTCG[221:2D]GAGGA,66,good
2,CCGATCTTCATTGTTC_GTGATCCATTGG_56,1061,112M53D115M15D4M5D7M1D3M5I2M9I3M1D2M2D2M1D2M1D...,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,GTGATCCATTGG,CCGAA[113:53D]TCTCTCCGAA[113:53D]TCTCTATTCG[No...,CCGATCTTCATTGTTC,CATCCGGTCCTTCA,CCGAA[113:53D]TCTCT,CCGAA[113:53D]TCTCT,ATTCG[None]CGGAG,56,good
3,ACTGTCCCACACAGAG_TATACTTTTTGG_54,1273,106M13D161M15D4M5D7M1D3M5I5M,0,0,AATCCAGCTAGCTGTGCAGCCGTTACAAGCTAACATTCAACTGCAG...,TATACTTTTTGG,TGTAC[107:13D]CTGACGATAT[None]CTCTGATTCG[None]...,ACTGTCCCACACAGAG,CGTTACAAGCTAAC,TGTAC[107:13D]CTGAC,GATAT[None]CTCTG,ATTCG[None]CGGAG,54,good
4,TCAGCAAGTCAGGCAA_AGTCCGAATACA_51,739,112M110D58M15D4M5D7M1D3M2D1M7D2M2D8M1I7M2D6M4D...,0,0,AATCCAGCTAGCTGTGCAGCGTTCTGATCAGAGCATTCAACTGCAG...,AGTCCGAATACA,CCGAA[113:110D]GAGGACCGAA[113:110D]GAGGACCGAA[...,TCAGCAAGTCAGGCAA,GTTCTGATCAGAGC,CCGAA[113:110D]GAGGA,CCGAA[113:110D]GAGGA,CCGAA[113:110D]GAGGA,51,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430184,ATACCGAAGTATGGAT_GACAAGGTAATT_1,1148,106M21D153M15D4M5D7M1D3M2D1M7D2M2D8M1I8M2I6M,0,0,AATACAGATAGCAGAACAGCAAAATAAACAGAACATACAACTACAG...,GACAAGGTAATT,TGTAC[107:21D]TAAGCGATAT[None]CTCTGATTCG[None]...,ATACCGAAGTATGGAT,AAAATAAACAGAAC,TGTAC[107:21D]TAAGC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good
430185,TACCCACGTGTTAAAG_CTATATCTGACT_1,1180,115M2I166M2D2M1D7M6D3M4D3M5D3M,0,0,AATACAGCTAGCAAAACAGACGATAAAAACTAAAATTAAACTGCAG...,CTATATCTGACT,AAAAA[116:2I]AATAGCCGATAT[None]CTCTGATTCG[None...,TACCCACGTGTTAAAG,CGATAAAAACTAAA,AAAAA[116:2I]AATAGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good
430186,TACCCACGTGTTAAAG_TTGGGCCGGATT_1,1191,115M2I165M15D4M5D7M1D3M,0,0,AATACAGCTAGCAAAGCAGCCGATACAAGCAAACATACAAATGCAG...,TTGGGCCGGATT,AAAAA[116:2I]AATGGCCGATAT[None]CTCTGATTCG[None...,TACCCACGTGTTAAAG,CGATACAAGCAAAC,AAAAA[116:2I]AATGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good
430187,TACCCACGTGTTAAAG_TCGGCAATCCAT_1,1101,2M2D111M2I165M15D5M5D6M1D3M,2,0,AAAACAGATAGAAATAAAGCCGTTAAAAACTAAAATACAACAGAAG...,TCGGCAATCCAT,AAAAA[116:2I]AATGGCCAATAT[None]ATCTGATTCG[None...,TACCCACGTGTTAAAG,CGTTAAAAACTAAA,AAAAA[116:2I]AATGGCC,AATAT[None]ATCTG,ATTCG[None]CGGAG,1,good


In [85]:
umi_table2.to_csv("/media/chang/HDD-1/chang/weissman/raw_umi_table_min1_ec_filtered.csv")

In [194]:
allele_table = cas.pp.call_lineage_groups(
    umi_table,
    output_directory='/media/chang/HDD-1/chang/weissman/',
    min_umi_per_cell=5,
    min_avg_reads_per_umi=2.0,
    min_cluster_prop=0.005,
    min_intbc_thresh=0.05,
    inter_doublet_threshold=0.35,
    kinship_thresh=0.25,
    plot=True,
)

[2022-02-22 20:25:49,364]    INFO [call_lineages] Starting...
[2022-02-22 20:25:49,366]    INFO [call_lineages] 228595 UMIs (rows), with 15 attributes (columns)
[2022-02-22 20:25:49,436]    INFO [call_lineages] 9247 Cells
[2022-02-22 20:25:50,260]    INFO [call_lineages] Assigning initial lineage groups...
[2022-02-22 20:25:50,261]    INFO [call_lineages] Clustering with minimum cluster size 46...
[2022-02-22 20:25:50,539]    INFO [call_lineages] Refining lineage groups...
[2022-02-22 20:25:50,539]    INFO [call_lineages] Redefining lineage groups by removing low proportion intBCs...
[2022-02-22 20:25:50,748]    INFO [call_lineages] Reassigning cells to refined lineage groups by kinship...
[2022-02-22 20:25:50,774]    INFO [call_lineages] Annotating alignment table with refined lineage groups...
[2022-02-22 20:25:53,550]    INFO [call_lineages] Filtering out inter-lineage group doublets with proportion 0.35...
[2022-02-22 20:25:56,365]    INFO [call_lineages] Filtering out low proporti

In [195]:
allele_table

Unnamed: 0,cellBC,intBC,allele,r1,r2,r3,lineageGrp,UMI,readCount
0,AAACCCAAGCACTCTA,CATCCGGTCCTTCA,CGAAA[114:2D]TGGCCGATAT[167:2I]GACCTGGATGAC[16...,CGAAA[114:2D]TGGCC,GATAT[167:2I]GACCTGGATGAC[168:1D]CTGGT,ATTCG[221:2D]GAGGA,1,8,92
1,AAACCCAAGCACTCTA,GTTAATGAGAAATT,CGAAA[114:2D]TGGCCGATAT[None]CTCTGATTCG[None]C...,CGAAA[114:2D]TGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,5,33
2,AAACCCAAGCACTCTA,TTCACTTCTCACGC,CGAAA[114:2D]TGGCCGATAT[None]CTCTGATTCG[None]C...,CGAAA[114:2D]TGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,2,10
3,AAACCCAAGCCAGTAG,CATTTATCACCTGA,CCGAA[None]AAATGAACGA[164:1D]ACATGACGAA[166:3D...,CCGAA[None]AAATG,AACGA[164:1D]ACATGACGAA[166:3D]CATGGCGAAC[170:...,ATTCG[None]CGGAG,1,1,9
4,AAACCCAAGCCAGTAG,GTTAATGAGAAATT,AAAAA[116:1I]ATGGCCGATAT[None]CTCTGATTCG[None]...,AAAAA[116:1I]ATGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,8
...,...,...,...,...,...,...,...,...,...
26025,TTTGTTGTCTAATTCC,TTCACTTCTCACGC,CCGAA[None]AAATGGATAT[None]CTCTGATTCG[None]CGGAG,CCGAA[None]AAATG,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,2,11
26026,TTTGTTGTCTTAGTTC,CATTTATCACCTGA,CCGAA[None]AAATGGATAT[None]CTCTGATTCG[None]CGGAG,CCGAA[None]AAATG,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,8
26027,TTTGTTGTCTTAGTTC,CGTTACAAGCTAAC,CCGAA[113:3D]TGGCCGATAT[None]CTCTGATTCG[None]C...,CCGAA[113:3D]TGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,2,26
26028,TTTGTTGTCTTAGTTC,GTTAATGAGAAATT,AAAAA[116:1I]ATGGCCGATAT[None]CTCTGATTCG[None]...,AAAAA[116:1I]ATGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,5


In [196]:
allele_table.to_csv("/media/chang/HDD-1/chang/weissman/filtered_allele_table.csv")

In [86]:
allele_table2 = cas.pp.call_lineage_groups(
    umi_table2,
    output_directory='/media/chang/HDD-1/chang/weissman/raw',
    min_umi_per_cell=1,
    min_avg_reads_per_umi=1.0,
    min_cluster_prop=0.005,
    min_intbc_thresh=0.05,
    inter_doublet_threshold=0.35,
    kinship_thresh=0.25,
    plot=True,
)

[2022-02-22 18:00:52,015]    INFO [call_lineages] Starting...
[2022-02-22 18:00:52,016]    INFO [call_lineages] 430189 UMIs (rows), with 15 attributes (columns)
[2022-02-22 18:00:52,182]    INFO [call_lineages] 59731 Cells
[2022-02-22 18:01:31,753]    INFO [call_lineages] Assigning initial lineage groups...
[2022-02-22 18:01:31,754]    INFO [call_lineages] Clustering with minimum cluster size 298...
[2022-02-22 18:01:48,933]    INFO [call_lineages] Refining lineage groups...
[2022-02-22 18:01:48,934]    INFO [call_lineages] Redefining lineage groups by removing low proportion intBCs...
[2022-02-22 18:02:09,005]    INFO [call_lineages] Reassigning cells to refined lineage groups by kinship...
[2022-02-22 18:02:09,175]    INFO [call_lineages] Annotating alignment table with refined lineage groups...
[2022-02-22 18:02:14,835]    INFO [call_lineages] Filtering out inter-lineage group doublets with proportion 0.35...
[2022-02-22 18:03:25,194]    INFO [call_lineages] Filtering out low propor

In [87]:
allele_table2

Unnamed: 0,cellBC,intBC,allele,r1,r2,r3,lineageGrp,UMI,readCount
0,AAACCCAAGAAATTCG,GTTAATGAGAAATT,CGAAA[114:2D]TGGCCGATAT[167:2I]GACCCAAGACCC[17...,CGAAA[114:2D]TGGCC,GATAT[167:2I]GACCCAAGACCC[170:4I]AACATGGTT,ATTCG[None]CGGAG,1,1,6
1,AAACCCAAGACCAGAC,TATCCAGCATTCTG,CCGAA[None]AAATGGATAT[None]CTCTGATTCG[None]CGGAG,CCGAA[None]AAATG,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,1
2,AAACCCAAGAGAGGTA,CTTTTAGCGGTGTG,CGTCG[89:26D]ATGGCGATAT[None]CTCTGATTCG[None]C...,CGTCG[89:26D]ATGGC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,10
3,AAACCCAAGCAAACAT,GTTAATGAGAAATT,CCTTT[103:9D]AAAATGATAT[167:1I]TCTCTGATTCG[Non...,CCTTT[103:9D]AAAAT,GATAT[167:1I]TCTCTG,ATTCG[None]CGGAG,1,1,1
4,AAACCCAAGCACTCTA,CATCCGGTCCTTCA,CGAAA[114:2D]TGGCCGATAT[167:2I]GACCTGGATGAC[16...,CGAAA[114:2D]TGGCC,GATAT[167:2I]GACCTGGATGAC[168:1D]CTGGT,ATTCG[221:2D]GAGGA,1,8,92
...,...,...,...,...,...,...,...,...,...
90117,TTTGTTGTCTTAGTTC,CATTTATCACCTGA,CCGAA[None]AAATGGATAT[None]CTCTGATTCG[None]CGGAG,CCGAA[None]AAATG,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,8
90118,TTTGTTGTCTTAGTTC,CGTTACAAGCTAAC,CCGAA[113:3D]TGGCCGATAT[None]CTCTGATTCG[None]C...,CCGAA[113:3D]TGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,2,26
90119,TTTGTTGTCTTAGTTC,GTTAATGAGAAATT,AAAAA[116:1I]ATGGCCGATAT[None]CTCTGATTCG[None]...,AAAAA[116:1I]ATGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,1,5
90120,TTTGTTGTCTTAGTTC,TTCACTTCTCACGC,CCGAA[113:54D]CTCTGCCGAA[113:54D]CTCTGATTCG[No...,CCGAA[113:54D]CTCTG,CCGAA[113:54D]CTCTG,ATTCG[None]CGGAG,1,3,13


In [88]:
allele_table2.to_csv("/media/chang/HDD-1/chang/weissman/raw_allele_table.csv")

In [89]:
umi_table2

Unnamed: 0,readName,AlignmentScore,CIGAR,QueryBegin,ReferenceBegin,Seq,UMI,allele,cellBC,intBC,r1,r2,r3,readCount,status,lineageGrp
0,ACAAGCTTCCTTAGTC_GCTAATAGGTGA_67,1269,115M1I51M9D1M1D103M15D4M5D7M1D3M5I5M,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,GCTAATAGGTGA,AAAAA[116:1I]ATGGCCGATAT[167:9D]ACCGTATATA[177...,ACAAGCTTCCTTAGTC,CATCCGGTCCTTCA,AAAAA[116:1I]ATGGCC,GATAT[167:9D]ACCGTATATA[177:1D]CCGTG,ATTCG[None]CGGAG,67,good,1.0
1,TCTCAGCTCGGCTGTG_ACCGGGGCTAGC_66,1009,113M12I40M19D48M2D25M,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,ACCGGGGCTAGC,CGAAA[114:12I]GTGTACGCCGAAAATGGGTTGG[154:19D]T...,TCTCAGCTCGGCTGTG,CATCCGGTCCTTCA,CGAAA[114:12I]GTGTACGCCGAAAATGG,GTTGG[154:19D]TTCAT,ATTCG[221:2D]GAGGA,66,good,1.0
2,CCGATCTTCATTGTTC_GTGATCCATTGG_56,1061,112M53D115M15D4M5D7M1D3M5I2M9I3M1D2M2D2M1D2M1D...,0,0,AATCCAGCTAGCTGTGCAGCCATCCGGTCCTTCAATTCAACTGCAG...,GTGATCCATTGG,CCGAA[113:53D]TCTCTCCGAA[113:53D]TCTCTATTCG[No...,CCGATCTTCATTGTTC,CATCCGGTCCTTCA,CCGAA[113:53D]TCTCT,CCGAA[113:53D]TCTCT,ATTCG[None]CGGAG,56,good,1.0
3,ACTGTCCCACACAGAG_TATACTTTTTGG_54,1273,106M13D161M15D4M5D7M1D3M5I5M,0,0,AATCCAGCTAGCTGTGCAGCCGTTACAAGCTAACATTCAACTGCAG...,TATACTTTTTGG,TGTAC[107:13D]CTGACGATAT[None]CTCTGATTCG[None]...,ACTGTCCCACACAGAG,CGTTACAAGCTAAC,TGTAC[107:13D]CTGAC,GATAT[None]CTCTG,ATTCG[None]CGGAG,54,good,1.0
4,TCAGCAAGTCAGGCAA_AGTCCGAATACA_51,739,112M110D58M15D4M5D7M1D3M2D1M7D2M2D8M1I7M2D6M4D...,0,0,AATCCAGCTAGCTGTGCAGCGTTCTGATCAGAGCATTCAACTGCAG...,AGTCCGAATACA,CCGAA[113:110D]GAGGACCGAA[113:110D]GAGGACCGAA[...,TCAGCAAGTCAGGCAA,GTTCTGATCAGAGC,CCGAA[113:110D]GAGGA,CCGAA[113:110D]GAGGA,CCGAA[113:110D]GAGGA,51,good,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430184,ATACCGAAGTATGGAT_GACAAGGTAATT_1,1148,106M21D153M15D4M5D7M1D3M2D1M7D2M2D8M1I8M2I6M,0,0,AATACAGATAGCAGAACAGCAAAATAAACAGAACATACAACTACAG...,GACAAGGTAATT,TGTAC[107:21D]TAAGCGATAT[None]CTCTGATTCG[None]...,ATACCGAAGTATGGAT,AAAATAAACAGAAC,TGTAC[107:21D]TAAGC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good,1.0
430185,TACCCACGTGTTAAAG_CTATATCTGACT_1,1180,115M2I166M2D2M1D7M6D3M4D3M5D3M,0,0,AATACAGCTAGCAAAACAGACGATAAAAACTAAAATTAAACTGCAG...,CTATATCTGACT,AAAAA[116:2I]AATAGCCGATAT[None]CTCTGATTCG[None...,TACCCACGTGTTAAAG,CGATAAAAACTAAA,AAAAA[116:2I]AATAGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good,1.0
430186,TACCCACGTGTTAAAG_TTGGGCCGGATT_1,1191,115M2I165M15D4M5D7M1D3M,0,0,AATACAGCTAGCAAAGCAGCCGATACAAGCAAACATACAAATGCAG...,TTGGGCCGGATT,AAAAA[116:2I]AATGGCCGATAT[None]CTCTGATTCG[None...,TACCCACGTGTTAAAG,CGATACAAGCAAAC,AAAAA[116:2I]AATGGCC,GATAT[None]CTCTG,ATTCG[None]CGGAG,1,good,1.0
430187,TACCCACGTGTTAAAG_TCGGCAATCCAT_1,1101,2M2D111M2I165M15D5M5D6M1D3M,2,0,AAAACAGATAGAAATAAAGCCGTTAAAAACTAAAATACAACAGAAG...,TCGGCAATCCAT,AAAAA[116:2I]AATGGCCAATAT[None]ATCTGATTCG[None...,TACCCACGTGTTAAAG,CGTTAAAAACTAAA,AAAAA[116:2I]AATGGCC,AATAT[None]ATCTG,ATTCG[None]CGGAG,1,good,1.0


In [101]:
df = pd.DataFrame(0, columns=pd.unique(umi_table2['cellBC']), index=pd.unique(umi_table2['allele']))

In [103]:
umi = set()
for row in umi_table2.itertuples():
    if row.UMI not in umi:
        df.at[row.allele, row.cellBC] += 1
        umi.add(row.UMI)

In [109]:
bc = [line.rstrip() for line in open("/media/chang/HDD-1/chang/weissman/cell_bc.txt")]

In [150]:
filtered_bc = df[set(bc).intersection(set(df.columns.to_list()))]

In [172]:
filtered_bc

Unnamed: 0,GTAGGAGTCATTTCCA,AGATGAAGTTGCGGAA,CTCCAACCAGAGTTCT,TCTCACGCATAAGCGG,TGTGCGGTCCGTGGGT,AGCCAGCAGCGAAACC,CCGATCTGTTAGCGGA,GTTCGCTGTCAACGCC,TCTGCCATCCGATTAG,GACACGCGTAGTGATA,...,TGTGATGAGTTGTAAG,CTAAGTGAGGGAGAAT,AAGACAATCCGAGAAG,ATCGGCGCAATCTCGA,GCTTCACTCGCAATTG,TAACACGAGGGCTAAC,AATGGCTTCGCAATGT,ATGCATGTCCAAAGGG,ATCGGATTCGTTCCTG,GACTATGGTCCACGCA
AAAAA[116:1I]ATGGCCGATAT[167:9D]ACCGTATATA[177:1D]CCGTGATTCG[None]CGGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CGAAA[114:12I]GTGTACGCCGAAAATGGGTTGG[154:19D]TTCATATTCG[221:2D]GAGGA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCGAA[113:53D]TCTCTCCGAA[113:53D]TCTCTATTCG[None]CGGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGTAC[107:13D]CTGACGATAT[None]CTCTGATTCG[None]CGGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCGAA[113:110D]GAGGACCGAA[113:110D]GAGGACCGAA[113:110D]GAGGA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCCTT[102:7D]CAAAAAAAAA[116:7D]AAAAAAAAAA[164:2I]AGTATTCAGTAT[167:1D]TCGGCGGCTT[175:1D]ATACAACAAT[214:3I]ACATATCAACATA[216:2I]TCATTCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GTACG[108:8D]TGGCCGATAT[167:5D]GTTCAATTAG[None]CGGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ATAAG[104:3I]AAATAGAAAAATA[106:4D]GAAAAAAAAA[116:5I]AAAAATAACCAACCT[122:1I]CGTAAACCTCG[123:1I]TAAAACCATTG[153:4D]ATTTCTTTCG[163:1I]CATACTGCATA[166:1D]CTCCTATAAT[178:5D]ACAGAATCAA[200:10D]CTATCACTAT[214:5D]CCCCAATCCC[222:6D]CAACCCCCAA[231:2D]CCACT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CGCCG[111:1I]CAAAACCAAAA[115:1I]CATGGCGATCT[None]CTCTGATTCG[None]CGGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
diff = (set(bc) ^ set(df.columns.to_list())) & set(df.columns.to_list())
len(diff)

41956

In [169]:
empty_droplets = df[diff].T
empty_profile = empty_droplets.sum()/empty_droplets.sum().sum() 

In [170]:
import scAR

In [171]:
scarObj = scAR.model(raw_count=filtered_bc.T,
                     empty_profile=empty_profile.values,
                     scRNAseq_tech='CROPseq')

scarObj.train(epochs=400)

..Running VAE using the following param set:
......scAR mode:  CROPseq
......count model:  binomial
......num_input_feature:  66250
......NN_layer1:  150
......NN_layer2:  100
......latent_space:  15
......dropout_prob:  0
......kld_weight:  1e-05
......lr:  0.001
......lr_step_size:  5
......lr_gamma:  0.97
  Training.....
100%|██████████| 400/400 [28:40<00:00,  4.30s/it]


In [173]:
scarObj.inference(feature_type='sgRNAs', cutoff=3)

  Inferring .....


In [176]:
denoised_count = pd.DataFrame(scarObj.native_counts, index=filtered_bc.columns, columns=filtered_bc.index)

In [178]:
denoised_count.max()

AAAAA[116:1I]ATGGCCGATAT[167:9D]ACCGTATATA[177:1D]CCGTGATTCG[None]CGGAG                                                                                                                                                                                     0.000000
CGAAA[114:12I]GTGTACGCCGAAAATGGGTTGG[154:19D]TTCATATTCG[221:2D]GAGGA                                                                                                                                                                                       76.490435
CCGAA[113:53D]TCTCTCCGAA[113:53D]TCTCTATTCG[None]CGGAG                                                                                                                                                                                                    219.344134
TGTAC[107:13D]CTGACGATAT[None]CTCTGATTCG[None]CGGAG                                                                                                                                                                      