From d4891960db538566f7c92f268cbd08c2abcccb54 Mon Sep 17 00:00:00 2001 From: roryk Date: Tue, 13 Jun 2017 15:49:01 -0400 Subject: [PATCH] Add barcode files internal to bcbio-nextgen for inDrop sequencing. This provides support for v2 and v3 of the Harvard inDrop protocol. --- MANIFEST.in | 2 + bcbio/data/umis/harvard-indrop-v2-cb1.txt | 384 +++++++++++++++++++++ bcbio/data/umis/harvard-indrop-v2-cb2.txt | 384 +++++++++++++++++++++ bcbio/data/umis/harvard-indrop-v2-transform.json | 4 + bcbio/data/umis/harvard-indrop-v3-cb1.txt | 384 +++++++++++++++++++++ bcbio/data/umis/harvard-indrop-v3-cb2.txt | 384 +++++++++++++++++++++ .../umis/harvard-indrop-v3-sample_barcodes.txt | 24 ++ bcbio/data/umis/harvard-indrop-v3-transform.json | 6 + bcbio/rnaseq/umi.py | 59 ++-- setup.py | 3 +- tests/data/Harvard-inDrop/bc1.txt | 4 - tests/data/Harvard-inDrop/bc2.txt | 4 - tests/data/Harvard-inDrop/harvard-indrop-v3-b1.txt | 384 +++++++++++++++++++++ tests/data/Harvard-inDrop/harvard-indrop-v3-b2.txt | 384 +++++++++++++++++++++ .../harvard-indrop-v3-sample_barcodes.txt | 24 ++ tests/data/automated/run_info-scrnaseq.yaml | 4 +- 16 files changed, 2395 insertions(+), 43 deletions(-) create mode 100644 bcbio/data/umis/harvard-indrop-v2-cb1.txt create mode 100644 bcbio/data/umis/harvard-indrop-v2-cb2.txt create mode 100644 bcbio/data/umis/harvard-indrop-v2-transform.json create mode 100644 bcbio/data/umis/harvard-indrop-v3-cb1.txt create mode 100644 bcbio/data/umis/harvard-indrop-v3-cb2.txt create mode 100644 bcbio/data/umis/harvard-indrop-v3-sample_barcodes.txt create mode 100644 bcbio/data/umis/harvard-indrop-v3-transform.json delete mode 100644 tests/data/Harvard-inDrop/bc1.txt delete mode 100644 tests/data/Harvard-inDrop/bc2.txt create mode 100644 tests/data/Harvard-inDrop/harvard-indrop-v3-b1.txt create mode 100644 tests/data/Harvard-inDrop/harvard-indrop-v3-b2.txt create mode 100644 tests/data/Harvard-inDrop/harvard-indrop-v3-sample_barcodes.txt diff --git a/MANIFEST.in b/MANIFEST.in 
index 4076b76ee..3a22d754b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,7 @@ include *.txt include *.md +include bcbio/data/umis/*.txt +include bcbio/data/umis/*.json include config/*.yaml include config/*.ini include tests/*.py diff --git a/bcbio/data/umis/harvard-indrop-v2-cb1.txt b/bcbio/data/umis/harvard-indrop-v2-cb1.txt new file mode 100644 index 000000000..edb81e38a --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v2-cb1.txt @@ -0,0 +1,384 @@ +GTTTGTTT +ACCGTGTTT +GATAGTGTTT +TGAGGCGGTTT +GATCGTTT +ATCACGTTT +GATGTAGTTT +TGACACAGTTT +CTTTCTTT +AGCCTCTTT +GACGGGCTTT +TGAATGACTTT +TGCTATTT +ACGGAATTT +GACATTTGTT +TGAGTTCTGTT +CCGCTGTT +AAAATCGTT +GATTGGCGTT +TGACTACCGTT +GTAACGTT +AACTGAGTT +GAAGGCAGTT +TGACTGTTCTT +ACCTTCTT +AATACTCTT +GAGAAGGCTT +TGAAGGAGCTT +TCATCCTT +AAGCGCCTT +GAGGTCCCTT +TGACAATACTT +TTGGACTT +ACCCGACTT +GATCTCACTT +TGAGACAACTT +TCCTTATT +AGATGTATT +GAGTCATATT +TGAGCCGGATT +CTTCGATT +AGAACGATT +GAACGCCATT +TGACATACATT +ATCTTTGT +ACTACTTGT +GAAAGATTGT +TGACTTGGTGT +TTATCTGT +ATGGCCTGT +GACGAGATGT +TGAGTCCATGT +GGGTTGGT +ACCCTTGGT +GATCTGTGGT +TGAAAACTGGT +GCATGGGT +AAATCGGGT +GATTGAGGGT +TGATCGACGGT +CTTCAGGT +AGGGAAGGT +GAGAATTCGT +TGAGTCGTCGT +TTAAGCGT +ATGCTCCGT +GAACTGCCGT +TGATAACCCGT +CCAACCGT +AGTTTACGT +GACAATTAGT +TGACGGGTAGT +GCTCTAGT +AGTATGAGT +GATTCCGAGT +TGACCAGCAGT +TGACCAGT +AAGCGAAGT +GATGGTTTCT +TGACACTTTCT +AAGCTTCT +ATTGATTCT +GATGAGGTCT +TGACCTCGTCT +GTCTCTCT +AGCACCTCT +GAGCGTTGCT +TGATACGTGCT +GGCATGCT +AAGATGGCT +GAACCACGCT +TGAGTGGAGCT +TCGAAGCT +ATGTGTCCT +GACGACTCCT +TGATATTGCCT +TTCGGCCT +AAAACGCCT +GACAGTCCCT +TGATTTACCCT +GCTTACCT +AATATACCT +GAGGGAACCT +TGACCATTACT +TAACTACT +ATTGTGACT +GACACGGACT +TGAGAAGCACT +GTTCAACT +ACCGCAACT +GATACAAACT +TGACCTGTTAT +TAGCTTAT +AGGGTGTAT +GAGAGAGTAT +TGAACATCTAT +TTGCATAT +AACCCATAT +GACGATTGAT +TGATCCCTGAT +GGTGGGAT +AATGCGGAT +GAACTAGGAT +TGAAGCGCGAT +GTTACGAT +AGCCAAGAT +GAGTTGTCAT +TGACAAGTCAT +ATATGCAT +ACTCCGCAT +GAGAGCCCAT +TGACAGACCAT +CGGCACAT 
+AAAGGTAAT +GACGAATAAT +TGACTCAGAAT +ACTTCAAT +AGGGCCAAT +GAATGGAAAT +TGACAACAAAT +AATGTTTG +ACTGCGTTG +GAATTCCTTG +TGAAACCCTTG +GTACCTTG +ACTAGATTG +GAGAGAATTG +TGAAGGTTGTG +TACTTGTG +AGGTTAGTG +GAATCAAGTG +TGACGAGTCTG +CCCATCTG +AGCAACCTG +GATTAAACTG +TGATCGTCATG +GCAGCATG +AAATGAATG +GACCCGAATG +TGATAGAAATG +AGAGGTGG +ACAACGTGG +GACTGTCTGG +TGATTCGCTGG +TCATATGG +AGTGGATGG +GAGACGATGG +TGAATGCATGG +CTTACGGG +AAGAACGGG +GACAAGAGGG +TGAAAACAGGG +TGCAAGGG +AAAAGTCGG +GAGATCTCGG +TGACGTATCGG +ATTTCCGG +AAGCTACGG +GATAAGACGG +TGAAGCGTAGG +TAAATAGG +ATCATGAGG +GATGTAAAGG +TGAGACAAAGG +GAGTTTCG +ATCGGTTCG +GACTTCTTCG +TGAAAATGTCG +TAGCCTCG +ATTGGATCG +GATGCCATCG +TGATTAGTCCG +TACAGCCG +AACTCACCG +GATCGGTACG +TGAATTCGACG +GTTGCACG +AATCCCACG +GATGTACACG +TGAAACACACG +AGGCAACG +AACGAAACG +GAGGCGTTAG +TGATCCCGTAG +TAGTCTAG +ACGTGCTAG +GACCTACTAG +TGATGTTTGAG +GATGTGAG +ATTTGGGAG +GATGGAGGAG +TGATCACCGAG +CTATAGAG +AACGCAGAG +GACCCTTCAG +TGAACGCTCAG +CATCGCAG +ATCTAGCAG +GATGTTCCAG +TGAATACCCAG +TGCGACAG +AGGTCACAG +GATTTAACAG +TGACACAACAG +GGAAACAG +AGGCCTAAG +GAACACTAAG +TGACGTAGAAG +GGATAAAG +AAGTGAAAG +GAGTCCAAAG +TGATGTCTTTC +CGTATTTC +AATATCTTC +GATGGGATTC +TGAGCGCATTC +TTTGTGTC +ACAGGTGTC +GACGCTAGTC +TGAGGTTTCTC +TTCCGCTC +ACACTCCTC +GATGACCCTC +TGAGTACACTC +TGCGTATC +ATCTGCATC +GATAACCATC +TGAGCCACATC +CTTTAATC +AAAGTAATC +GATCCCAATC +TGAGGGAAATC +CAGTTTGC +ACTGAGTGC +GAAGTGATGC +TGACTCGATGC +GCTTTGGC +AATGTTGGC +GATACCAGGC +TGACACAAGGC +ATCAGCGC +AGTTACCGC +GAGAATACGC +TGATTGCACGC +AACTTAGC +AACGGTAGC +GACCCATAGC +TGACTACGAGC +GGAGAAGC +ATTCGTTCC +GAGGACTTCC +TGATCCAGTCC +AGAAGTCC +AAAACCTCC +GACTTACTCC +TGAAACAATCC +ACCTTGCC +AGAAGTGCC +GAATTGGGCC +TGATTGTCGCC +TTATAGCC +AGCAAAGCC +GACATCTCCC +TGAGTAATCCC +TGATGCCC +AAATGACCC +GACTAGACCC +TGAGATTTACC +TGGCTACC +ATTAGGACC +GAGAAAGACC +TGATCGACACC +GTGTAACC +ACCCTAACC +GATCTCAACC +TGATTGTTTAC +CGGCTTAC +ACAGATTAC +GAAAGCGTAC +TGAGTCCGTAC +ACGTATAC +AGTCAATAC +GACTCTTGAC +TGAGGTCTGAC +AACCTGAC +ATAGTGGAC 
+GATGACGGAC +TGAGCAAGGAC +GATTAGAC +ATTCCAGAC +GAAGGAAGAC +TGAGAGTTCAC +TGCCTCAC +ATTTATCAC +GAATGGGCAC +TGACTTCGCAC +AGCACCAC +AGGTGACAC +GACCTGACAC +TGACTAGTAAC +AGCAGAAC +ACGGACAAC +GATCGGTTTA +TGAAGAAGTTA +GGCCCTTA +AATGGATTA +GACCACATTA +TGAGCAGGGTA +GAGCGGTA +ACTTAGGTA +GAGGGAGGTA +TGACTCGCGTA +CGAACGTA +AATTCAGTA +GATTGATCTA +TGATGTGGCTA +ATCCGCTA +AAAAGCCTA +GACGTACCTA +TGAGGCTACTA +AGAGACTA +ACGTGGATA +GAGACAGATA +TGATTCACATA +CGCTAATA +ACCATTTGA +GACGCCTTGA +TGAGAGGCTGA +TGGTATGA +AAGCTATGA +GATGAAATGA +TGACTTCTGGA +TCCAGGGA +AGTGTCGGA +GAACAGCGGA +TGAATATAGGA +GCAGTCGA +AAAACTCGA +GAGATTGCGA +TGAATGACCGA +ACCCACGA +AGGGAACGA +GAAGTTTAGA +TGAGGAATAGA +AAATCAGA +AGTCAAAGA +GACCTATTCA +TGAAGGATTCA +CGACGTCA +ACGCTCTCA +GATGTGCTCA +TGACTGGTGCA +TACCGGCA +ATAGTCGCA +GACGTCAGCA +TGAATGAAGCA +CCCAAGCA +AGCTTTCCA +GATCCGTCCA +TGAACTAGCCA +AATTCCCA +AAGACACCA +GAGTTAACCA +TGATGATAACA diff --git a/bcbio/data/umis/harvard-indrop-v2-cb2.txt b/bcbio/data/umis/harvard-indrop-v2-cb2.txt new file mode 100644 index 000000000..b2cf276eb --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v2-cb2.txt @@ -0,0 +1,384 @@ +GTTTGTTT +CCGTGTTT +TAGTGTTT +GGCGGTTT +GATCGTTT +TCACGTTT +TGTAGTTT +CACAGTTT +CTTTCTTT +GCCTCTTT +CGGGCTTT +ATGACTTT +TGCTATTT +CGGAATTT +CATTTGTT +GTTCTGTT +CCGCTGTT +AAATCGTT +TTGGCGTT +CTACCGTT +GTAACGTT +ACTGAGTT +AGGCAGTT +CTGTTCTT +ACCTTCTT +ATACTCTT +GAAGGCTT +AGGAGCTT +TCATCCTT +AGCGCCTT +GGTCCCTT +CAATACTT +TTGGACTT +CCCGACTT +TCTCACTT +GACAACTT +TCCTTATT +GATGTATT +GTCATATT +GCCGGATT +CTTCGATT +GAACGATT +ACGCCATT +CATACATT +ATCTTTGT +CTACTTGT +AAGATTGT +CTTGGTGT +TTATCTGT +TGGCCTGT +CGAGATGT +GTCCATGT +GGGTTGGT +CCCTTGGT +TCTGTGGT +AAACTGGT +GCATGGGT +AATCGGGT +TTGAGGGT +TCGACGGT +CTTCAGGT +GGGAAGGT +GAATTCGT +GTCGTCGT +TTAAGCGT +TGCTCCGT +ACTGCCGT +TAACCCGT +CCAACCGT +GTTTACGT +CAATTAGT +CGGGTAGT +GCTCTAGT +GTATGAGT +TTCCGAGT +CCAGCAGT +TGACCAGT +AGCGAAGT +TGGTTTCT +CACTTTCT +AAGCTTCT +TTGATTCT +TGAGGTCT +CCTCGTCT +GTCTCTCT +GCACCTCT +GCGTTGCT 
+TACGTGCT +GGCATGCT +AGATGGCT +ACCACGCT +GTGGAGCT +TCGAAGCT +TGTGTCCT +CGACTCCT +TATTGCCT +TTCGGCCT +AAACGCCT +CAGTCCCT +TTTACCCT +GCTTACCT +ATATACCT +GGGAACCT +CCATTACT +TAACTACT +TTGTGACT +CACGGACT +GAAGCACT +GTTCAACT +CCGCAACT +TACAAACT +CCTGTTAT +TAGCTTAT +GGGTGTAT +GAGAGTAT +ACATCTAT +TTGCATAT +ACCCATAT +CGATTGAT +TCCCTGAT +GGTGGGAT +ATGCGGAT +ACTAGGAT +AGCGCGAT +GTTACGAT +GCCAAGAT +GTTGTCAT +CAAGTCAT +ATATGCAT +CTCCGCAT +GAGCCCAT +CAGACCAT +CGGCACAT +AAGGTAAT +CGAATAAT +CTCAGAAT +ACTTCAAT +GGGCCAAT +ATGGAAAT +CAACAAAT +AATGTTTG +CTGCGTTG +ATTCCTTG +AACCCTTG +GTACCTTG +CTAGATTG +GAGAATTG +AGGTTGTG +TACTTGTG +GGTTAGTG +ATCAAGTG +CGAGTCTG +CCCATCTG +GCAACCTG +TTAAACTG +TCGTCATG +GCAGCATG +AATGAATG +CCCGAATG +TAGAAATG +AGAGGTGG +CAACGTGG +CTGTCTGG +TTCGCTGG +TCATATGG +GTGGATGG +GACGATGG +ATGCATGG +CTTACGGG +AGAACGGG +CAAGAGGG +AAACAGGG +TGCAAGGG +AAAGTCGG +GATCTCGG +CGTATCGG +ATTTCCGG +AGCTACGG +TAAGACGG +AGCGTAGG +TAAATAGG +TCATGAGG +TGTAAAGG +GACAAAGG +GAGTTTCG +TCGGTTCG +CTTCTTCG +AAATGTCG +TAGCCTCG +TTGGATCG +TGCCATCG +TTAGTCCG +TACAGCCG +ACTCACCG +TCGGTACG +ATTCGACG +GTTGCACG +ATCCCACG +TGTACACG +AACACACG +AGGCAACG +ACGAAACG +GGCGTTAG +TCCCGTAG +TAGTCTAG +CGTGCTAG +CCTACTAG +TGTTTGAG +GATGTGAG +TTTGGGAG +TGGAGGAG +TCACCGAG +CTATAGAG +ACGCAGAG +CCCTTCAG +ACGCTCAG +CATCGCAG +TCTAGCAG +TGTTCCAG +ATACCCAG +TGCGACAG +GGTCACAG +TTTAACAG +CACAACAG +GGAAACAG +GGCCTAAG +ACACTAAG +CGTAGAAG +GGATAAAG +AGTGAAAG +GTCCAAAG +TGTCTTTC +CGTATTTC +ATATCTTC +TGGGATTC +GCGCATTC +TTTGTGTC +CAGGTGTC +CGCTAGTC +GGTTTCTC +TTCCGCTC +CACTCCTC +TGACCCTC +GTACACTC +TGCGTATC +TCTGCATC +TAACCATC +GCCACATC +CTTTAATC +AAGTAATC +TCCCAATC +GGGAAATC +CAGTTTGC +CTGAGTGC +AGTGATGC +CTCGATGC +GCTTTGGC +ATGTTGGC +TACCAGGC +CACAAGGC +ATCAGCGC +GTTACCGC +GAATACGC +TTGCACGC +AACTTAGC +ACGGTAGC +CCCATAGC +CTACGAGC +GGAGAAGC +TTCGTTCC +GGACTTCC +TCCAGTCC +AGAAGTCC +AAACCTCC +CTTACTCC +AACAATCC +ACCTTGCC +GAAGTGCC +ATTGGGCC +TTGTCGCC +TTATAGCC +GCAAAGCC +CATCTCCC +GTAATCCC +TGATGCCC +AATGACCC +CTAGACCC 
+GATTTACC +TGGCTACC +TTAGGACC +GAAAGACC +TCGACACC +GTGTAACC +CCCTAACC +TCTCAACC +TTGTTTAC +CGGCTTAC +CAGATTAC +AAGCGTAC +GTCCGTAC +ACGTATAC +GTCAATAC +CTCTTGAC +GGTCTGAC +AACCTGAC +TAGTGGAC +TGACGGAC +GCAAGGAC +GATTAGAC +TTCCAGAC +AGGAAGAC +GAGTTCAC +TGCCTCAC +TTTATCAC +ATGGGCAC +CTTCGCAC +AGCACCAC +GGTGACAC +CCTGACAC +CTAGTAAC +AGCAGAAC +CGGACAAC +TCGGTTTA +AGAAGTTA +GGCCCTTA +ATGGATTA +CCACATTA +GCAGGGTA +GAGCGGTA +CTTAGGTA +GGGAGGTA +CTCGCGTA +CGAACGTA +ATTCAGTA +TTGATCTA +TGTGGCTA +ATCCGCTA +AAAGCCTA +CGTACCTA +GGCTACTA +AGAGACTA +CGTGGATA +GACAGATA +TTCACATA +CGCTAATA +CCATTTGA +CGCCTTGA +GAGGCTGA +TGGTATGA +AGCTATGA +TGAAATGA +CTTCTGGA +TCCAGGGA +GTGTCGGA +ACAGCGGA +ATATAGGA +GCAGTCGA +AAACTCGA +GATTGCGA +ATGACCGA +ACCCACGA +GGGAACGA +AGTTTAGA +GGAATAGA +AAATCAGA +GTCAAAGA +CCTATTCA +AGGATTCA +CGACGTCA +CGCTCTCA +TGTGCTCA +CTGGTGCA +TACCGGCA +TAGTCGCA +CGTCAGCA +ATGAAGCA +CCCAAGCA +GCTTTCCA +TCCGTCCA +ACTAGCCA +AATTCCCA +AGACACCA +GTTAACCA +TGATAACA diff --git a/bcbio/data/umis/harvard-indrop-v2-transform.json b/bcbio/data/umis/harvard-indrop-v2-transform.json new file mode 100644 index 000000000..e78078560 --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v2-transform.json @@ -0,0 +1,4 @@ +{ + "read1": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", + "read2": "(?P^@.*)\\n(?P\\w{8,11})(GAGTGATTGCTTGTGACGCCTT){s<=3}(?P\\w{8})(?P\\w{6})(.*)\\n+(.*)\\n(.*)\\n" +} diff --git a/bcbio/data/umis/harvard-indrop-v3-cb1.txt b/bcbio/data/umis/harvard-indrop-v3-cb1.txt new file mode 100644 index 000000000..f9d3b141d --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v3-cb1.txt @@ -0,0 +1,384 @@ +AAACAAAC +AAACACGG +AAACACTA +AAACCGCC +AAACGATC +AAACGTGA +AAACTACA +AAACTGTG +AAAGAAAG +AAAGAGGC +AAAGCCCG +AAAGTCAT +AAATAGCA +AAATTCCG +AACAAATG +AACAGAAC +AACAGCGG +AACGATTT +AACGCCAA +AACGGTAG +AACGTTAC +AACTCAGT +AACTGCCT +AAGAACAG +AAGAAGGT +AAGAGTAT +AAGCCTTC +AAGCTCCT +AAGGATGA +AAGGCGCT +AAGGGACC +AAGTATTG +AAGTCCAA +AAGTCGGG +AAGTGAGA +AAGTTGTC +AATAAGGA +AATACATC +AATATGAC 
+AATCCGGC +AATCGAAG +AATCGTTC +AATGGCGT +AATGTATG +ACAAAGAT +ACAAGTAG +ACAATCTT +ACACCAAG +ACAGATAA +ACAGGCCA +ACATCTCG +ACATGGAC +ACCAACCC +ACCAAGGG +ACCACAGA +ACCAGTTT +ACCCATGC +ACCCGATT +ACCCTCAA +ACCGTCGA +ACCTGAAG +ACCTTCCC +ACGAATTC +ACGACGAC +ACGCTTAA +ACGGAGCA +ACGGCAGT +ACGGGTTA +ACGGTTGG +ACGTAAAC +ACTAATTG +ACTACCCG +ACTAGAGC +ACTCATAC +ACTCGGAA +ACTGCTGG +ACTGGTCA +ACTTCGCT +AGAAACCA +AGAAAGTG +AGAAGCTT +AGAATCAA +AGACCTCA +AGACGAGG +AGAGAGAC +AGAGGTGC +AGCAACGC +AGCACGTA +AGCATGCC +AGCCATCT +AGCGTGGT +AGCTCCAC +AGCTTCGA +AGGACACA +AGGAGTCG +AGGCAATA +AGGCCGAA +AGGCGTTT +AGGGACTG +AGGGTAAA +AGGTAAGC +AGGTATAT +AGGTTCCC +AGTAATGG +AGTAGTTA +AGTCACAA +AGTCCGTG +AGTGCTTC +AGTTGAAC +AGTTGCGG +AGTTTGTA +ATAACAGG +ATAAGCTA +ATACACCC +ATACTCTC +ATAGATGT +ATATGCAA +ATATGGGT +ATCAATCG +ATCAGGGA +ATCCCACC +ATCCGCAT +ATCCTAGT +ATCGCGCT +ATCGTAAC +ATCTTGGC +ATGACAAC +ATGACTTG +ATGCATAT +ATGCGGAG +ATGGGCTC +ATGGTCTG +ATGTGCCG +ATTACCTT +ATTATTCG +ATTCTGAG +ATTGAAGT +ATTGGCCC +ATTTCCAT +ATTTGTTG +CAAACATT +CAACGCAG +CAAGGAAT +CAAGGGTT +CAAGGTAC +CAATCTAG +CAATTCTC +CACAACCT +CACAAGTA +CACTAACC +CACTTGAT +CAGACTCG +CAGATGGG +CAGGTTGC +CAGTTTAA +CATGACGA +CATGCTGC +CATTCATT +CATTCGGG +CATTTCTA +CCACCTCT +CCACGTTG +CCAGACAG +CCAGCGAA +CCATATGA +CCATCCAC +CCATCGTC +CCATGCAT +CCCGTAAG +CCCGTTCT +CCCTCTTG +CCCTGTTT +CCCTTGCA +CCGACTTT +CCGAGATC +CCGATACG +CCGGAAAT +CCGTAGCT +CCGTCTTA +CCTACGCT +CCTATTTA +CCTCATGA +CCTTTACA +CCTTTGTC +CGAAACTC +CGAACCGA +CGAAGAAG +CGACATTT +CGAGGCTA +CGATCCAA +CGATGGCA +CGGACTAA +CGGCTGTA +CGGTGAGT +CGTACCGA +CGTCGAAT +CGTGCAAC +CGTGGGAT +CGTGTACA +CGTGTGTT +CGTTGCCT +CGTTTCGT +CTAACGCC +CTACGGGA +CTAGACTA +CTAGCACG +CTAGTAGG +CTCAAACA +CTCACATC +CTCCCAAA +CTCCTCCA +CTCGGTGA +CTCTATAG +CTCTGCGT +CTGAAGGG +CTGAGCGT +CTGCGATG +CTGCTAGA +CTGGAACA +CTGGGTAT +CTGTCGCA +CTGTGACC +CTGTTAAA +CTGTTGTG +CTGTTTCC +CTTAGGCC +CTTAGTGT +CTTCTACG +CTTTATCC +CTTTCACT +CTTTGGAC +GAAAGACA +GAAATACG +GAAGATAT +GAATCCCA +GAATGCGC +GACACAAA +GACACCTG +GACTAGCG 
+GAGAAACC +GAGCGGAA +GAGGAGTG +GAGGGTCA +GAGTGTAC +GATACGCA +GATGCAGA +GATGGTTA +GATGTGGC +GATTAAAG +GATTACTT +GATTGGGA +GATTTCCC +GCAAACTG +GCACTCAG +GCATCACT +GCATCGAG +GCCAAAGC +GCCAACAT +GCCTGGTA +GCCTTGTG +GCGCTGAT +GCGGTAAC +GCGTATTC +GCGTGCAA +GCTAAGTT +GCTACCGT +GCTATGGG +GCTCGTAG +GCTTCTCC +GGAACGAA +GGAAGTCC +GGACTGGA +GGACTTCT +GGAGGTTT +GGAGTAAG +GGATTGTT +GGCAAGGT +GGCACTTC +GGCCCAAT +GGCGACAA +GGCTATAA +GGCTTTGC +GGGAGATG +GGGATTAC +GGGCATCA +GGGTCATT +GGGTCTAG +GGTAAATC +GGTAGCCA +GGTCCTAA +GGTCTTTC +GGTGTCGA +GGTTACAC +GGTTAGGG +GGTTGAGA +GTAAACAA +GTAAGCCG +GTAATCTG +GTACGCTT +GTACGGAC +GTATACGT +GTATTGAC +GTCAAGAG +GTCAGACC +GTCAGGTT +GTCCACTA +GTCCGTCA +GTCCTTGC +GTCTAATC +GTCTGGAA +GTCTTCCT +GTGAACTC +GTGAGGCA +GTGATAAA +GTGCCCAT +GTGCGAAG +GTGGTGCT +GTGTCACC +GTGTCAGG +GTTACTAG +GTTCTGCT +GTTGTCCG +TAAACCGA +TAACTTCT +TAAGGGCC +TAATCCAT +TAATGTGG +TACCCTGC +TACCGCTC +TACCTAAG +TACCTCCC +TACGCGAG +TACGTTCG +TACTGAAT +TAGATCAA +TAGCCACA +TAGCGGAT +TAGGCTTT +TAGGTACG +TAGTAGCC +TAGTCTCT +TATCCACG +TATCTGTC +TATGTGAA +TATTAGCG +TCAAATGG +TCAAGGCG +TCAGCCTC +TCATACCA +TCATAGCT +TCATTTCA +TCCAGAAG +TCCCTGGA +TCCGACAC +TCCGCTGT +TCCTATAT +TCGACTGC +TCGAGTTT +TCGCAATC +TCGGTCAT +TCGTGGGT +TCGTTCCC +TCTAAACT +TCTATTCC +TCTGATTT +TCTTTGAC +TGAATAGG +TGAATCCT +TGACGTCG +TGAGAGCG +TGAGCACA +TGCACCAG +TGCCGGTA +TGCGACTA +TGCTGACG +TGCTTCAT +TGCTTGGG +TGGAAAGC +TGGACGGA +TGGCTAGT +TGGGAATT +TGGTGTCT +TGGTTAAC +TGTTATCA diff --git a/bcbio/data/umis/harvard-indrop-v3-cb2.txt b/bcbio/data/umis/harvard-indrop-v3-cb2.txt new file mode 100644 index 000000000..b2cf276eb --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v3-cb2.txt @@ -0,0 +1,384 @@ +GTTTGTTT +CCGTGTTT +TAGTGTTT +GGCGGTTT +GATCGTTT +TCACGTTT +TGTAGTTT +CACAGTTT +CTTTCTTT +GCCTCTTT +CGGGCTTT +ATGACTTT +TGCTATTT +CGGAATTT +CATTTGTT +GTTCTGTT +CCGCTGTT +AAATCGTT +TTGGCGTT +CTACCGTT +GTAACGTT +ACTGAGTT +AGGCAGTT +CTGTTCTT +ACCTTCTT +ATACTCTT +GAAGGCTT +AGGAGCTT +TCATCCTT +AGCGCCTT +GGTCCCTT +CAATACTT 
+TTGGACTT +CCCGACTT +TCTCACTT +GACAACTT +TCCTTATT +GATGTATT +GTCATATT +GCCGGATT +CTTCGATT +GAACGATT +ACGCCATT +CATACATT +ATCTTTGT +CTACTTGT +AAGATTGT +CTTGGTGT +TTATCTGT +TGGCCTGT +CGAGATGT +GTCCATGT +GGGTTGGT +CCCTTGGT +TCTGTGGT +AAACTGGT +GCATGGGT +AATCGGGT +TTGAGGGT +TCGACGGT +CTTCAGGT +GGGAAGGT +GAATTCGT +GTCGTCGT +TTAAGCGT +TGCTCCGT +ACTGCCGT +TAACCCGT +CCAACCGT +GTTTACGT +CAATTAGT +CGGGTAGT +GCTCTAGT +GTATGAGT +TTCCGAGT +CCAGCAGT +TGACCAGT +AGCGAAGT +TGGTTTCT +CACTTTCT +AAGCTTCT +TTGATTCT +TGAGGTCT +CCTCGTCT +GTCTCTCT +GCACCTCT +GCGTTGCT +TACGTGCT +GGCATGCT +AGATGGCT +ACCACGCT +GTGGAGCT +TCGAAGCT +TGTGTCCT +CGACTCCT +TATTGCCT +TTCGGCCT +AAACGCCT +CAGTCCCT +TTTACCCT +GCTTACCT +ATATACCT +GGGAACCT +CCATTACT +TAACTACT +TTGTGACT +CACGGACT +GAAGCACT +GTTCAACT +CCGCAACT +TACAAACT +CCTGTTAT +TAGCTTAT +GGGTGTAT +GAGAGTAT +ACATCTAT +TTGCATAT +ACCCATAT +CGATTGAT +TCCCTGAT +GGTGGGAT +ATGCGGAT +ACTAGGAT +AGCGCGAT +GTTACGAT +GCCAAGAT +GTTGTCAT +CAAGTCAT +ATATGCAT +CTCCGCAT +GAGCCCAT +CAGACCAT +CGGCACAT +AAGGTAAT +CGAATAAT +CTCAGAAT +ACTTCAAT +GGGCCAAT +ATGGAAAT +CAACAAAT +AATGTTTG +CTGCGTTG +ATTCCTTG +AACCCTTG +GTACCTTG +CTAGATTG +GAGAATTG +AGGTTGTG +TACTTGTG +GGTTAGTG +ATCAAGTG +CGAGTCTG +CCCATCTG +GCAACCTG +TTAAACTG +TCGTCATG +GCAGCATG +AATGAATG +CCCGAATG +TAGAAATG +AGAGGTGG +CAACGTGG +CTGTCTGG +TTCGCTGG +TCATATGG +GTGGATGG +GACGATGG +ATGCATGG +CTTACGGG +AGAACGGG +CAAGAGGG +AAACAGGG +TGCAAGGG +AAAGTCGG +GATCTCGG +CGTATCGG +ATTTCCGG +AGCTACGG +TAAGACGG +AGCGTAGG +TAAATAGG +TCATGAGG +TGTAAAGG +GACAAAGG +GAGTTTCG +TCGGTTCG +CTTCTTCG +AAATGTCG +TAGCCTCG +TTGGATCG +TGCCATCG +TTAGTCCG +TACAGCCG +ACTCACCG +TCGGTACG +ATTCGACG +GTTGCACG +ATCCCACG +TGTACACG +AACACACG +AGGCAACG +ACGAAACG +GGCGTTAG +TCCCGTAG +TAGTCTAG +CGTGCTAG +CCTACTAG +TGTTTGAG +GATGTGAG +TTTGGGAG +TGGAGGAG +TCACCGAG +CTATAGAG +ACGCAGAG +CCCTTCAG +ACGCTCAG +CATCGCAG +TCTAGCAG +TGTTCCAG +ATACCCAG +TGCGACAG +GGTCACAG +TTTAACAG +CACAACAG +GGAAACAG +GGCCTAAG +ACACTAAG +CGTAGAAG +GGATAAAG +AGTGAAAG +GTCCAAAG +TGTCTTTC 
+CGTATTTC +ATATCTTC +TGGGATTC +GCGCATTC +TTTGTGTC +CAGGTGTC +CGCTAGTC +GGTTTCTC +TTCCGCTC +CACTCCTC +TGACCCTC +GTACACTC +TGCGTATC +TCTGCATC +TAACCATC +GCCACATC +CTTTAATC +AAGTAATC +TCCCAATC +GGGAAATC +CAGTTTGC +CTGAGTGC +AGTGATGC +CTCGATGC +GCTTTGGC +ATGTTGGC +TACCAGGC +CACAAGGC +ATCAGCGC +GTTACCGC +GAATACGC +TTGCACGC +AACTTAGC +ACGGTAGC +CCCATAGC +CTACGAGC +GGAGAAGC +TTCGTTCC +GGACTTCC +TCCAGTCC +AGAAGTCC +AAACCTCC +CTTACTCC +AACAATCC +ACCTTGCC +GAAGTGCC +ATTGGGCC +TTGTCGCC +TTATAGCC +GCAAAGCC +CATCTCCC +GTAATCCC +TGATGCCC +AATGACCC +CTAGACCC +GATTTACC +TGGCTACC +TTAGGACC +GAAAGACC +TCGACACC +GTGTAACC +CCCTAACC +TCTCAACC +TTGTTTAC +CGGCTTAC +CAGATTAC +AAGCGTAC +GTCCGTAC +ACGTATAC +GTCAATAC +CTCTTGAC +GGTCTGAC +AACCTGAC +TAGTGGAC +TGACGGAC +GCAAGGAC +GATTAGAC +TTCCAGAC +AGGAAGAC +GAGTTCAC +TGCCTCAC +TTTATCAC +ATGGGCAC +CTTCGCAC +AGCACCAC +GGTGACAC +CCTGACAC +CTAGTAAC +AGCAGAAC +CGGACAAC +TCGGTTTA +AGAAGTTA +GGCCCTTA +ATGGATTA +CCACATTA +GCAGGGTA +GAGCGGTA +CTTAGGTA +GGGAGGTA +CTCGCGTA +CGAACGTA +ATTCAGTA +TTGATCTA +TGTGGCTA +ATCCGCTA +AAAGCCTA +CGTACCTA +GGCTACTA +AGAGACTA +CGTGGATA +GACAGATA +TTCACATA +CGCTAATA +CCATTTGA +CGCCTTGA +GAGGCTGA +TGGTATGA +AGCTATGA +TGAAATGA +CTTCTGGA +TCCAGGGA +GTGTCGGA +ACAGCGGA +ATATAGGA +GCAGTCGA +AAACTCGA +GATTGCGA +ATGACCGA +ACCCACGA +GGGAACGA +AGTTTAGA +GGAATAGA +AAATCAGA +GTCAAAGA +CCTATTCA +AGGATTCA +CGACGTCA +CGCTCTCA +TGTGCTCA +CTGGTGCA +TACCGGCA +TAGTCGCA +CGTCAGCA +ATGAAGCA +CCCAAGCA +GCTTTCCA +TCCGTCCA +ACTAGCCA +AATTCCCA +AGACACCA +GTTAACCA +TGATAACA diff --git a/bcbio/data/umis/harvard-indrop-v3-sample_barcodes.txt b/bcbio/data/umis/harvard-indrop-v3-sample_barcodes.txt new file mode 100644 index 000000000..cab3e3945 --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v3-sample_barcodes.txt @@ -0,0 +1,24 @@ +ATAGAGAG +AGAGGATA +CTCCTTAC +TATGCAGT +TACTCCTT +AGGCTTAG +ATTAGACG +CGGAGAGA +CTAGTCGA +AGCTAGAA +ACTCTAGG +TCTTACGC +CTTAATAG +ATAGCCTT +TAAGGCTC +TCGCATAA +TTACCTCC +CAGTTATG +CCTTTACT +GACGATTA +GAGACGGA 
+AGAAAGCT +ACGCTCTT +CGCATTCT diff --git a/bcbio/data/umis/harvard-indrop-v3-transform.json b/bcbio/data/umis/harvard-indrop-v3-transform.json new file mode 100644 index 000000000..4d9eb112a --- /dev/null +++ b/bcbio/data/umis/harvard-indrop-v3-transform.json @@ -0,0 +1,6 @@ +{ + "read1": "(?P<name>[^\\s]+).*\\n(?P<seq>.*)\\n\\+(.*)\\n(?P<qual>.*)\\n", + "read2": "(.*)\\n(?P<CB1>.*)\\n(.*)\\n(.*)\\n", + "read3": "(.*)\\n(?P<SB>.*)\\n(.*)\\n(.*)\\n", + "read4": "(.*)\\n(?P<CB2>.{8})(?P<MB>.{6})\\n(.*)\\n(.*)\\n" +} diff --git a/bcbio/rnaseq/umi.py b/bcbio/rnaseq/umi.py index 10d0ce7e2..d7ae1bbe6 100644 --- a/bcbio/rnaseq/umi.py +++ b/bcbio/rnaseq/umi.py @@ -73,33 +73,29 @@ def cat(self, newsm, byrow=False): else: self.colnames = self.colnames + newsm.colnames -transforms = {"harvard-indrop": - {"read1": r"""(?P<name>^@.*)\n(?P<CB1>\w{8,11})(GAGTGATTGCTTGTGACGCCTT){s<=3}(?P<CB2>\w{8})(?P<MB>\w{6})(.*)\n+(.*)\n(.*)\n""", - "read2": r"""(@.*)\n(?P<seq>.*)\n\+(.*)\n(?P<qual>.*)\n"""}, - "harvard-indrop-v2": - {"read2": r"""(?P<name>^@.*)\n(?P<CB1>\w{8,11})(GAGTGATTGCTTGTGACGCCTT){s<=3}(?P<CB2>\w{8})(?P<MB>\w{6})(.*)\n+(.*)\n(.*)\n""", - "read1": r"""(@.*)\n(?P<seq>.*)\n\+(.*)\n(?P<qual>.*)\n"""}, - "CEL-seq": - {"read1": r"""(?P<name>@.*) .*\n(?P<CB>.{8})(?P<MB>.{4})(.*)\n\+(.*)\n(.*)\n""", - "read2": r"""(@.*)\n(?P<seq>.*)\n\+(.*)\n(?P<qual>.*)\n"""}, - "harvard-indrop-v3": - {"read1": r"""(?P<name>[^\s]+).*\n(?P<seq>.*)\n\+(.*)\n(?P<qual>.*)\n""", - "read2": r"""(.*)\n(?P<CB1>.*)\n(.*)\n(.*)\n""", - "read3": r"""(.*)\n(?P<SB>.*)\n(.*)\n(.*)\n""", - "read4": r"""(.*)\n(?P<CB2>.{8})(?P<MB>.{6})\n(.*)\n(.*)\n"""}} +TRANSFORM_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "data", + "umis") +TRANSFORM_FILES = glob.glob(os.path.join(TRANSFORM_DIR, "*-transform.json")) +SUPPORTED_TRANSFORMS = set([os.path.basename(x).replace("-transform.json", "") + for x in TRANSFORM_FILES]) -def write_transform_file(transform_data, out_file): - """ - write out the regex to pull out the UMI and cellular barcodes from - the reads to a JSON file, for use with umis.py - """ - if file_exists(out_file): - return out_file +def 
is_supported_transform(data): + return dd.get_umi_type(data) in SUPPORTED_TRANSFORMS - with file_transaction(out_file) as tx_out_file: - with open(tx_out_file, "w") as out_handle: - json.dump(transform_data, out_handle) - return out_file +def get_transform_file(stem): + transform_file = os.path.join(TRANSFORM_DIR, stem + "-transform.json") + return transform_file + +def get_cellular_barcodes(data): + if dd.get_cellular_barcodes(data): + return dd.get_cellular_barcodes(data) + if is_supported_transform(data): + stem = dd.get_umi_type(data) + bc1 = os.path.join(TRANSFORM_DIR, stem + "-cb1.txt") + bc2 = os.path.join(TRANSFORM_DIR, stem + "-cb2.txt") + return filter(file_exists, [bc1, bc2]) + else: + return [] def umi_transform(data): """ @@ -115,21 +111,20 @@ def umi_transform(data): if file_exists(transform): transform_file = transform else: - transform_data = transforms.get(transform, "") - if not transform_data: + transform_file = get_transform_file(transform) + if not transform_file: logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " - "or the transform is not supported by bcbio.") + "or the transform is not supported by bcbio." 
% transform_file) sys.exit(1) - transform_file = os.path.join(umi_dir, transform + ".json") - transform_file = write_transform_file(transform_data, transform_file) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] - if len(dd.get_cellular_barcodes(data)) == 2: + cellular_barcodes = get_cellular_barcodes(data) + if len(cellular_barcodes) == 2: split_option = "--separate_cb" else: split_option = "" @@ -157,7 +152,7 @@ def filter_barcodes(data): fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") correction = dd.get_cellular_barcode_correction(data) - bc = dd.get_cellular_barcodes(data) + bc = get_cellular_barcodes(data) if not bc: return [[data]] bc1 = None diff --git a/setup.py b/setup.py index f30e198f5..83d4ad15f 100755 --- a/setup.py +++ b/setup.py @@ -37,4 +37,5 @@ def write_version_py(): packages=find_packages(exclude=["tests"]), zip_safe=zip_safe, scripts=scripts, - install_requires=install_requires) + install_requires=install_requires, + include_package_data=True) diff --git a/tests/data/Harvard-inDrop/bc1.txt b/tests/data/Harvard-inDrop/bc1.txt deleted file mode 100644 index 2660d4138..000000000 --- a/tests/data/Harvard-inDrop/bc1.txt +++ /dev/null @@ -1,4 +0,0 @@ -AATCGTTC -ATATGCAC -AAGGCCAA -ACGGCAGT diff --git a/tests/data/Harvard-inDrop/bc2.txt b/tests/data/Harvard-inDrop/bc2.txt deleted file mode 100644 index 30a56081f..000000000 --- a/tests/data/Harvard-inDrop/bc2.txt +++ /dev/null @@ -1,4 +0,0 @@ -CGTTCTTC -CTTACAAG -AGAGGAGG -CTGTAACC diff --git a/tests/data/Harvard-inDrop/harvard-indrop-v3-b1.txt b/tests/data/Harvard-inDrop/harvard-indrop-v3-b1.txt new file mode 100644 index 000000000..f9d3b141d --- /dev/null +++ b/tests/data/Harvard-inDrop/harvard-indrop-v3-b1.txt @@ -0,0 +1,384 @@ +AAACAAAC +AAACACGG +AAACACTA +AAACCGCC +AAACGATC +AAACGTGA +AAACTACA +AAACTGTG +AAAGAAAG +AAAGAGGC 
+AAAGCCCG +AAAGTCAT +AAATAGCA +AAATTCCG +AACAAATG +AACAGAAC +AACAGCGG +AACGATTT +AACGCCAA +AACGGTAG +AACGTTAC +AACTCAGT +AACTGCCT +AAGAACAG +AAGAAGGT +AAGAGTAT +AAGCCTTC +AAGCTCCT +AAGGATGA +AAGGCGCT +AAGGGACC +AAGTATTG +AAGTCCAA +AAGTCGGG +AAGTGAGA +AAGTTGTC +AATAAGGA +AATACATC +AATATGAC +AATCCGGC +AATCGAAG +AATCGTTC +AATGGCGT +AATGTATG +ACAAAGAT +ACAAGTAG +ACAATCTT +ACACCAAG +ACAGATAA +ACAGGCCA +ACATCTCG +ACATGGAC +ACCAACCC +ACCAAGGG +ACCACAGA +ACCAGTTT +ACCCATGC +ACCCGATT +ACCCTCAA +ACCGTCGA +ACCTGAAG +ACCTTCCC +ACGAATTC +ACGACGAC +ACGCTTAA +ACGGAGCA +ACGGCAGT +ACGGGTTA +ACGGTTGG +ACGTAAAC +ACTAATTG +ACTACCCG +ACTAGAGC +ACTCATAC +ACTCGGAA +ACTGCTGG +ACTGGTCA +ACTTCGCT +AGAAACCA +AGAAAGTG +AGAAGCTT +AGAATCAA +AGACCTCA +AGACGAGG +AGAGAGAC +AGAGGTGC +AGCAACGC +AGCACGTA +AGCATGCC +AGCCATCT +AGCGTGGT +AGCTCCAC +AGCTTCGA +AGGACACA +AGGAGTCG +AGGCAATA +AGGCCGAA +AGGCGTTT +AGGGACTG +AGGGTAAA +AGGTAAGC +AGGTATAT +AGGTTCCC +AGTAATGG +AGTAGTTA +AGTCACAA +AGTCCGTG +AGTGCTTC +AGTTGAAC +AGTTGCGG +AGTTTGTA +ATAACAGG +ATAAGCTA +ATACACCC +ATACTCTC +ATAGATGT +ATATGCAA +ATATGGGT +ATCAATCG +ATCAGGGA +ATCCCACC +ATCCGCAT +ATCCTAGT +ATCGCGCT +ATCGTAAC +ATCTTGGC +ATGACAAC +ATGACTTG +ATGCATAT +ATGCGGAG +ATGGGCTC +ATGGTCTG +ATGTGCCG +ATTACCTT +ATTATTCG +ATTCTGAG +ATTGAAGT +ATTGGCCC +ATTTCCAT +ATTTGTTG +CAAACATT +CAACGCAG +CAAGGAAT +CAAGGGTT +CAAGGTAC +CAATCTAG +CAATTCTC +CACAACCT +CACAAGTA +CACTAACC +CACTTGAT +CAGACTCG +CAGATGGG +CAGGTTGC +CAGTTTAA +CATGACGA +CATGCTGC +CATTCATT +CATTCGGG +CATTTCTA +CCACCTCT +CCACGTTG +CCAGACAG +CCAGCGAA +CCATATGA +CCATCCAC +CCATCGTC +CCATGCAT +CCCGTAAG +CCCGTTCT +CCCTCTTG +CCCTGTTT +CCCTTGCA +CCGACTTT +CCGAGATC +CCGATACG +CCGGAAAT +CCGTAGCT +CCGTCTTA +CCTACGCT +CCTATTTA +CCTCATGA +CCTTTACA +CCTTTGTC +CGAAACTC +CGAACCGA +CGAAGAAG +CGACATTT +CGAGGCTA +CGATCCAA +CGATGGCA +CGGACTAA +CGGCTGTA +CGGTGAGT +CGTACCGA +CGTCGAAT +CGTGCAAC +CGTGGGAT +CGTGTACA +CGTGTGTT +CGTTGCCT +CGTTTCGT +CTAACGCC +CTACGGGA +CTAGACTA +CTAGCACG +CTAGTAGG +CTCAAACA +CTCACATC +CTCCCAAA 
+CTCCTCCA +CTCGGTGA +CTCTATAG +CTCTGCGT +CTGAAGGG +CTGAGCGT +CTGCGATG +CTGCTAGA +CTGGAACA +CTGGGTAT +CTGTCGCA +CTGTGACC +CTGTTAAA +CTGTTGTG +CTGTTTCC +CTTAGGCC +CTTAGTGT +CTTCTACG +CTTTATCC +CTTTCACT +CTTTGGAC +GAAAGACA +GAAATACG +GAAGATAT +GAATCCCA +GAATGCGC +GACACAAA +GACACCTG +GACTAGCG +GAGAAACC +GAGCGGAA +GAGGAGTG +GAGGGTCA +GAGTGTAC +GATACGCA +GATGCAGA +GATGGTTA +GATGTGGC +GATTAAAG +GATTACTT +GATTGGGA +GATTTCCC +GCAAACTG +GCACTCAG +GCATCACT +GCATCGAG +GCCAAAGC +GCCAACAT +GCCTGGTA +GCCTTGTG +GCGCTGAT +GCGGTAAC +GCGTATTC +GCGTGCAA +GCTAAGTT +GCTACCGT +GCTATGGG +GCTCGTAG +GCTTCTCC +GGAACGAA +GGAAGTCC +GGACTGGA +GGACTTCT +GGAGGTTT +GGAGTAAG +GGATTGTT +GGCAAGGT +GGCACTTC +GGCCCAAT +GGCGACAA +GGCTATAA +GGCTTTGC +GGGAGATG +GGGATTAC +GGGCATCA +GGGTCATT +GGGTCTAG +GGTAAATC +GGTAGCCA +GGTCCTAA +GGTCTTTC +GGTGTCGA +GGTTACAC +GGTTAGGG +GGTTGAGA +GTAAACAA +GTAAGCCG +GTAATCTG +GTACGCTT +GTACGGAC +GTATACGT +GTATTGAC +GTCAAGAG +GTCAGACC +GTCAGGTT +GTCCACTA +GTCCGTCA +GTCCTTGC +GTCTAATC +GTCTGGAA +GTCTTCCT +GTGAACTC +GTGAGGCA +GTGATAAA +GTGCCCAT +GTGCGAAG +GTGGTGCT +GTGTCACC +GTGTCAGG +GTTACTAG +GTTCTGCT +GTTGTCCG +TAAACCGA +TAACTTCT +TAAGGGCC +TAATCCAT +TAATGTGG +TACCCTGC +TACCGCTC +TACCTAAG +TACCTCCC +TACGCGAG +TACGTTCG +TACTGAAT +TAGATCAA +TAGCCACA +TAGCGGAT +TAGGCTTT +TAGGTACG +TAGTAGCC +TAGTCTCT +TATCCACG +TATCTGTC +TATGTGAA +TATTAGCG +TCAAATGG +TCAAGGCG +TCAGCCTC +TCATACCA +TCATAGCT +TCATTTCA +TCCAGAAG +TCCCTGGA +TCCGACAC +TCCGCTGT +TCCTATAT +TCGACTGC +TCGAGTTT +TCGCAATC +TCGGTCAT +TCGTGGGT +TCGTTCCC +TCTAAACT +TCTATTCC +TCTGATTT +TCTTTGAC +TGAATAGG +TGAATCCT +TGACGTCG +TGAGAGCG +TGAGCACA +TGCACCAG +TGCCGGTA +TGCGACTA +TGCTGACG +TGCTTCAT +TGCTTGGG +TGGAAAGC +TGGACGGA +TGGCTAGT +TGGGAATT +TGGTGTCT +TGGTTAAC +TGTTATCA diff --git a/tests/data/Harvard-inDrop/harvard-indrop-v3-b2.txt b/tests/data/Harvard-inDrop/harvard-indrop-v3-b2.txt new file mode 100644 index 000000000..b2cf276eb --- /dev/null +++ b/tests/data/Harvard-inDrop/harvard-indrop-v3-b2.txt @@ -0,0 +1,384 @@ 
+GTTTGTTT +CCGTGTTT +TAGTGTTT +GGCGGTTT +GATCGTTT +TCACGTTT +TGTAGTTT +CACAGTTT +CTTTCTTT +GCCTCTTT +CGGGCTTT +ATGACTTT +TGCTATTT +CGGAATTT +CATTTGTT +GTTCTGTT +CCGCTGTT +AAATCGTT +TTGGCGTT +CTACCGTT +GTAACGTT +ACTGAGTT +AGGCAGTT +CTGTTCTT +ACCTTCTT +ATACTCTT +GAAGGCTT +AGGAGCTT +TCATCCTT +AGCGCCTT +GGTCCCTT +CAATACTT +TTGGACTT +CCCGACTT +TCTCACTT +GACAACTT +TCCTTATT +GATGTATT +GTCATATT +GCCGGATT +CTTCGATT +GAACGATT +ACGCCATT +CATACATT +ATCTTTGT +CTACTTGT +AAGATTGT +CTTGGTGT +TTATCTGT +TGGCCTGT +CGAGATGT +GTCCATGT +GGGTTGGT +CCCTTGGT +TCTGTGGT +AAACTGGT +GCATGGGT +AATCGGGT +TTGAGGGT +TCGACGGT +CTTCAGGT +GGGAAGGT +GAATTCGT +GTCGTCGT +TTAAGCGT +TGCTCCGT +ACTGCCGT +TAACCCGT +CCAACCGT +GTTTACGT +CAATTAGT +CGGGTAGT +GCTCTAGT +GTATGAGT +TTCCGAGT +CCAGCAGT +TGACCAGT +AGCGAAGT +TGGTTTCT +CACTTTCT +AAGCTTCT +TTGATTCT +TGAGGTCT +CCTCGTCT +GTCTCTCT +GCACCTCT +GCGTTGCT +TACGTGCT +GGCATGCT +AGATGGCT +ACCACGCT +GTGGAGCT +TCGAAGCT +TGTGTCCT +CGACTCCT +TATTGCCT +TTCGGCCT +AAACGCCT +CAGTCCCT +TTTACCCT +GCTTACCT +ATATACCT +GGGAACCT +CCATTACT +TAACTACT +TTGTGACT +CACGGACT +GAAGCACT +GTTCAACT +CCGCAACT +TACAAACT +CCTGTTAT +TAGCTTAT +GGGTGTAT +GAGAGTAT +ACATCTAT +TTGCATAT +ACCCATAT +CGATTGAT +TCCCTGAT +GGTGGGAT +ATGCGGAT +ACTAGGAT +AGCGCGAT +GTTACGAT +GCCAAGAT +GTTGTCAT +CAAGTCAT +ATATGCAT +CTCCGCAT +GAGCCCAT +CAGACCAT +CGGCACAT +AAGGTAAT +CGAATAAT +CTCAGAAT +ACTTCAAT +GGGCCAAT +ATGGAAAT +CAACAAAT +AATGTTTG +CTGCGTTG +ATTCCTTG +AACCCTTG +GTACCTTG +CTAGATTG +GAGAATTG +AGGTTGTG +TACTTGTG +GGTTAGTG +ATCAAGTG +CGAGTCTG +CCCATCTG +GCAACCTG +TTAAACTG +TCGTCATG +GCAGCATG +AATGAATG +CCCGAATG +TAGAAATG +AGAGGTGG +CAACGTGG +CTGTCTGG +TTCGCTGG +TCATATGG +GTGGATGG +GACGATGG +ATGCATGG +CTTACGGG +AGAACGGG +CAAGAGGG +AAACAGGG +TGCAAGGG +AAAGTCGG +GATCTCGG +CGTATCGG +ATTTCCGG +AGCTACGG +TAAGACGG +AGCGTAGG +TAAATAGG +TCATGAGG +TGTAAAGG +GACAAAGG +GAGTTTCG +TCGGTTCG +CTTCTTCG +AAATGTCG +TAGCCTCG +TTGGATCG +TGCCATCG +TTAGTCCG +TACAGCCG +ACTCACCG +TCGGTACG +ATTCGACG +GTTGCACG +ATCCCACG +TGTACACG +AACACACG 
+AGGCAACG +ACGAAACG +GGCGTTAG +TCCCGTAG +TAGTCTAG +CGTGCTAG +CCTACTAG +TGTTTGAG +GATGTGAG +TTTGGGAG +TGGAGGAG +TCACCGAG +CTATAGAG +ACGCAGAG +CCCTTCAG +ACGCTCAG +CATCGCAG +TCTAGCAG +TGTTCCAG +ATACCCAG +TGCGACAG +GGTCACAG +TTTAACAG +CACAACAG +GGAAACAG +GGCCTAAG +ACACTAAG +CGTAGAAG +GGATAAAG +AGTGAAAG +GTCCAAAG +TGTCTTTC +CGTATTTC +ATATCTTC +TGGGATTC +GCGCATTC +TTTGTGTC +CAGGTGTC +CGCTAGTC +GGTTTCTC +TTCCGCTC +CACTCCTC +TGACCCTC +GTACACTC +TGCGTATC +TCTGCATC +TAACCATC +GCCACATC +CTTTAATC +AAGTAATC +TCCCAATC +GGGAAATC +CAGTTTGC +CTGAGTGC +AGTGATGC +CTCGATGC +GCTTTGGC +ATGTTGGC +TACCAGGC +CACAAGGC +ATCAGCGC +GTTACCGC +GAATACGC +TTGCACGC +AACTTAGC +ACGGTAGC +CCCATAGC +CTACGAGC +GGAGAAGC +TTCGTTCC +GGACTTCC +TCCAGTCC +AGAAGTCC +AAACCTCC +CTTACTCC +AACAATCC +ACCTTGCC +GAAGTGCC +ATTGGGCC +TTGTCGCC +TTATAGCC +GCAAAGCC +CATCTCCC +GTAATCCC +TGATGCCC +AATGACCC +CTAGACCC +GATTTACC +TGGCTACC +TTAGGACC +GAAAGACC +TCGACACC +GTGTAACC +CCCTAACC +TCTCAACC +TTGTTTAC +CGGCTTAC +CAGATTAC +AAGCGTAC +GTCCGTAC +ACGTATAC +GTCAATAC +CTCTTGAC +GGTCTGAC +AACCTGAC +TAGTGGAC +TGACGGAC +GCAAGGAC +GATTAGAC +TTCCAGAC +AGGAAGAC +GAGTTCAC +TGCCTCAC +TTTATCAC +ATGGGCAC +CTTCGCAC +AGCACCAC +GGTGACAC +CCTGACAC +CTAGTAAC +AGCAGAAC +CGGACAAC +TCGGTTTA +AGAAGTTA +GGCCCTTA +ATGGATTA +CCACATTA +GCAGGGTA +GAGCGGTA +CTTAGGTA +GGGAGGTA +CTCGCGTA +CGAACGTA +ATTCAGTA +TTGATCTA +TGTGGCTA +ATCCGCTA +AAAGCCTA +CGTACCTA +GGCTACTA +AGAGACTA +CGTGGATA +GACAGATA +TTCACATA +CGCTAATA +CCATTTGA +CGCCTTGA +GAGGCTGA +TGGTATGA +AGCTATGA +TGAAATGA +CTTCTGGA +TCCAGGGA +GTGTCGGA +ACAGCGGA +ATATAGGA +GCAGTCGA +AAACTCGA +GATTGCGA +ATGACCGA +ACCCACGA +GGGAACGA +AGTTTAGA +GGAATAGA +AAATCAGA +GTCAAAGA +CCTATTCA +AGGATTCA +CGACGTCA +CGCTCTCA +TGTGCTCA +CTGGTGCA +TACCGGCA +TAGTCGCA +CGTCAGCA +ATGAAGCA +CCCAAGCA +GCTTTCCA +TCCGTCCA +ACTAGCCA +AATTCCCA +AGACACCA +GTTAACCA +TGATAACA diff --git a/tests/data/Harvard-inDrop/harvard-indrop-v3-sample_barcodes.txt b/tests/data/Harvard-inDrop/harvard-indrop-v3-sample_barcodes.txt new file mode 
100644 index 000000000..cab3e3945 --- /dev/null +++ b/tests/data/Harvard-inDrop/harvard-indrop-v3-sample_barcodes.txt @@ -0,0 +1,24 @@ +ATAGAGAG +AGAGGATA +CTCCTTAC +TATGCAGT +TACTCCTT +AGGCTTAG +ATTAGACG +CGGAGAGA +CTAGTCGA +AGCTAGAA +ACTCTAGG +TCTTACGC +CTTAATAG +ATAGCCTT +TAAGGCTC +TCGCATAA +TTACCTCC +CAGTTATG +CCTTTACT +GACGATTA +GAGACGGA +AGAAAGCT +ACGCTCTT +CGCATTCT diff --git a/tests/data/automated/run_info-scrnaseq.yaml b/tests/data/automated/run_info-scrnaseq.yaml index 1e95b6898..bc09f6f9d 100755 --- a/tests/data/automated/run_info-scrnaseq.yaml +++ b/tests/data/automated/run_info-scrnaseq.yaml @@ -7,7 +7,7 @@ details: singlecell_quantifier: rapmap minimum_barcode_depth: 0 sample_barcodes: ../data/Harvard-inDrop/sample-index.txt - cellular_barcodes: [../data/Harvard-inDrop/bc1.txt, ../data/Harvard-inDrop/bc2.txt] +# cellular_barcodes: [../data/Harvard-inDrop/harvard-indrop-v3-b1.txt, ../data/Harvard-inDrop/harvard-indrop-v3-b2.txt] description: Test1 files: - ../data/Harvard-inDrop/klein-v3_R1.fq.gz @@ -21,7 +21,7 @@ details: singlecell_quantifier: rapmap minimum_barcode_depth: 0 sample_barcodes: ../data/Harvard-inDrop/sample-index.txt - cellular_barcodes: [../data/Harvard-inDrop/bc1.txt, ../data/Harvard-inDrop/bc2.txt] +# cellular_barcodes: [../data/Harvard-inDrop/harvard-indrop-v3-b1.txt, ../data/Harvard-inDrop/harvard-indrop-v3-b2.txt] description: Test2 files: - ../data/Harvard-inDrop/klein-v3_R1.fq.gz