In [None]:
%matplotlib inline
from collections import Counter
import screed
from itertools import izip
import gzip
import pandas as pd

import seaborn as sns

In [None]:
def hamming(word1, word2):
    """
    Count the positions at which two barcodes differ, treating "N" as a wildcard.

    This is deliberately not a textbook Hamming distance: a position where either
    word holds an "N" never counts as a mismatch. We actually want Ns to appear
    similar to other barcodes, as this makes the results more stringent, not less.

    Characters are paired positionally with zip, so when the words have different
    lengths only the overlapping prefix is compared.

    :param word1: first barcode string
    :param word2: second barcode string
    :return: number of positions where the characters differ and neither is "N"
    """
    # Built-in zip rather than itertools.izip: izip only exists on Python 2, while
    # zip gives the same pairs on both Python 2 and Python 3.
    return sum(1 for a, b in zip(word1, word2) if a != b and a != "N" and b != "N")

def count_barcodes(fn):
    """
    Tally how often each barcode occurs among the records of a sequence file.

    The barcode of a record is the text after the last ":" in its
    'annotations' field.

    :param fn: path of the file to open with screed
    :return: Counter mapping barcode string -> number of records carrying it
    """
    return Counter(record['annotations'].split(":")[-1] for record in screed.open(fn))

def demux_barcodes(fn, outfile, barcode="CGCTCATTCCTATCCT", max_mismatches=1):
    """
    Write every read whose barcode is close enough to `barcode` to a gzipped file.

    The barcode of a read is the text after the last ":" in its 'annotations'
    field. Closeness is measured with hamming(), which ignores positions holding
    an "N". Matching reads are serialised with format_read().

    :param fn: path of the input sequence file, opened with screed
    :param outfile: path of the gzip-compressed output file (overwritten)
    :param barcode: barcode to select; defaults to the previously hard-coded
        "CGCTCATTCCTATCCT"
    :param max_mismatches: largest hamming() distance still accepted; defaults to
        the previously hard-coded 1
    :return: None
    """
    reads = screed.open(fn)
    # A distinct name for the handle keeps the `outfile` path argument intact.
    with gzip.open(outfile, 'w') as out_handle:
        for read in reads:
            read_barcode = read['annotations'].split(":")[-1]
            if hamming(read_barcode, barcode) <= max_mismatches:
                out_handle.write(format_read(read))
            
def format_read(read):
    """
    Render one read as a four-line, newline-terminated FASTQ-style record.

    :param read: mapping with 'name', 'annotations', 'sequence' and 'quality' entries
    :return: "@<name> <annotations>", the sequence, a "+" line and the quality
        string, each followed by a newline
    """
    header = "@" + read['name'] + " " + read['annotations']
    return "\n".join([header, read['sequence'], "+", read['quality']]) + "\n"

In [3]:
!pwd

/nas/nas0/gpratt/iPython_Notebook/taf15


In [None]:
# Extract the lane 8 R2 reads for the F7_IP_IgG sample from the undetermined pool.
# demux_barcodes returns nothing, so its result is no longer bound to L8_r1
# (that name is used for the lane 8 barcode counts further down).
demux_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane8_Undetermined_L008_R2.fastq",
               "F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R2.fastq.gz")


In [None]:
# Extract the lane 8 R1 reads for the F7_IP_IgG sample from the undetermined pool.
# demux_barcodes returns nothing, so its result is no longer bound to L8_r1
# (that name is used for the lane 8 barcode counts further down).
demux_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane8_Undetermined_L008_R1.fastq",
               "F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R1.fastq.gz")

In [7]:
# Copy both demultiplexed F7_IP_IgG FASTQ archives (R2, then R1) to the sample's
# directory on tscc-login.sdsc.edu.
!scp F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R2.fastq.gz tscc-login.sdsc.edu:/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_F7_IP_IgG
!scp F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R1.fastq.gz tscc-login.sdsc.edu:/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_F7_IP_IgG

F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R2.fastq.gz  100%   89MB  44.7MB/s   00:02    
F7_IP_IgG_CGCTCATT-CCTATCCT_L008_R1.fastq.gz  100%   88MB  44.0MB/s   00:02    


In [24]:
# Count barcode occurrences among the undetermined reads of lanes 5 through 8.
# NOTE(review): lanes 5-7 read the R1 file, but lane 8 reads the R2 file while the
# result is still named L8_r1 - confirm this is intentional.
L5_r1 = count_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane5_Undetermined_L005_R1.fastq")
L6_r1 = count_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane6_Undetermined_L006_R1.fastq")
L7_r1 = count_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane7_Undetermined_L007_R1.fastq")
L8_r1 = count_barcodes("/nas3/gpratt/projects/fet_family/data/unassigned_debugging/lane8_Undetermined_L008_R2.fastq")

In [28]:
# Ten most frequent barcodes in the lane 6 counts.
L6_r1.most_common(10)

[('ATTCAGAANNNNNNNN', 56582),
 ('ATTACTCGNNNNNNNN', 49682),
 ('GAATTCGTNNNNNNNN', 49118),
 ('TCCGGAGANNNNNNNN', 40834),
 ('GAGATTCCNNNNNNNN', 39353),
 ('CGCTCATTNNNNNNNN', 28782),
 ('ATTCAGAACTATCCTA', 28028),
 ('TCCGGAGACTATCCTA', 24801),
 ('AATTCGTAGGCTCTGA', 23756),
 ('GAATCGTAGGCTCTGA', 22350)]

In [27]:
# Ten most frequent barcodes in the lane 7 counts.
L7_r1.most_common(10)

[('ATTCAGAANNNNNNNN', 73790),
 ('GAATTCGTNNNNNNNN', 64230),
 ('ATTACTCGNNNNNNNN', 60373),
 ('TCCGGAGANNNNNNNN', 51972),
 ('GAGATTCCNNNNNNNN', 51135),
 ('CGCTCATTNNNNNNNN', 35882),
 ('ATTCAGAACTATCCTA', 28943),
 ('TCCGGAGACTATCCTA', 25661),
 ('AATTCGTAGGCTCTGA', 23936),
 ('ATTACTCGCTATCCTA', 23552)]

In [25]:
# Ten most frequent barcodes in the lane 8 counts.
L8_r1.most_common(10)

[('CGCTCATTCCTATCCT', 2043629),
 ('CGCTCATTCCNATCCT', 409549),
 ('ATTCAGAATNTAGCCN', 136843),
 ('GAATTCGTANAGAGGN', 127580),
 ('ATTACTCGANAGAGGN', 127069),
 ('TCCGGAGAANAGAGGN', 121628),
 ('GAATTCGTTNTAGCCN', 109525),
 ('ATTACTCGTNTAGCCN', 107319),
 ('GAGATTCCANAGAGGN', 107212),
 ('TCCGGAGATNTAGCCN', 93769)]

In [38]:
# Barcode table pasted as text: one "<barcode_id><whitespace><barcode_sequence>" row
# per line. The six D701-D706 rows repeat, followed by a final D701 and D705.
foo = """D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D702	TCCGGAGA
D703	CGCTCATT
D704	GAGATTCC
D705	ATTCAGAA
D706	GAATTCGT
D701	ATTACTCG
D705	ATTCAGAA"""

In [40]:
# Break the pasted table into one [barcode_id, barcode_sequence] list per line.
items = []
for line in foo.split("\n"):
    items.append(line.split())

In [41]:
# Echo each (id, sequence) pair on its own line. A single pre-formatted string in
# parentheses prints the same text under Python 2's print statement and Python 3's
# print function, unlike the Python 2-only `print a, b` form.
for barcode_id, barcode_sequence in items:
    print("%s %s" % (barcode_id, barcode_sequence))

D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D702 TCCGGAGA
D703 CGCTCATT
D704 GAGATTCC
D705 ATTCAGAA
D706 GAATTCGT
D701 ATTACTCG
D705 ATTCAGAA


In [42]:
# Index barcodes 501-508 and 701-706, one "label barcode" pair per line, loaded
# into a one-column DataFrame ("barcode") indexed by "label". strip() removes the
# trailing blanks some lines carry before the pair is split.
illumina_adapters = pd.DataFrame(
    [line.strip().split() for line in """501 TATAGCCT 
502 ATAGAGGC 
503 CCTATCCT 
504 GGCTCTGA 
505 AGGCGAAG 
506 TAATCTTA 
507 CAGGACGT 
508 GTACTGAC 
701 ATTACTCG
702 TCCGGAGA
703 CGCTCATT
704 GAGATTCC
705 ATTCAGAA
706 GAATTCGT""".split("\n")],
    columns=["label", "barcode"],
).set_index("label")

NameError: name 'pd' is not defined

# Let's figure out KK's stuff

See if barcodes cluster by Illumina barcode
--

ENSG00000227232.4 ENSG00000227232.4
ENSG00000227232.4 ENSG00000233750.3
ENSG00000227232.4 ENSG00000237683.5
ENSG00000227232.4 ENSG00000239906.1
ENSG00000227232.4 ENSG00000241860.2
ENSG00000227232.4 ENSG00000228463.4
ENSG00000227232.4 ENSG00000237094.6
ENSG00000227232.4 ENSG00000231709.1
ENSG00000227232.4 ENSG00000225630.1
ENSG00000227232.4 ENSG00000237973.1
ENSG00000227232.4 ENSG00000229344.1
ENSG00000227232.4 ENSG00000248527.1
ENSG00000227232.4 ENSG00000198744.5
ENSG00000227232.4 ENSG00000229376.3
ENSG00000227232.4 ENSG00000224956.5
ENSG00000227232.4 ENSG00000235373.1
ENSG00000227232.4 ENSG00000228327.2
ENSG00000227232.4 ENSG00000237491.3
ENSG00000227232.4 ENSG00000230092.3
ENSG00000227232.4 ENSG00000269831.1
ENSG00000227232.4 ENSG00000177757.1
ENSG00000227232.4 ENSG00000225880.4
ENSG00000227232.4 ENSG00000228794.3
ENSG00000227232.4 ENSG00000230368.2
ENSG00000227232.4 ENSG00000234711.1
ENSG00000227232.4 ENSG00000230699.2
ENSG00000227232.4 ENSG00000241180.1
ENSG00000227232.4 ENSG000002