In [7]:
%matplotlib inline

from collections import defaultdict, Counter
import glob
import gzip
import json
import os
import subprocess
import itertools
import gspread
import pandas as pd
import screed.fastq
import numpy as np

In [15]:
files = pd.read_table("/home/gpratt/projects/encode/scripts/encode_v7.txt", header=None, names=['fastqs', "species", "unique_id", "adapter", "nothing", "nothing"])

In [9]:
def check_proper_pairing_gzip(mate1, mate2):
    worked = True
    try:
        with gzip.open(mate1) as fastq_file_1, gzip.open(mate2) as fastq_file_2:
            while True:
                try:
                    name_1 = fastq_file_1.next()
                    seq_1 = fastq_file_1.next()
                    fastq_file_1.next() #got to consume the read
                    plus = "+\n" #sometimes the descriptor is here, don't want it
                    quality_1 = fastq_file_1.next()

                    name_2 = fastq_file_2.next()
                    seq_2 = fastq_file_2.next()
                    fastq_file_2.next() #got to consume the read
                    plus = "+\n" #sometimes the descriptor is here, don't want it
                    quality_2 = fastq_file_2.next()

                    if name_1.split()[0] != name_2.split()[0]:
                        print name_1, name_2
                        return False
                except StopIteration:
                    break
    except IOError:
        return True
    return worked

def check_proper_pairing(mate1, mate2):
    worked = True
    try:
        with open(mate1) as fastq_file_1, open(mate2) as fastq_file_2:
            while True:
                try:
                    name_1 = fastq_file_1.next()
                    seq_1 = fastq_file_1.next()
                    fastq_file_1.next() #got to consume the read
                    plus = "+\n" #sometimes the descriptor is here, don't want it
                    quality_1 = fastq_file_1.next()

                    name_2 = fastq_file_2.next()
                    seq_2 = fastq_file_2.next()
                    fastq_file_2.next() #got to consume the read
                    plus = "+\n" #sometimes the descriptor is here, don't want it
                    quality_2 = fastq_file_2.next()

                    if name_1.split("/")[0] != name_2.split("/")[0]:
                        print name_1.split("/")[0], name_2.split("/")[0]
                        return False
                except StopIteration:
                    break
    except IOError:
        return True
    return worked

def sort_fastq(in_fastq, out_fastq):
    """ Sorts fastq in memory, this will fail for large fastq files """
    reads = screed.open(in_fastq)
    sorted_reads = sorted(reads, key=lambda read: read['name'].split("/")[0])

    with open(out_fastq, 'wb') as outfile:
        for read in sorted_reads:
            outfile.write("@" + read.name + "\n")
            outfile.write(read.sequence + "\n")
            outfile.write("+\n")
            outfile.write(read.quality + "\n")
            
def check_fastq(in_fastq):
    """ Sorts fastq in memory, this will fail for large fastq files """
    reads = screed.open(in_fastq)
    read_names = set([])

    for read in reads:
        read_names.add(read['name'].split("/")[0])
    return read_names

def sort_fastq_inplace(fastq):
    call = "cat {0} | sed  's/\t/ /' | paste  - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > foo.fq && mv foo.fq {0}".format(fastq)
    subprocess.call(call, shell=True)

In [21]:
files['r1'] = files.fastqs.apply(lambda x: x.split(";")[0])
files['r2'] = files.fastqs.apply(lambda x: x.split(";")[1])

In [43]:
files['mate1'] = files.r1.apply(lambda x: "/home/gpratt/projects/encode/analysis/encode_v7/" + ".".join(os.path.basename(x).split(".")[:2]) + ".adapterTrim.round2.rep.bamUnmapped.out.mate1")
files['mate2'] = files.r1.apply(lambda x: "/home/gpratt/projects/encode/analysis/encode_v7/" + ".".join(os.path.basename(x).split(".")[:2]) + ".adapterTrim.round2.rep.bamUnmapped.out.mate2")

files['rmRepMate1'] = files.r1.apply(lambda x: "/home/gpratt/projects/encode/analysis/encode_v7/" + ".".join(os.path.basename(x).split(".")[:2]) + ".adapterTrim.round2.rmRep.bamUnmapped.out.mate1")
files['rmRepMate2'] = files.r1.apply(lambda x: "/home/gpratt/projects/encode/analysis/encode_v7/" + ".".join(os.path.basename(x).split(".")[:2]) + ".adapterTrim.round2.rmRep.bamUnmapped.out.mate2")

In [23]:
worked_list = []
for name, row in files.iterrows():
    try: 
        worked = check_proper_pairing(row.mate1, row.mate2)
        print row.mate1, worked
        worked_list.append(worked)
        if not worked:
            print "THE ABOVE FAILED"
    except IOError:
        print IOError
        worked_list.append(np.nan)

/home/gpratt/projects/encode/analysis/encode_v7/222_CLIP_ATTACTCG-CCTATCCT_L001_R1.C01_222_01_HNRNPM.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/222_CLIP_ATTACTCG-CCTATCCT_L001_R1.D08fixed_222_01_HNRNPM.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/222_CLIP_ATTACTCG-CCTATCCT_L001_R1.A01_222_02_HNRNPM.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/222_CLIP_ATTACTCG-CCTATCCT_L001_R1.B06_222_02_HNRNPM.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/223_CLIP_TCCGGAGA-CCTATCCT_L001_R1.A01_223_01_FKBP4.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/223_CLIP_TCCGGAGA-CCTATCCT_L001_R1.B06_223_01_FKBP4.adapterTrim.round2.rep.bamUnmapped.out.mate1 True
/home/gpratt/projects/encode/analysis/encode_v7/223_CLIP_TCCGGAGA-CCTATCCT_L001_R1.C01_223_

In [None]:
/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.fastq.gz
/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R2.F05_255_01_EIF4A3.adapterTrim.round2.fastq.gz


In [58]:
check_proper_pairing("/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.fastq.gz",
                     "/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R2.F05_255_01_EIF4A3.adapterTrim.fastq.gz"
)

True

In [64]:
check_proper_pairing_gzip("/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.fastq.gz",
                          "/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.fastq.gz"
)

True

In [74]:
check_proper_pairing("/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.rep.bamUnmapped.out.sorted.mate1",
                     "/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.rep.bamUnmapped.out.sorted.mate2"
)

True

In [None]:
import itertools

In [67]:
foo = glob.glob("/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/*.gz")
print foo.pop(1)
print foo.pop(-1)

/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_R1.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_R2.fastq.gz


In [72]:
sorter = lambda item: "_".join([os.path.basename(item).split("_")[5], os.path.basename(item).split("_")[7]])

In [75]:
for name, lst in itertools.groupby(sorted(foo, key=sorter), key=sorter):
    lst = list(lst)
    print lst
    r1, r2 = lst
    
    print check_proper_pairing_gzip(r1, r2)

['/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_001.fastq.gz', '/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_001.fastq.gz']
True
['/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_002.fastq.gz', '/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_002.fastq.gz']
True
['/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/150406_D00611_0100_AC6DB4ANXX/Data/Unaligned_L5/Project_C6DB4ANXX/Sample_KK_17_CLIPin_tdp/KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_003.fas

In [3]:
import pysam

In [4]:
in_bam = "/home/gpratt/projects/encode/analysis/encode_v8/255_01_CLIP_CGCTCATT-TAATCTTA_L004_R1.F05_255_01_EIF4A3.adapterTrim.round2.rmRep.bam"

In [6]:
with pysam.Samfile(in_bam, 'r') as samfile1:
    with pysam.Samfile(in_bam, 'r') as samfile2:
        samfile_read1 = itertools.islice(samfile1, 0, None, 2)
        samfile_read2 = itertools.islice(samfile2, 1, None, 2)
        for read1, read2 in itertools.izip(samfile_read1, samfile_read2):
            if not read1.qname == read2.qname:
                print read1.qname, read2.qname


GACCC:HWI-D00611:119:C6K7PANXX:4:2208:11076:40753 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:2609:99228
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:2609:99228 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5151:40054
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5151:40054 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5301:50075
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5301:50075 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5653:6303
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:5653:6303 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:6006:51711
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:6006:51711 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:8300:52662
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:8300:52662 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9002:88815
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9002:88815 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9021:32581
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9021:32581 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9152:23567
CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9152:23567 CCTGC:HWI-D00611:119:C6K7PANXX:4:2309:9303:92831
CCTGC:HWI-D00611:119:

KeyboardInterrupt: 

In [None]:
['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_002.fastq.gz']
['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_002.fastq.gz']


In [None]:
['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_004.fastq.gz']
processing ['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_004.fastq.gz'] to ./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_R1.fastq.gz


In [None]:
['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R1_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R1_004.fastq.gz']
['./KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L006_R2_004.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_001.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_002.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_003.fastq.gz', './KK_17_CLIPin_tdp_ATTCAGAA-AGGCGAAG_L007_R2_004.fastq.gz']
