In [1]:
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pylab
import pandas as pd
import numpy as np
import os
import sys
import gzip
import itertools
import operator
import subprocess
import twobitreader
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pysam

In [3]:
#not sure if I need these but they were in the uditas software
class Error(Exception):
    """Root of the custom exception hierarchy defined in this notebook."""

class StrandError(Error):
    """Exception raised for errors in the strand information.

    Attributes:
        message -- explanation of the error (the only attribute set by __init__)
    """
    def __init__(self, message):
        # Keep the human-readable explanation available as ``.message``.
        self.message = message

class ReactionTypeError(Error):
    """Exception raised for errors in the reaction type to be processed.

    Attributes:
        message -- explanation of the error (the only attribute set by __init__)
    """
    def __init__(self, message):
        # Keep the human-readable explanation available as ``.message``.
        self.message = message

In [4]:
#these are copied and unchanged from the Uditas v1 software

################################################################################
# Open .fastq or .fastq.gz files for reading
################################################################################
def open_fastq_or_gz(filename):
    """Open a FASTQ file for reading, whether it is stored plain or gzip-compressed.

    The requested name is tried first; if it does not exist, the sibling name is
    tried instead ('x.fastq' <-> 'x.fastq.gz'). Lines read from the returned
    handle are native ``str`` on both Python 2 and Python 3.

    Parameters:
        filename -- path ending in '.fastq' or '.fastq.gz'

    Returns:
        An open file object (plain file or gzip stream). The caller is
        responsible for closing it (it can be used in a ``with`` statement).

    Raises:
        IOError -- if the name has another extension or neither variant exists.
    """
    # Python 2: "rU"/"rb" both give str lines. On Python 3 the "U" flag was removed
    # (3.11) and gzip "rb" yields bytes, so text mode has to be requested explicitly.
    if sys.version_info[0] >= 3:
        plain_mode, gzip_mode = "r", "rt"
    else:
        plain_mode, gzip_mode = "rU", "rb"

    # (path, is_gzipped) candidates in the order they must be probed.
    if filename.endswith(".fastq"):
        candidates = [(filename, False), (filename + ".gz", True)]
    elif filename.endswith(".fastq.gz"):
        candidates = [(filename, True), (filename[:-3], False)]
    else:
        candidates = []

    for path, is_gzipped in candidates:
        if os.access(path, os.F_OK):
            if is_gzipped:
                return gzip.open(path, gzip_mode)
            return open(path, plain_mode)

    raise IOError("Unknown file: " + filename)

################################################################################
# Hamming distance
# From http://code.activestate.com/recipes/499304-hamming-distance/
################################################################################
def hamm_dist(str1, str2):
    """Return the Hamming distance between two equal-length sequences.

    Parameters:
        str1, str2 -- sequences (e.g. barcode strings) of identical length

    Returns:
        int -- number of positions at which the two sequences differ

    Raises:
        AssertionError -- if the two sequences differ in length
    """
    assert len(str1) == len(str2)
    # Built-in zip works on both Python 2 and 3 (itertools.imap is Python 2 only).
    return sum(a != b for a, b in zip(str1, str2))

################################################################################
# Select closest barcode with a maximum number of mismatches
# By default it returns barcodes with a maximum of n_max_mismatches=1 mismatch
################################################################################
def select_barcode(seq, barcode_list, n_max_mismatches=1):
    """Return the known barcode(s) closest to ``seq`` by Hamming distance.

    Every barcode in ``barcode_list`` is compared with ``seq``; those within
    ``n_max_mismatches`` mismatches are kept, and of those only the ones at the
    smallest observed distance are returned. Callers are expected to have
    already checked whether ``seq`` is itself a barcode.

    Parameters:
        seq -- observed index read
        barcode_list -- candidate barcodes, each of the same length as ``seq``
        n_max_mismatches -- maximum Hamming distance accepted (default 1)

    Returns:
        list -- the best-matching barcodes in ``barcode_list`` order; more than
        one entry means a tie, an empty list means nothing was close enough.

    Raises:
        AssertionError -- propagated from hamm_dist when lengths differ
    """
    candidates = []
    for barcode in barcode_list:
        distance = hamm_dist(seq, barcode)
        if distance <= n_max_mismatches:
            candidates.append((distance, barcode))

    if not candidates:
        return []

    # Compute the minimum once instead of once per candidate.
    best_distance = min(distance for distance, _ in candidates)
    return [barcode for distance, barcode in candidates if distance == best_distance]


################################################################################
# Mask sequence by quality score
################################################################################
def mask(seq, qual, min_qual=12):
    """Replace low-quality bases of ``seq`` with 'N'.

    Parameters:
        seq -- base calls
        qual -- quality string, one character per base, decoded as ord(char) - 33
        min_qual -- lowest decoded score that is kept (default 12)

    Returns:
        str -- ``seq`` with every base whose score is below ``min_qual`` replaced
        by 'N'. If the inputs differ in length the result is truncated to the
        shorter one.
    """
    # Built-in zip works on both Python 2 and 3 (itertools.izip is Python 2 only).
    return "".join(
        base if (ord(score) - 33) >= min_qual else "N"
        for base, score in zip(seq, qual)
    )


################################################################################
# get the reverse-complement DNA sequence
################################################################################
def reverse_complement(seq):
    """Return the reverse complement of a DNA sequence, preserving letter case.

    Parameters:
        seq -- string made of A/C/G/T/N in upper or lower case

    Returns:
        str -- complemented sequence read in the opposite direction

    Raises:
        KeyError -- if ``seq`` contains any other character
    """
    # Lowercase 'n' is included so soft-masked sequences with unknown bases
    # are handled like their uppercase counterparts.
    seq_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N',
                'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n'}
    return "".join([seq_dict[base] for base in reversed(seq)])


################################################################################
# Create umi dict
################################################################################
def create_umi_dict(filename):
    """Load a UMI FASTQ file into a dictionary keyed by read name.

    Parameters:
        filename -- path accepted by open_fastq_or_gz

    Returns:
        dict -- maps the first whitespace-delimited token of each header line
        (without its leading character) to ``[sequence, quality]``, both with
        trailing whitespace stripped.

    Raises:
        IOError -- propagated from open_fastq_or_gz when the file is not found
        ValueError -- if the file ends in the middle of a 4-line record
    """
    umi_dict = dict()

    # The context manager guarantees the handle is closed, even on error.
    with open_fastq_or_gz(filename) as umi_file:
        lines = iter(umi_file)
        for header_umi in lines:
            try:
                seq_umi = next(lines)
                next(lines)  # separator line, not needed
                qual_umi = next(lines)
            except StopIteration:
                raise ValueError("Truncated FASTQ record at end of file: " + filename)

            umi_dict[header_umi.split()[0][1:]] = [seq_umi.rstrip(), qual_umi.rstrip()]

    return umi_dict


################################################################################
# create list of output files
################################################################################
def create_filename(dir_sample, N7, N5, filetype):
    """Build the path of a per-sample folder or file.

    Everything lives under ``<dir_sample>/<N7>_<N5>``. Most file types resolve to
    ``<sample folder>/<subfolder>/<N7>_<N5><suffix>``.

    Parameters:
        dir_sample -- run folder
        N7, N5 -- index names identifying the sample
        filetype -- key selecting which path to build

    Returns:
        str -- the requested path, or None when ``filetype`` is not recognised.
    """
    sample = N7 + '_' + N5
    main_folder = os.path.join(dir_sample, sample)

    # The two folder-only entries have no file name component.
    if filetype == 'mainfolder':
        return main_folder
    if filetype == 'amplicons':
        return os.path.join(main_folder, 'amplicons')

    # filetype -> (subfolder, suffix appended to "<N7>_<N5>")
    layout = {
        'R1fastq': ('fastq_files', '_R1.fastq'),
        'R1fastqgz': ('fastq_files', '_R1.fastq.gz'),
        'R2fastq': ('fastq_files', '_R2.fastq'),
        'R2fastqgz': ('fastq_files', '_R2.fastq.gz'),
        'umifastq': ('fastq_files', '_umi.fastq'),
        'umifastqgz': ('fastq_files', '_umi.fastq.gz'),
        'R1trimmed': ('cutadapt_files', '_R1.trimmed.fastq.gz'),
        'R2trimmed': ('cutadapt_files', '_R2.trimmed.fastq.gz'),
        'trimmed_report': ('cutadapt_files', '.trimmed.report.txt'),
        'sam_genome_local': ('sam_genome_local_files', '.sam'),
        'sam_report_genome_local': ('sam_genome_local_files', '.sam.report.txt'),
        'bam_genome_local': ('bam_genome_local_files', '.bam'),
        'sorted_bam_genome_local': ('bam_genome_local_files', '.sorted.bam'),
        'sorted_bai_genome_local': ('bam_genome_local_files', '.sorted.bam.bai'),
        'sam_plasmid_local': ('sam_plasmid_local_files', '.sam'),
        'sam_report_plasmid_local': ('sam_plasmid_local_files', '.sam.report.txt'),
        'bam_plasmid_local': ('bam_plasmid_local_files', '.bam'),
        'sorted_bam_plasmid_local': ('bam_plasmid_local_files', '.sorted.bam'),
        'sorted_bai_plasmid_local': ('bam_plasmid_local_files', '.sorted.bam.bai'),
        'unmapped_bam_plasmid_local': ('bam_plasmid_local_files', '_unmapped.bam'),
        'qsorted_unmapped_bam_plasmid_local': ('bam_plasmid_local_files', '_qsorted_unmapped.bam'),
        'unmapped_plasmid_R1fastq': ('plasmid_unmapped_fastq_files', '_plasmid_unmapped_R1.fastq'),
        'unmapped_plasmid_R2fastq': ('plasmid_unmapped_fastq_files', '_plasmid_unmapped_R2.fastq'),
        'unmapped_plasmid_R1fastqgz': ('plasmid_unmapped_fastq_files', '_plasmid_unmapped_R1.fastq.gz'),
        'unmapped_plasmid_R2fastqgz': ('plasmid_unmapped_fastq_files', '_plasmid_unmapped_R2.fastq.gz'),
        'sam_amplicons': ('sam_amplicon_files', '.sam'),
        'sam_report_amplicons': ('sam_amplicon_files', '.sam.report.txt'),
        'bam_amplicons': ('bam_amplicon_files', '.bam'),
        'sorted_bam_amplicons': ('bam_amplicon_files', '.sorted.bam'),
        'sorted_bai_amplicons': ('bam_amplicon_files', '.sorted.bam.bai'),
        'unmapped_bam_amplicons': ('bam_amplicon_files', '_amplicons_unmapped.bam'),
        'qsorted_unmapped_bam_amplicons': ('bam_amplicon_files', '_qsorted_amplicons_unmapped.bam'),
        'unmapped_amplicons_R1fastq': ('amplicons_unmapped_fastq_files', '_amplicons_unmapped_R1.fastq'),
        'unmapped_amplicons_R2fastq': ('amplicons_unmapped_fastq_files', '_amplicons_unmapped_R2.fastq'),
        'unmapped_amplicons_R1fastqgz': ('amplicons_unmapped_fastq_files', '_amplicons_unmapped_R1.fastq.gz'),
        'unmapped_amplicons_R2fastqgz': ('amplicons_unmapped_fastq_files', '_amplicons_unmapped_R2.fastq.gz'),
        'unmapped_amplicons_report': ('amplicons_unmapped_fastq_files', '.unmapped.report.txt'),
        'sam_genome_global': ('sam_genome_global_files', '.sam'),
        'sam_report_genome_global': ('sam_genome_global_files', '.sam.report.txt'),
        'bam_genome_global': ('bam_genome_global_files', '.bam'),
        'sorted_bam_genome_global': ('bam_genome_global_files', '.sorted.bam'),
        'sorted_bai_genome_global': ('bam_genome_global_files', '.sorted.bam.bai'),
        'results_amplicons': ('results', ''),  # We will append the window size later
        'results_plasmid': ('results', '_results_plasmid.xlsx'),
        'results_all_amplicons': ('results', '_results_all_amplicons.xlsx'),
        'results_genomewide': ('results', '_results_genomewide.xlsx'),
        'summary_all_alignments': ('results', '_summary_all_alignments.xlsx'),
        'read_counts': ('results', '_read_counts.xlsx'),
    }

    entry = layout.get(filetype)
    if entry is None:
        # Unrecognised file types yield None.
        return None

    subfolder, suffix = entry
    return os.path.join(main_folder, subfolder, sample + suffix)


In [37]:
# NOTE(review): the only visible assignment of index_i1_set is a local variable inside
# demultiplex_no_UMIs; no notebook-level definition is visible in this file. TODO confirm
# where this name comes from, otherwise this cell raises NameError on a fresh kernel.
index_i1_set

{'N701', 'N702', 'N703', 'N704', 'N705', 'N706'}

In [42]:
#This is changed from the uditas software. They had the index2 read before UMI but ours is UMI->Index2 
#so the function was tweaked accordingly

############################
#
# Demultiplexer
# Input: folder to demultiplex, with Undetermined fastq files and sample info in sample_info.csv
#
# ##########################
def demultiplex_no_UMIs(dir_sample):
    """Split the undetermined reads of a run into one R1/R2 FASTQ pair per sample.

    Input (all inside ``dir_sample``):
        sample_info.csv -- must provide the columns 'index_I1', 'barcode_I1',
            'index_I2' and 'barcode_I2' (one row per sample)
        Undetermined_S0_{R1,R2,I1,I2}_001.fastq.gz -- opened through
            open_fastq_or_gz, so the uncompressed '.fastq' variant is used when
            the '.gz' file is absent

    Processing, per read:
        1. The I1 and I2 index reads are quality-masked with mask().
        2. If both are known barcodes the read is accepted; otherwise each is
           matched with select_barcode() (default tolerance) and, when both find
           a match, replaced by the first best match.
        3. Accepted reads whose (I1, I2) combination is listed in sample_info.csv
           are written to that sample's R1/R2 FASTQ files. Accepted reads with an
           unlisted combination go to 'mismatched/not_in_exp_list_*.fastq'.
           Everything else goes to 'mismatched/mismatched_adapters_*.fastq'.

    Output (all inside ``dir_sample``):
        <N7>_<N5>/fastq_files/<N7>_<N5>_R{1,2}.fastq plus a gzip-compressed copy
        mismatched/*.fastq
        reports/report_individual_files.xls, report_overall.xls,
        report_mismatched_adapters_i1.xls, report_mismatched_adapters_i2.xls
        (tab-separated text despite the extension)

    Parameters:
        dir_sample -- path of the run folder

    Returns:
        None

    Note: assumes open_fastq_or_gz yields text (str) lines -- mask() calls ord()
    on individual quality characters.
    """
    # itertools.izip only exists on Python 2; on Python 3 the built-in zip is lazy.
    lazy_zip = getattr(itertools, 'izip', zip)

    def ensure_dir(path):
        # Create a single directory level unless it is already there.
        if not os.path.exists(path):
            os.mkdir(path)

    def write_record(handle, header, seq, qual):
        # Emit one 4-line FASTQ record; the separator line is always a bare "+".
        print("\n".join([header.rstrip(), seq.rstrip(), "+", qual.rstrip()]), file=handle)

    # Read indices
    experiments = pd.read_csv(os.path.join(dir_sample, 'sample_info.csv'))

    # File names of this run. The original UDiTaS code expected names carrying a
    # lane token instead, e.g. 'Undetermined_S0_L001_R1_001.fastq.gz'.
    r1_fastq = os.path.join(dir_sample, 'Undetermined_S0_R1_001.fastq.gz')
    r2_fastq = os.path.join(dir_sample, 'Undetermined_S0_R2_001.fastq.gz')
    i1_fastq = os.path.join(dir_sample, 'Undetermined_S0_I1_001.fastq.gz')
    i2_fastq = os.path.join(dir_sample, 'Undetermined_S0_I2_001.fastq.gz')

    index_i1_list = list(experiments['index_I1'])
    barcode_i1_list = list(experiments['barcode_I1'])
    i1_dict = dict(zip(barcode_i1_list, index_i1_list))  # barcode sequence -> index name
    index_i2_list = list(experiments['index_I2'])
    barcode_i2_list = list(experiments['barcode_I2'])
    i2_dict = dict(zip(barcode_i2_list, index_i2_list))  # barcode sequence -> index name

    # For every I1 index name, the I2 index names it is paired with in the sample sheet.
    good_barcode_pairs = dict()
    for bc in set(index_i1_list):
        good_barcode_pairs[bc] = list(experiments.loc[bc == experiments['index_I1']]['index_I2'])

    # Create all directories if necessary and collect the per-sample output paths
    files_out = list()
    for N7, N5 in zip(index_i1_list, index_i2_list):
        ensure_dir(create_filename(dir_sample, N7, N5, 'mainfolder'))
        r1_path = create_filename(dir_sample, N7, N5, 'R1fastq')
        ensure_dir(os.path.dirname(r1_path))
        files_out.append(r1_path)
        files_out.append(create_filename(dir_sample, N7, N5, 'R2fastq'))

    # Output basename -> position in files_out / ref_files / file_read_counts
    files_out_dict = dict((os.path.basename(path), n) for n, path in enumerate(files_out))

    ensure_dir(os.path.join(dir_sample, 'mismatched'))
    ensure_dir(os.path.join(dir_sample, 'reports'))

    read_kinds = ('R1', 'R2', 'I1', 'I2')

    file_read_counts = [0] * len(files_out)

    # Counters for all reads
    reads_in_experiment_list_count = 0
    reads_not_in_experiment_list_count = 0
    mismatch_count = 0
    mismatch_count_i1 = 0
    mismatch_count_i2 = 0
    mismatch_dict_i1 = dict()
    mismatch_dict_i2 = dict()

    opened = []  # every output handle, so that all of them get closed in `finally`

    def open_out(path):
        handle = open(path, "w")
        opened.append(handle)
        return handle

    try:
        # We open all output files
        ref_files = [open_out(path) for path in files_out]
        not_in_exp_files = dict(
            (kind, open_out(os.path.join(dir_sample, 'mismatched', 'not_in_exp_list_' + kind + '.fastq')))
            for kind in read_kinds)
        mismatched_files = dict(
            (kind, open_out(os.path.join(dir_sample, 'mismatched', 'mismatched_adapters_' + kind + '.fastq')))
            for kind in read_kinds)

        # We open r1,r2,i1,i2 files and distribute reads
        with open_fastq_or_gz(r1_fastq) as r1_file, open_fastq_or_gz(r2_fastq) as r2_file, \
                open_fastq_or_gz(i1_fastq) as i1_file, open_fastq_or_gz(i2_fastq) as i2_file:

            # The four files are consumed in lock-step, four lines (one record) at a time.
            r1_r2_i1_i2 = lazy_zip(r1_file, r2_file, i1_file, i2_file)

            for headers in r1_r2_i1_i2:
                seq_r1, seq_r2, seq_i1, seq_i2 = next(r1_r2_i1_i2)
                next(r1_r2_i1_i2)  # separator lines, not needed
                qual_r1, qual_r2, qual_i1, qual_i2 = next(r1_r2_i1_i2)

                seq_i1, seq_i2 = seq_i1.rstrip(), seq_i2.rstrip()
                qual_i1, qual_i2 = qual_i1.rstrip(), qual_i2.rstrip()

                # Index bases with a quality score below the mask() default become N.
                # No UMI is split off here: the whole I2 read is treated as the barcode
                # (the original UDiTaS code took I2 as index followed by UMI).
                seq_i1 = mask(seq_i1, qual_i1)
                seq_i2 = mask(seq_i2, qual_i2)

                # True for reads with perfect indices or a match after correction
                is_good_index = False

                if seq_i1 in i1_dict and seq_i2 in i2_dict:
                    # perfect match case
                    is_good_index = True
                else:
                    # Look for the closest barcodes within the select_barcode default
                    # tolerance; ties are resolved by taking the first candidate.
                    seq_i1_match = select_barcode(seq_i1, barcode_i1_list)
                    seq_i2_match = select_barcode(seq_i2, barcode_i2_list)
                    if len(seq_i2_match) > 0 and len(seq_i1_match) > 0:
                        is_good_index = True
                        seq_i1 = seq_i1_match[0]
                        seq_i2 = seq_i2_match[0]

                # What gets written for this read: I1/I2 carry the masked (and, if
                # corrected, replaced) index sequences.
                seqs = (seq_r1, seq_r2, seq_i1, seq_i2)
                quals = (qual_r1, qual_r2, qual_i1, qual_i2)

                if is_good_index:
                    n7_name = i1_dict[seq_i1]
                    n5_name = i2_dict[seq_i2]

                    # We test whether the read has one of the index combinations from
                    # our experiment list. If not, save it in a separate file.
                    if n5_name in good_barcode_pairs[n7_name]:
                        r1f = create_filename(dir_sample, n7_name, n5_name, 'R1fastq')
                        r2f = create_filename(dir_sample, n7_name, n5_name, 'R2fastq')
                        r1_slot = files_out_dict[os.path.basename(r1f)]
                        r2_slot = files_out_dict[os.path.basename(r2f)]

                        write_record(ref_files[r1_slot], headers[0], seq_r1, qual_r1)
                        file_read_counts[r1_slot] += 1

                        write_record(ref_files[r2_slot], headers[1], seq_r2, qual_r2)
                        file_read_counts[r2_slot] += 1

                        reads_in_experiment_list_count += 1
                    else:
                        # Known indices, but in a combination that is not in our experiments
                        for kind, header, seq, qual in zip(read_kinds, headers, seqs, quals):
                            write_record(not_in_exp_files[kind], header, seq, qual)

                        reads_not_in_experiment_list_count += 1
                else:
                    # We print reads with unknown/mismatched adapters
                    for kind, header, seq, qual in zip(read_kinds, headers, seqs, quals):
                        write_record(mismatched_files[kind], header, seq, qual)

                    if seq_i2 not in i2_dict:
                        mismatch_dict_i2[seq_i2] = mismatch_dict_i2.get(seq_i2, 0) + 1
                        mismatch_count_i2 += 1

                    if seq_i1 not in i1_dict:
                        mismatch_dict_i1[seq_i1] = mismatch_dict_i1.get(seq_i1, 0) + 1
                        mismatch_count_i1 += 1

                    mismatch_count += 1
    finally:
        # close all output files, also when an error interrupted the run
        for handle in opened:
            handle.close()

    reports_dir = os.path.join(dir_sample, 'reports')

    # print report of counts for individual files (only files that received reads)
    with open(os.path.join(reports_dir, 'report_individual_files.xls'), "w") as fh:
        print("\t".join(['filename', 'reads_count']), file=fh)
        for path, count in zip(files_out, file_read_counts):
            if count:
                print("\t".join([os.path.basename(path), str(count)]), file=fh)

    # print report overall counts
    with open(os.path.join(reports_dir, 'report_overall.xls'), "w") as fh:
        print('Total number of reads:\t' + str(reads_in_experiment_list_count + reads_not_in_experiment_list_count +
                                               mismatch_count) + '\n', file=fh)
        print('Reads without I1 or I2 adapters:\t' + str(mismatch_count) + '\n', file=fh)
        print('Reads without I1 adapters:\t' + str(mismatch_count_i1) + '\n', file=fh)
        print('Reads without I2 adapters:\t' + str(mismatch_count_i2) + '\n', file=fh)
        print('Reads with I1 or I2 adapters in the wrong combination:\t' + str(reads_not_in_experiment_list_count) + '\n',
              file=fh)
        print('Reads with I1 or I2 adapters matching our experiments:\t' + str(reads_in_experiment_list_count) + '\n',
              file=fh)

    # print report mismatched i1 adapters
    with open(os.path.join(reports_dir, 'report_mismatched_adapters_i1.xls'), "w") as fh:
        print('I1_RC\tCount', file=fh)
        for dict_element in mismatch_dict_i1:
            print(dict_element + '\t' + str(mismatch_dict_i1[dict_element]), file=fh)

    # print report mismatched i2 adapters
    with open(os.path.join(reports_dir, 'report_mismatched_adapters_i2.xls'), "w") as fh:
        print('I2\tCount', file=fh)
        for dict_element in mismatch_dict_i2:
            print(dict_element + '\t' + str(mismatch_dict_i2[dict_element]), file=fh)

    # gzip fastq files; read as bytes so the copy into the binary gzip stream
    # works on both Python 2 and Python 3
    for fo in files_out:
        with open(fo, 'rb') as f_in, gzip.open(fo + '.gz', 'wb') as f_out:
            f_out.writelines(f_in)


### File Data
#unsorted files: /media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_BCL2Fastq_only
#Take the first 1000 reads (a FASTQ record is 4 lines, so 1000 reads = 4000 lines)
#command to view first 17 lines of files
$head -17 filelocation_or_name

#### Do this for all 4 file types. Note: `>>` appends, so running a command more than once appends the same lines again to the end of the output file
#(terminal)
$head -n50000 Undetermined_S0_I1_001.fastq >> ./10000Reads/Undetermined_S0_I1_001.fastq
$head -n50000 Undetermined_S0_I2_001.fastq >> ./10000Reads/Undetermined_S0_I2_001.fastq
$head -n50000 Undetermined_S0_R1_001.fastq >> ./10000Reads/Undetermined_S0_R1_001.fastq
$head -n50000 Undetermined_S0_R2_001.fastq >> ./10000Reads/Undetermined_S0_R2_001.fastq

#### check how many lines: (terminal)
$wc -l Undetermined_S0_I1_001.fastq

wc -l Del_Report_Cntl.fastq

$head -n4000 Del_Report_Cntl.fastq >> Del_Report_Cntl1000.fastq
$head -n4000 Del_Report_Targeted.fastq >> Del_Report_Targeted1000.fastq
$head -n4000 Polb_Cntl.fastq >> Polb_Cntl1000.fastq
$head -n4000 Polb_Targeted.fastq >> Polb_Targeted1000.fastq

### The files can then be compressed with gzip
$gzip Undetermined_S0_I1_001.fastq
$gzip Undetermined_S0_I2_001.fastq
$gzip Undetermined_S0_R1_001.fastq
$gzip Undetermined_S0_R2_001.fastq

In [40]:
#unsorted 10000 test files:
# This cell originally called `demultiplex`, which is not defined in this notebook;
# the demultiplexer defined above is `demultiplex_no_UMIs`.
# NOTE(review): confirm that no separate `demultiplex` was intended here.
directory10000 = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_BCL2Fastq_only/10000Reads'
print(directory10000)
demultiplex_no_UMIs(directory10000)

/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_BCL2Fastq_only/10000Reads


In [29]:
#test all of my files
directory = '/home/eric/Data/Spaced_Nicking/LAM2_Miniseq_ELANE'

print(directory)


/home/eric/Data/Spaced_Nicking/LAM2_Miniseq_ELANE


In [44]:
# Runs the demultiplexing for the whole run folder stored in `directory`
demultiplex_no_UMIs(directory)
# Took about 1.5 hr on my personal computer to demultiplex an entire MiniSeq run

In [45]:
# Second run: rebinds `directory` to the LAM3_IL7R_PRF1 folder and demultiplexes it.
directory = '/home/eric/Data/Spaced_Nicking/LAM3_IL7R_PRF1'
demultiplex_no_UMIs(directory)
