In [1]:
import subprocess, os, sys, signal, pip, collections, optparse, sys, tempfile, re, optparse
import rpy2
from joblib import Parallel, delayed
import multiprocessing
import threading
import time
import numpy as np

In [2]:
# inputFolder : folder holding all the fastq files.
# The original, misspelled name `intputFolder` is kept as well so that any
# code still referring to it keeps working.
intputFolder = '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161025_NB500968_0056_AHGCMVAFXX_analysis/Raphael_Thuret/fastqs'
inputFolder = intputFolder

# outputFolder : folder where all the results are stored.
# The trailing '/' is kept so that paths built from it by plain string
# concatenation stay valid.
outputFolder = '/mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/'

In [3]:
def expanderuser(path):
    """
    Expand a leading ~ in ``path`` to a home directory.

    ``~`` and ``~/...`` expand to the current user's home directory.
    ``~name/...`` is left to os.path.expanduser, which resolves it to that
    user's home directory (or returns the path unchanged when the user is
    unknown) instead of gluing ``name`` onto the current user's home.

    Paths that do not start with ~ are returned unchanged.
    """
    return os.path.expanduser(path)

def multiple_match(regex, old_name, name):
    """
    Report that ``regex`` matched two different names.

    regex: the pattern (as a string) that matched more than once
    old_name: the name matched first
    name: the name matched second

    The message is handed to report_error, which does not return.
    """
    # The separator before the advice keeps it from running into the file name.
    report_error(regex + " matched both " + old_name + " and " + name + ". Stricter regex required")

def do_walk(source, regex, target_path, onerror=None, followlinks=False, verbose=True, divider=None, sort=None):
    """
    Walker method: collects every file under ``source`` whose name matches
    ``regex`` and hands the collected paths to merge_files.

    Inputs are:
    source Parent directory to be walked through (a leading ~ is expanded)
    regex Regular expression searched for (re.search) in each file name
    target_path Path of the merged output, passed on to merge_files
    onerror is a function called when the underlying os.listdir fails
    followlinks would allow following symbolic links
        WARNING Be aware that setting followlinks to True can lead to infinite
           recursion if a link points to a parent directory of itself.
           walk() does not keep track of the directories it visited already.
    verbose: Flag passed to action methods to provide more output
    divider: Passed to merge_files when truthy; otherwise the merge_files
        default is used
    sort: Passed to merge_files when truthy; otherwise the merge_files
        default is used

    When no file name matches, report_error is called, which exits.
    """
    source = expanderuser(source)
    re_object = re.compile(regex)
    file_paths = []
    (names_number, names_path) = tempfile.mkstemp()
    try:
        # mkstemp returns an already open descriptor; wrapping it with
        # os.fdopen makes the with-block close it instead of leaking it.
        with os.fdopen(names_number, 'w') as f:
            # Must be topdown=True otherwise walk will process subdirectories before checking them
            for root, dirs, files in os.walk(source, topdown=True, onerror=onerror, followlinks=followlinks):
                for name in files:
                    if re_object.search(name):
                        file_path = os.path.join(root, name)
                        f.write(file_path)
                        f.write("\n")
                        file_paths.append(file_path)
                        if verbose:
                            print("Merging",file_path)
        if len(file_paths) == 0:
            report_error("NO files found to match "+ regex)
        # Only forward divider/sort when they were given, so that the
        # defaults declared by merge_files apply otherwise.
        merge_options = {}
        if divider:
            merge_options["divider"] = divider
        if sort:
            merge_options["sort"] = sort
        merge_files(file_paths, names_path, target_path, verbose, **merge_options)
    finally:
        # The names file is only an intermediate; remove it even when
        # report_error exits or merge_files raises.
        os.remove(names_path)

In [4]:

def report_error(error):
    """
    Print ``error`` on stdout and on stderr, then exit with status 1.

    error: the message text; a newline is appended on stderr.
    Raises SystemExit(1), so the call never returns normally.
    """
    # The message goes to stdout as well as stderr.
    print(error)
    stream = sys.stderr
    stream.write(error + "\n")
    # Flush before exiting so the message is not left in a buffer.
    stream.flush()
    raise SystemExit(1)


def remove_common(names):
    """
    Strip the prefix and suffix shared by all ``names`` and prepend "C".

    names: list of strings
    Returns a new list, in the same order, holding "C" followed by the part
    of each name that is left after the common prefix and the common suffix
    have been removed. An empty input gives an empty list.

    The common suffix is determined on what remains after the prefix has
    been removed, so prefix and suffix can never overlap inside a name.
    """
    if not names:
        return []
    prefix = os.path.commonprefix(names)
    remainders = [name[len(prefix):] for name in names]
    # commonprefix on the reversed strings yields the common suffix reversed.
    suffix = os.path.commonprefix([rest[::-1] for rest in remainders])[::-1]
    # Slice with an explicit end index: rest[:-len(suffix)] would be rest[:0]
    # (empty) whenever there is no common suffix.
    return ["C" + rest[: len(rest) - len(suffix)] for rest in remainders]


def remove_symbols(s):
    """
    Turn the ``__xx__`` escape tokens found in ``s`` back into characters.

    Two kinds of token are handled:
    - fixed names such as ``__ob__`` -> ``[`` or ``__in__`` -> ``%in%``
    - numeric codes such as ``__9__`` (tab) or ``__124__`` (|), replaced by
      chr() of the number

    A string without any "__" is returned untouched.
    """
    if "__" not in s:
        return s
    named_tokens = (
        # Patterns used by Galaxy
        ("__cb__", ']'),
        ("__cc__", '}'),
        ("__dq__", '"'),
        ("__lt__", '<'),
        ("__gt__", '>'),
        ("__ob__", '['),
        ("__oc__", '{'),
        ("__sq__", "'"),
        # Patterns added by Christian
        ("__in__", '%in%'),
        ("__not__", '!'),
    )
    for token, symbol in named_tokens:
        s = s.replace(token, symbol)
    # Numeric codes: scan for "__<text>__" pairs from position search_from.
    search_from = 0
    while True:
        opener = s.find("__", search_from)
        if opener == -1:
            return s
        text_start = opener + 2
        closer = s.find("__", text_start)
        if closer == -1:
            return s
        candidate = s[text_start: closer]
        try:
            code = int(candidate)
            s = s.replace("__" + candidate + "__", chr(code))
        except ValueError:
            # Not a usable number: continue after the closing "__".
            search_from = closer + 2
        else:
            # The string changed, so start scanning from the beginning again.
            search_from = 0


def clean_part(part):
    """
    Return ``part`` without surrounding whitespace and with every remaining
    tab written as the escape token ``__9__``.
    """
    # Strip first: leading/trailing tabs are dropped, only inner ones are escaped.
    return part.strip().replace("\t", "__9__")


def return_blank():
    """Return the string "0"; used as the default factory in black_dict."""
    blank = "0"
    return blank


def black_dict():
    """Return a new defaultdict whose missing keys get the value of return_blank()."""
    blank_filled = collections.defaultdict(return_blank)
    return blank_filled


def merge_files(file_paths, names_path, target_path, verbose=False, divider="\t", sort = None):
    """
    Merge several two-field files into one tab-separated table.

    Every input file becomes one column. For each line that splits into
    exactly two fields, the first field is the row key and the second the
    value. Combinations of row and column without a value are written as
    the default supplied by black_dict.

    file_paths: paths of the files to merge
    names_path: text file with one name per non-empty line, in the same
        order as file_paths; remove_common turns these names into the
        column headers
    target_path: path of the table to write
    verbose: when true, the first four ignored lines of each file are printed
    divider: field separator of the input files; escape tokens understood by
        remove_symbols are decoded first
    sort: falsy for no sorting, otherwise one of "column_names",
        "row_names", "both" or "none"

    report_error (which exits) is called for an unknown ``sort`` value and
    when the number of names differs from the number of file paths.
    """
    # (column_sort, row_sort) for every accepted value of ``sort``
    sort_modes = {
        "column_names": (True, False),
        "row_names": (False, True),
        "both": (True, True),
        "none": (False, False),
    }
    if sort:
        if sort not in sort_modes:
            report_error("Unexpected value: " + sort + " for sort parameter. Legeal values are: column_names, row_names, both or none")
        column_sort, row_sort = sort_modes[sort]
    else:
        column_sort, row_sort = False, False

    names = []
    with open(names_path, 'r') as f:
        for line in f:
            line = line.strip()
            if len(line) > 0:
                names.append(line)
    if len(names) != len(file_paths):
        report_error("Found " + str(len(file_paths)) + " file_paths but " + names_path + " contains " + str(len(names)) + " lines.")
    # Nothing to shorten when there are no names at all.
    new_names = remove_common(names) if names else []
    clean_divider = remove_symbols(divider)
    # all_values[row key][column name] -> value
    all_values = collections.defaultdict(black_dict)
    for count, file_path in enumerate(file_paths):
        mis_match = 0
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split(clean_divider)
                if len(parts) == 2:
                    key = clean_part(parts[0])
                    value = clean_part(parts[1])
                    all_values[key][new_names[count]] = value
                else:
                    mis_match+= 1
                    # Only show the first few offending lines per file.
                    if verbose and mis_match < 5:
                        print("ignoring following line from", file_path)
                        print(line)
        if mis_match > 0:
            print("In file " + file_path + " " + str(mis_match) + " lines did not have 1 divider (" + clean_divider + ") " + divider)

    if column_sort:
        new_names = sorted(new_names)
    if row_sort:
        row_names = sorted(all_values.keys())
    else:
        row_names = list(all_values.keys())

    with open(target_path, 'w') as f:
        # Header row: an empty first cell, then one cell per column name.
        for name in new_names:
            f.write("\t")
            f.write(name)
        f.write("\n")
        for key in row_names:
            f.write(key)
            for name in new_names:
                f.write("\t")
                f.write(all_values[key][name])
            f.write("\n")

In [5]:
# Merge the files found under <outputFolder>/htseq_output/ into a single table.
# os.path.join keeps the paths valid whether or not outputFolder ends with '/'.
HTSEQFolder = os.path.join(outputFolder, 'htseq_output/')
HTSEqFile = os.path.join(outputFolder, 'AllHtseqCounts_Shane_Herbert.tsv')
# The dot is escaped and the pattern anchored, so only file names that end in
# the literal text "_R1_001.txt" are picked up. sort="both" asks merge_files to
# sort the column names as well as the row names.
do_walk(HTSEQFolder, r'_R1_001\.txt$', HTSEqFile, onerror=None, followlinks=False, verbose=True, divider=None, sort="both")

Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C12_S50_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C05_S57_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C18_S51_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C84_S78_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C31_S6_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C77_S85_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C68_S36_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C22_S68_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C15_S3_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herbert/output/htseq_output/C13_S19_R1_001.txt
Merging /mnt/mr01-home01/mqbsxsm2/scratch/Shane_Herb