forked from QBRC/PIPE-CLIP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeclip.py
executable file
·112 lines (104 loc) · 6.17 KB
/
pipeclip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#Main pipeline connects all the scripts together
#Original Programmer: beibei.chen@utsouthwestern.edu
#Refactored by: eric.roos@utsouthwestern.edu
#Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species
#Required packages: pysam, ghmm, pybedtools
#Last modification: 18 Sep 2014
import sys
import argparse
import logging
import os
from time import gmtime, strftime
from lib import *
def prepare_argparser():
description = "Find mutations"
epilog = "For command line options of each command, type %(prog)s COMMAND -h"
argparser = argparse.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile", type = str, required = True, help = "input bam file")
argparser.add_argument("-o","--output",dest = "outfile", type = str,required = True, help = "output file, default is stdout")
argparser.add_argument("-l","--matchLength",dest = "matchLength", type = int ,required = True, help = "shorted matched segment length")
argparser.add_argument("-m","--mismatch",dest = "mismatch", type = int,required = True, help = "maximum mismatch number")
argparser.add_argument("-c","--clipType",dest = "clipType", type = int,required = True, help = "CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP", choices=[0,1,2,3])
argparser.add_argument("-r","--rmdup",dest = "dupRemove", type = int,required = True, help = "Remove PCR duplicate (0)No removal; (1)Remove by read start; (2)Remove by sequence; ", choices=[0,1,2])
argparser.add_argument("-M","--fdrMutation",dest = "fdrMutation", type = float,required = True, help = "FDR for reliable mutations")
argparser.add_argument("-C","--fdrCluster",dest = "fdrCluster", type = float,required = True, help = "FDR for enriched clusters")
argparser.add_argument("-s","--species",dest = "species", type = str, help = "Species [\"mm10\",\"hg19\"]",choices=["mm10","hg19"])
return(argparser)
def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,clipType,fdrReliableMutation,species):
myClip = CLIP.CLIP(infile,outputPrefix)
logging.info("Start to run")
if myClip.testInput():#check input
logging.info("Input file OK,start to run PIPE-CLIP")
logging.info("Species info %s" % species)
if myClip.readfile():
myClip.filter(matchLength,mismatch,clipType,rmdup)
#myClip.printMutations()
if len(myClip.clusters)>0:
logging.info("Get enriched clusters")
status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
if status:
logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
myClip.printEnrichedClusters()
else:
logging.error("There is no enriched cluster found. Exit program")
sys.exit(1)
else:
logging.error("There is no clusters found. Please check input.Exit program.")
sys.exit(1)
if len(myClip.mutations.keys())>0:
logging.info("Get reliable mutations")
Enrich.mutationEnrich(myClip,fdrReliableMutation)
logging.info("There are %d reliable mutations" % myClip.sigMutationCount)
myClip.printReliableMutations()
else:
logging.warning("There is no mutation found in this BAM file.")
#Start to get crosslinking sites
if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
logging.info("Get cross-linking sites")
myClip.getCrosslinking()
if (len(myClip.crosslinking.keys())>0):
outfilelist = myClip.printCrosslinkingSites()
myClip.printCrosslinkingMutations()
else:
logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
outfilelist = myClip.printEnrichClusters()
else:
if myClip.sigClusterCount <= 0:
logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
sys.exit(2)
elif myClip.sigMutationCount <=0:
logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
outfilelist = myClip.printEnrichedClusters()
#annotation if possible
if species in ["mm10","mm9","hg19"]:
logging.info("Started to annotate cross-linking sits using HOMER")
for name in outfilelist:
#logging.debug("Start to do annotation for %s" % name)
Utils.annotation(name,species)
#output a status log file
logfile = open(outputPrefix+".pipeclip.summary.log","w")
print >>logfile,"PIPE-CLIP run finished. Parameters are:"
print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
logfile.close()
logging.info("PIPE-CLIP finished the job, please check your results. :)")
else:
logging.error("File corruputed, program exit.")
sys.exit(0)
if __name__=="__main__":
arg_parser = prepare_argparser()
args = arg_parser.parse_args()
OptValidator.opt_validate()
infile = args.infile # Input SAM/BAM file
outputPrefix = args.outfile # Output prefix
matchLength = args.matchLength # Shorted matched segment length
mismatch = args.mismatch # Maximum mismatch number
rmcode = args.dupRemove
fdrEnrichedCluster = args.fdrCluster # FDR for enriched clusters
clipType =args.clipType # CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP
fdrReliableMutation = args.fdrMutation# FDR for reliable mutations
species = args.species # Species ["mm10","hg19"]
runPipeClip(infile,outputPrefix,matchLength,mismatch,rmcode,fdrEnrichedCluster,clipType,fdrReliableMutation,species)