Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 483 lines (407 sloc) 17.8 KB
#!/usr/bin/env python
"""
DuctGenome
Analyze genome(s) and map them to KEGG
"""
from ductape import __version__
from ductape.actions import touchProject, prepareDir
from ductape.common.colorlog import ColorFormatter
from ductape.storage.SQLite.database import Organism, Project, Genome, Kegg
from ductape.terminal import RunThread
import argparse
import logging.handlers
import os
import sys
__author__ = "Marco Galardini"
__prog__ = "dgenome"
################################################################################
# Log setup
logger = logging.getLogger('ductape')
################################################################################
# Methods
def dinit(options, wdir, project):
from ductape.actions import dInit
if not dInit(project, wdir, options.name, options.descr):
logger.warning('You can remove or rename the old project file')
return False
else:
return True
def dadd(options, wdir, project):
from ductape.actions import dGenomeAdd, dGenomeMutAdd
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
org = Organism(project)
if org.isOrg(options.orgID):
if not org.isMutant(options.orgID):
return dGenomeAdd(project, options.orgID, options.file)
else:
return dGenomeMutAdd(project, options.orgID, options.file)
else:
logger.warning('Organism %s is not present yet!'%options.orgID)
return False
def daddDir(options, wdir, project):
from ductape.actions import dGenomeDirAdd
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeDirAdd(project, options.folder, options.e)
def dstart(options, wdir, project):
from Bio import SeqIO
from ductape.actions import dGetGenomeSteps
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
if options.cpu <= 0:
logger.warning('How can i use %d cpus?'%options.cpu)
return False
tmp = os.path.join(wdir,'tmp')
if not prepareDir(wdir, 'tmp'):
return False
proj = Project(project)
org = Organism(project)
gen = Genome(project)
if len(org) == 0:
logger.warning('No organisms are present yet!')
logger.warning('Use %s add or %s add-dir!'%(__prog__, __prog__))
return False
steps = dGetGenomeSteps(project)
if 'pangenome' in steps or 'map2ko' in steps:
# Prepare the genomic files
protdir = os.path.join(tmp,'proteins')
if not os.path.exists(protdir):
try:
os.mkdir(protdir)
except:
logger.error('Could not create tmp directory %s'%protdir)
return False
infiles = {}
for organism in org.getAll():
protfile = os.path.join(protdir, organism.org_id)
infiles[organism.org_id] = protfile
SeqIO.write(gen.getRecords(organism.org_id),
open(protfile, 'w'), 'fasta')
#
for step in steps:
if step == 'pangenome':
if options.s:
logger.warning('Skipping pangenome calculation')
continue
if not doPanGenome(project,infiles,options.cpu,options.prefix,options.matrix,options.evalue):
logger.error('PanGenome could not be calculated!')
return False
elif step == 'map2ko':
if options.g:
logger.warning('Skipping Kegg mapping')
continue
if not doMap2KO(project, infiles, local=options.l, keggdb=options.k):
logger.error('Genome(s) could not be mapped to ko!')
return False
if options.l:
proj.setGenome('map2ko')
org.setAllGenomeStatus('map2ko')
else:
break
elif step == 'map2kegg':
if options.g:
logger.warning('Skipping Kegg mapping')
continue
if not doMap2KEGG(project, options.y):
logger.error('Genome(s) could not be mapped to kegg!')
return False
proj.setGenome('map2kegg')
org.setAllGenomeStatus('map2kegg')
else:
logger.warning('Unrecognized analysis %s'%step)
return True
def dannotate(options, wdir, project):
from ductape.actions import dGenomeAnnotate
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
ret = dGenomeAnnotate(project, options.s)
if ret:
logger.info('Information on merged annotations and multiply annotated'+
' orthologs can be obtained with %s export'%__prog__)
return True
def ddeannotate(options, wdir, project):
from ductape.actions import dGenomeDeAnnotate
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeDeAnnotate(project)
def dstats(options, wdir, project):
from ductape.actions import dGenomeStats
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeStats(project, svg=options.svg)
def dexport(options, wdir, project):
from ductape.actions import dGenomeExport
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeExport(project)
def daddKo(options, wdir, project):
from ductape.genome.map2KO import OnlineSearch
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
kegg = Kegg(project)
gen = Genome(project)
proj = Project(project)
org = Organism(project)
kaas = OnlineSearch()
for filename in options.kofile:
logger.info('Parsing KAAS output %s'%filename)
kaas.parseKAAS(filename)
# TODO: here check if results are empty
kegg.addDraftKOs( set(kaas.results.values()) )
prot_ko = [[x,y] for x,y in kaas.results.iteritems()]
gen.addKOs( prot_ko )
logger.info('Mapped %d proteins to KO'%len(kaas.results))
proj.setGenome('map2ko')
org.setAllGenomeStatus('map2ko')
return True
def daddPanGenome(options, wdir, project):
from ductape.actions import dPanGenomeAdd
if not touchProject(project):
logger.warning('You can setup a new project by running %s init'%
__prog__)
return False
return dPanGenomeAdd(project, options.orthfile)
def dremove(options, wdir, project):
from ductape.actions import dGenomeRemove
if not touchProject(project):
logger.warning('Nothing to be removed!\n'+
'You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeRemove(project, options.organisms)
def dclear(options, wdir, project):
from ductape.actions import dGenomeClear
if not touchProject(project):
logger.warning('Nothing to be cleaned up!\n'+
'You can setup a new project by running %s init'%
__prog__)
return False
return dGenomeClear(project)
def doPanGenome(project, infiles, cpu=1, prefix='',
matrix='BLOSUM80', evalue=1e-10):
from ductape.genome.pangenome import PanGenomer
pang = PanGenomer(infiles.values(), ncpus=cpu, prefix=prefix,
matrix=matrix, evalue=evalue)
if not RunThread(pang):
return False
gen = Genome(project)
gen.addPanGenome(pang.orthologs)
logger.info('PanGenome size: %d groups'%len(gen.getPanGenome()))
logger.info('Core size: %d groups'%gen.getLenCore())
logger.info('Accessory size: %d groups'%gen.getLenAcc())
logger.info('Unique size: %d groups'%gen.getLenUni())
return True
def doMap2KO(project, infiles, local=False, keggdb='', cpu=1):
from ductape.genome.map2KO import LocalSearch, OnlineSearch
org = Organism(project)
kegg = Kegg(project)
gen = Genome(project)
if local:
for org_id, infile in infiles.iteritems():
komap = LocalSearch(infile, keggdb, ncpus=cpu)
if not RunThread(komap):
return False
org.setGenomeStatus(org_id, 'map2ko')
kegg.addDraftKOs( set(komap.results.values()) )
gen.addKOs( komap.results.iteritems() )
logger.info('%s - mapped %d proteins to KO'%
(org_id, len(komap.results)))
else:
kaas = OnlineSearch()
sys.stdout.write(kaas.getExplanation() + '\n')
sys.stdout.write('When the analysis are finished launch %s add-ko\n'%
__prog__)
return True
def doMap2KEGG(project, keeptrying=False):
from ductape.actionsterm import fetchKegg
from ductape.kegg.kegg import KoMapper
# Fetch the Kegg DB?
if not fetchKegg(project, keeptrying):
logger.error('Could not fetch data from KEGG')
return False
kegg = Kegg(project)
kos = [ko.ko_id for ko in kegg.getKO2Analyze()]
if len(kos) == 0:
logger.warning('No KO entries to be analyzed!')
logger.warning('The KO entries could also have been already analyzed')
return True
avoid = [kid for kid in kegg.getAllIDs()]
komap = KoMapper(kos,avoid=avoid, keeptrying=keeptrying)
if not RunThread(komap):
return False
# Details
kegg.addKOs(komap.result.ko)
logger.info('Added %d KO IDs'%len(komap.result.ko))
kegg.addReactions(komap.result.react)
logger.info('Added %d Re IDs'%len(komap.result.react))
kegg.addCompounds(komap.result.comp)
logger.info('Added %d Co IDs'%len(komap.result.comp))
kegg.addPathways(komap.result.path)
logger.info('Added %d Path IDs'%len(komap.result.path))
kegg.addRPairs(komap.result.rpair)
logger.info('Added %d RPair IDs'%len(komap.result.rpair))
# Links
kegg.addKOReacts(komap.result.koreact)
kegg.addPathComps(komap.result.pathcomp)
kegg.addPathReacts(komap.result.pathreact)
kegg.addReactComps(komap.result.reactcomp)
kegg.addCompReacts(komap.result.compreact)
kegg.addReactRPairs(komap.result.reactrpair)
kegg.addRPairReacts(komap.result.rpairreact)
logger.info('Added Kegg links')
# HTML maps
kegg.addPathHtml(komap.result.pathmaps)
logger.info('Added Kegg maps')
return True
################################################################################
# Options
def getOptions():
description = "Add and analyze genomes"
parser = argparse.ArgumentParser(description = description,
prog=__prog__)
parser.add_argument('-p', metavar='project', action='store',
dest='project',
default='ductape.db',
help='Project file')
parser.add_argument('-w', metavar='workdir', action='store', dest='wdir',
default='.',
help='Working directory')
parser.add_argument('-v', action='count',
default=0,
help='Increase verbosity level')
parser.add_argument('--version', action='version',
version='%(prog)s '+__version__)
subparsers = parser.add_subparsers()
parser_init = subparsers.add_parser('init', help='Initialize the project')
parser_init.add_argument('-n', action="store",
dest='name',
default = 'Project',
help='Project name')
parser_init.add_argument('-d', metavar='descr', action="store",
dest='descr',
default = 'DuctApe project',
help='Project description')
parser_init.set_defaults(func=dinit)
parser_add = subparsers.add_parser('add',
help='Add a single genome')
parser_add.add_argument('file', action="store",
help='Protein fasta file')
parser_add.add_argument('orgID', action='store',
help='Organism ID')
parser_add.set_defaults(func=dadd)
parser_add_dir = subparsers.add_parser('add-dir',
help='Add a series of genomes (orgIDs will be guessed)')
parser_add_dir.add_argument('folder', action="store",
help='Folder where the genomes fasta are stored')
parser_add_dir.add_argument('-e', metavar='extension', action="store",
default = 'faa',
help='Fasta files extension')
parser_add_dir.set_defaults(func=daddDir)
parser_add_ko = subparsers.add_parser('add-ko',
help='Add a KO map (protein code --> KO code)')
parser_add_ko.add_argument('kofile', action='store', nargs='+',
help='KO map')
parser_add_ko.set_defaults(func=daddKo)
parser_add_ko = subparsers.add_parser('add-orth',
help='Add a pangenome (ortholog code --> protein code)')
parser_add_ko.add_argument('orthfile', action='store',
help='Orthologs file')
parser_add_ko.set_defaults(func=daddPanGenome)
parser_start = subparsers.add_parser('start', help='Start the analysis')
parser_start.add_argument('-n', metavar='cpu', action="store", dest='cpu',
type=int,
default=1,
help='Number of CPUs to be used')
parser_start.add_argument('-s', action="store_true",
default=False,
help='Skip pangenome creation')
parser_start.add_argument('-g', action="store_true",
default=False,
help='Skip Kegg mapping')
parser_start.add_argument('-y', action="store_true",
default=False,
help='Try to fetch Kegg data even while encountering failures')
parser_start.add_argument('-x', action="store", dest='prefix',
default='',
help='Orthologous groups prefix')
parser_start.add_argument('-m', action="store", dest='matrix',
default='BLOSUM80',
help='BLAST matrix for pangenome [Default: BLOSUM80]')
parser_start.add_argument('-e', action="store",
dest='evalue',
type=float,
default=1e-10,
help='BLAST E-value threshold for pangenome [Default: 1e-10]')
parser_start.add_argument('-l', action="store_true",
default=False,
help='Local map2ko')
parser_start.add_argument('-k', action="store",
help='Kegg database location (for local map2ko)')
parser_start.set_defaults(func=dstart)
parser_annotate = subparsers.add_parser('annotate', help='Transfer and correct the KEGG annotation')
parser_annotate.add_argument('-s', action="store_true",
default=False,
help='Skip annotation merging (just check)')
parser_annotate.set_defaults(func=dannotate)
parser_deannotate = subparsers.add_parser('deannotate', help='Revert transfered annotations')
parser_deannotate.set_defaults(func=ddeannotate)
parser_stats = subparsers.add_parser('stats', help='Print genomic statistics')
parser_stats.add_argument('-s', '--svg', action="store_true",
default=False,
help='Figures in svg format instead of png')
parser_stats.set_defaults(func=dstats)
parser_export = subparsers.add_parser('export', help='Export genomic data')
parser_export.set_defaults(func=dexport)
parser_rm = subparsers.add_parser('rm', help='Remove genome analysis')
parser_rm.add_argument('organisms', metavar='orgID', nargs='+',
action="store",
help='Organism(s) to be removed')
parser_rm.set_defaults(func=dremove)
parser_clear = subparsers.add_parser('clear',
help='Clear all the genomic results')
parser_clear.set_defaults(func=dclear)
return parser.parse_args()
################################################################################
options = getOptions()
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
if options.v == 0:
ch.setLevel(logging.INFO)
elif options.v >= 1:
ch.setLevel(logging.DEBUG)
formatter = ColorFormatter('%(asctime)s - $COLOR%(message)s$RESET','%H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)
fh = logging.handlers.RotatingFileHandler('ductape.log', maxBytes=2000000)
formatter = logging.Formatter('%(asctime)s - %(name)s - [%(levelname)s] - %(message)s',
'%Y-%m-%d %H:%M:%S')
fh.setFormatter(formatter)
logger.addHandler(fh)
wdir = os.path.abspath(options.wdir)
if not os.path.exists(wdir):
try:
os.mkdir(wdir)
except:
logger.error('Could not create working directory %s'%wdir)
project = os.path.join(wdir, options.project)
ret = options.func(options, wdir, project)
touchProject(project)
if not ret:
sys.exit(1)