Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
315 lines (285 sloc) 10.8 KB
#!/usr/bin/env python
print "\nScript for parsing GenBank metadata and renaming fasta file for annotation."
#print "Contributors: Javier del Campo and Laura Wegener Parfrey"
#print "22 November 2016"
print "\nrun: python eukref_gbmetadata.py -h for help.\n"
import os
import argparse
import re
from Bio import SeqIO
from Bio.Blast import NCBIXML
parser = argparse.ArgumentParser(
description='Rename fasta sequences and create metadata text file for FigTree annotation')
parser.add_argument(
'-gb',
'--input_gb_file_fp',
help='Path to GenBank formatted file with accessions to be renamed. Get this file from the NCBI nucleotide database online (see pipeline overview)',
required=False)
parser.add_argument(
'-t',
'--ref_tax',
help='Path to reference database taxonomy file formatted accession \t taxonomy. E.g. SIVLA 128 full taxa map file. Reference database taxonomy will be added to the metadata file',
required=False)
parser.add_argument(
'-i',
'--input_fasta_file_fp',
help='Path to Fasta file to be renamed. May be current_DB.fas, current_DB.clustered.fas, or other. Header MUST either be 1) in standard GenBank format (e.g. >gi|ginumber|gb|accession| ) or 2) begin with the accession number. Get this file from the NCBI nucleotide database online (see pipeline overview)',
required=False)
parser.add_argument(
'--outgroup',
help='Path to Fasta file with outgroups.',
required=False)
parser.add_argument(
'-o',
'--output_fasta_file_fp',
help='Path to output fasta file',
required=False)
parser.add_argument(
'-m',
'--output_metadata_fp',
help='Output metadata file in tab delimited format',
required=False)
args = parser.parse_args()
ref_tax = args.ref_tax
input_gb_file_fp = args.input_gb_file_fp
input_fasta_file_fp = args.input_fasta_file_fp
outgroup = args.outgroup
output_fasta_file_fp = args.output_fasta_file_fp
output_metadata_fp = args.output_metadata_fp
outfasta = open(output_fasta_file_fp, "w")
outmeta = open(output_metadata_fp, "w")
##############################
#### function definitions ####
##############################
# function just loops through outgroup fasta file and adds OUTGROUP to line. writes to outfile.
def rename_outgroup(infile, outfile):
for line in open(infile, "U"):
if line.startswith(">"):
line = line.replace(">" , ">OUTGROUP__")
outfile.write(line)
else:
outfile.write(line)
# function to print metadata in tab delimited format based on gb formatted input file.
# version if SILVA or PR2 reference taxonomy file passed
def metadata_retrieve_ref(infile, outfile, ref_accessions):
# original script from here
OUT = outmeta
OUT.write("Accession\tTaxonomy\tReference_taxonomy\tOrganism\tclone\tSource\tEnvironment\tHost\tCountry\tPublication\tAuthors\tJournal\n")
result_handle = open(infile, "U")
# array to make sure each accession is uniq.
uniq_acc = []
gbfiles = SeqIO.parse(result_handle, 'gb')
for rec in gbfiles:
acc = rec.id
# strip off .1 from accessions
clean_acc = re.sub(r'\.[1-9]', '', acc)
#if already have seen accession move onto next record. Else append accession to uniq_acc list and
if clean_acc in uniq_acc:
next
else:
uniq_acc.append(clean_acc)
#default = 'NA'
#reference_taxonomy = ref_accessions.get('clean_acc', default)
if clean_acc in ref_accessions:
reference_taxonomy = ref_accessions[clean_acc]
else:
reference_taxonomy = 'NA'
source = rec.features[0]
if 'taxonomy' in rec.annotations:
taxonomy = ";".join(rec.annotations['taxonomy'])
else:
taxonomy = "NA"
if 'organism' in rec.annotations:
organism = rec.annotations['organism']
else:
organism = "NA"
if 'clone' in source.qualifiers:
clone = source.qualifiers['clone'][0]
else:
clone = "NA"
if 'environmental_sample' in source.qualifiers:
environmental_sample = "Environmental"
else:
environmental_sample = "Isolate"
if 'isolation_source' in source.qualifiers:
isolation_source = source.qualifiers['isolation_source'][0]
else:
isolation_source = "NA"
if 'host' in source.qualifiers:
host = source.qualifiers['host'][0]
else:
host = "NA"
if 'country' in source.qualifiers:
country = source.qualifiers['country'][0]
else:
country = "NA"
if 'references' in rec.annotations:
pubref = rec.annotations['references'][0]
authors = pubref.authors
title = pubref.title
journal = pubref.journal
else:
title = "NA"
authors = "NA"
journal = "NA"
fields = [clean_acc, taxonomy, reference_taxonomy, organism, clone, environmental_sample, isolation_source, host, country, title, authors, journal]
OUT.write("\t".join(fields)+ "\n")
OUT.close()
# function to print metadata in tab delimited format based on gb formatted input file.
def metadata_retrieve(infile, outfile):
accessions = {}
# original script from here
OUT = outmeta
OUT.write("Accession\tTaxonomy\tOrganism\tclone\tSource\tEnvironment\tHost\tCountry\tPublication\tAuthors\tJournal\n")
result_handle = open(infile, "U")
uniq_acc = []
gbfiles = SeqIO.parse(result_handle, 'gb')
for rec in gbfiles:
acc = rec.id
# need to make sure each accession is unique.
# strip off .1 from accessions
clean_acc = re.sub(r'\.[1-9]', '', acc)
#if already have seen accession move onto next record. Else append accession to uniq_acc list and
if clean_acc in uniq_acc:
next
else:
uniq_acc.append(clean_acc)
source = rec.features[0]
if 'taxonomy' in rec.annotations:
taxonomy = ";".join(rec.annotations['taxonomy'])
else:
taxonomy = "NA"
if 'organism' in rec.annotations:
organism = rec.annotations['organism']
else:
organism = "NA"
if 'clone' in source.qualifiers:
clone = source.qualifiers['clone'][0]
else:
clone = "NA"
if 'environmental_sample' in source.qualifiers:
environmental_sample = "Environmental"
else:
environmental_sample = "Isolate"
if 'isolation_source' in source.qualifiers:
isolation_source = source.qualifiers['isolation_source'][0]
else:
isolation_source = "NA"
if 'host' in source.qualifiers:
host = source.qualifiers['host'][0]
else:
host = "NA"
if 'country' in source.qualifiers:
country = source.qualifiers['country'][0]
else:
country = "NA"
if 'references' in rec.annotations:
pubref = rec.annotations['references'][0]
authors = pubref.authors
title = pubref.title
journal = pubref.journal
else:
title = "NA"
authors = "NA"
journal = "NA"
fields = [clean_acc, taxonomy, organism, clone, environmental_sample, isolation_source, host, country, title, authors, journal]
OUT.write("\t".join(fields)+ "\n")
OUT.close()
def rename_sequences_ref(in_gb, infile, ref_acc, outfile):
# initialize gb_tax dict to store accession, last taxonomy level, and organism name
gb_tax = {}
#result_handle = open(in_gb, "U")
# parse gb file; block to go through gb file for acc not in ref
gbfiles = SeqIO.parse(in_gb, 'gb')
# make dict of acc, last level, organism
for rec in gbfiles:
acc = rec.id
clean_acc = re.sub(r'\.[1-9]', '', acc)
#print clean_acc
if clean_acc in ref_acc:
next
else:
if 'organism' in rec.annotations:
organism = rec.annotations['organism']
#if the taxonomy string exists but is empty
if not rec.annotations['taxonomy']:
name = organism
name = name.replace(" ", "_")
#print name
#print organism
gb_tax[clean_acc] = name
elif 'taxonomy' in rec.annotations:
# get last level of taxonomy
taxonomy = rec.annotations['taxonomy']
# add accession plus last taxonomy level and organism name to tax dict
name = taxonomy[-1]+"_"+ organism
#if the taxonomy is just absent entirely
elif 'taxonomy' not in rec.annotations:
name = organism
name = name.replace(" ", "_")
gb_tax[clean_acc] = name
# go through fasta file. Match accessions first to reference taxonomy (ref_acc) then to gb_tax
for line in open(infile, "U"):
if line.startswith(">"):
#seq_acc = None
if "_" in line:
seq_acc = line.split("_")[0]
seq_acc = seq_acc.replace(">", "")
# case of old gb format with >gi|noginumber|gb|KT210044
elif "|" in line:
acc = seq.split('|')[3]
seq_acc = re.sub(r'\.[1-9]', '', acc)
seq_acc = seq_acc.replace(">", "")
# case of just accession # in header, or accession followed by white space
else:
seq_acc = line.split()[0]
seq_acc = seq_acc.replace(">", "")
if seq_acc in ref_acc:
taxonomy = ref_acc[seq_acc]
#name = None
if "; __" in taxonomy:
last_level = taxonomy.split("; __")
name = '_'.join(last_level[-2:])
#tax[seq_acc] = '_'.join(last_level[-2:-1])
else:
last_level = taxonomy.split(";")
name = '_'.join(last_level[-2:])
outfile.write(">%s_%s\n" % (seq_acc,name))
elif seq_acc in gb_tax:
outfile.write(">%s_%s\n" % (seq_acc, gb_tax[seq_acc]))
# incase accession is missing.
else:
outfile.write("%s\n" % (line.strip()))
else:
outfile.write("%s\n" % (line.strip()))
# ###################################################################
# ######################### SCRIPT ITSELF ###########################
# ###################################################################
# Make dict of silva taxonomy (or PR2) if provided.
# how to handle SILVA? read whole taxonomy file into dictionary with key, value as accession, taxonomy?
# Maybe here only want to make a dict of accession and whole taxonomy. do not split unless necessary.
ref_accessions = {}
if args.ref_tax is not None:
for line in open(ref_tax, "U"):
# split by \t
acc = line.strip().split('\t')
ref_accessions[acc[0]] = acc[1]
#print ref_accessions[acc[0]]
# run eukref_gbmetadata.py version that also reports reference taxonomy if given gb file AND given reference taxonomy file (e.g. Silva taxonomy file)
if args.input_gb_file_fp is not None and args.ref_tax is not None:
# accessions is a dictionary of accessions in file (with name?)
metadata_retrieve_ref(input_gb_file_fp, outmeta, ref_accessions)
#run eukref_gbmetadata.py if gb file given.
if args.input_gb_file_fp is not None and args.ref_tax is None:
metadata_retrieve(input_gb_file_fp, outmeta)
print "metadata file is %s" % (outmeta)
if args.outgroup is not None:
# run outgroup renaming and write outgroup seqs to outfile
rename_outgroup(outgroup, outfasta)
# doesn't really make sense to only use ref info - should also use gb record.
if args.input_fasta_file_fp is not None and args.ref_tax is not None and args.input_gb_file_fp is not None:
# need to make this function
rename_sequences_ref(input_gb_file_fp, input_fasta_file_fp, ref_accessions, outfasta)
print "fasta file for tree is %s" % (outfasta)
outfasta.close()
outmeta.close()