Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Created some .gitignores, added a script to try downloading and proce…

…ssing the NCBI Taxonomy files automatically. Work in progress.
  • Loading branch information...
commit dfaf3f4c8446b5f0c9c15205f30aec11bcbf1f70 1 parent 4ee76d2
@gaurav authored
View
2  .gitignore
@@ -0,0 +1,2 @@
+*.swp
+*~
View
2  root/data/ncbi-taxonomy/.gitignore
@@ -0,0 +1,2 @@
+*.dmp
+*.tar.gz*
View
61 root/data/ncbi-taxonomy/README.txt
@@ -0,0 +1,61 @@
+*.dmp files are bcp-like dumps from the GenBank taxonomy database.
+
+General information.
+Field terminator is "\t|\t"
+Row terminator is "\t|\n"
+
+nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
+fields:
+ tax_id -- node id in GenBank taxonomy database
+ parent tax_id -- parent node id in GenBank taxonomy database
+ rank -- rank of this node (superkingdom, kingdom, ...)
+ embl code -- locus-name prefix; not unique
+ division id -- see division.dmp file
+ inherited div flag (1 or 0) -- 1 if node inherits division from parent
+ genetic code id -- see gencode.dmp file
+ inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent
+ mitochondrial genetic code id -- see gencode.dmp file
+ inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
+ GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage
+ hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
+ comments -- free-text comments and citations
+
+Taxonomy names file (names.dmp):
+ tax_id -- the id of node associated with this name
+ name_txt -- name itself
+ unique name -- the unique variant of this name if name not unique
+ name class -- (synonym, common name, ...)
+
+Divisions file (division.dmp):
+ division id -- taxonomy database division id
+ division cde -- GenBank division code (three characters), e.g. BCT, PLN, VRT, MAM, PRI...
+ division name -- full name of the division
+ comments
+
+Genetic codes file (gencode.dmp):
+ genetic code id -- GenBank genetic code id
+ abbreviation -- genetic code name abbreviation
+ name -- genetic code name
+ cde -- translation table for this genetic code
+ starts -- start codons for this genetic code
+
+Deleted nodes file (delnodes.dmp):
+ tax_id -- deleted node id
+
+Merged nodes file (merged.dmp):
+ old_tax_id -- id of the node which has been merged
+ new_tax_id -- id of the node which is the result of the merging
+
+Citations file (citations.dmp):
+ cit_id -- the unique id of citation
+ cit_key -- citation key
+ pubmed_id -- unique id in PubMed database (0 if not in PubMed)
+ medline_id -- unique id in MedLine database (0 if not in MedLine)
+ url -- URL associated with citation
+ text -- any text (usually article name and authors).
+ -- The following characters are escaped in this text by a backslash:
+ -- newline (appear as "\n"),
+ -- tab character ("\t"),
+ -- double quotes ('\"'),
+ -- backslash character ("\\").
+ taxid_list -- list of node ids separated by a single space
View
149 root/data/ncbi-taxonomy/process.pl
@@ -0,0 +1,149 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+process.pl - Download NCBI Taxonomy dumps, extract the version number, and check the files.
+
+=head1 SYNOPSIS
+
+ ./process.pl
+
+ (Please ensure that there are no files named 'readme.txt' in the directory
+ where you run this file: it will be overwritten).
+
+=head1 DESCRIPTION
+
+You'll need to edit this file directly to change the filenames being used.
+
+=cut
+
+use 5.0100;
+
+use strict;
+use warnings;
+
+our $VERSION = '0.1';
+
+use Data::Dumper;
+use LWP::UserAgent;
+use Archive::Tar;
+use Bio::DB::Taxonomy;
+
+=head2 CONFIGURATION
+
+=head3 TAXDUMP_URL
+
+The URL to use to download the TAXDUMP.TAR.GZ file.
+
+=cut
+
+my $TAXDUMP_URL = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz";
+
+=head3 LOCAL_FILENAME
+
+The filename under which the downloaded TAXDUMP.TAR.GZ file is saved.
+Stale copies matching this name are deleted before each download.
+
+=cut
+
+my $LOCAL_FILENAME = "taxdump.tar.gz";
+
+=head2 CODE
+
+Code begins here.
+
+=over 4
+
+=item Clear up our environment.
+
+=cut
+
+# Remove anything left over from a previous run: first old copies of the
+# archive (including partial downloads sharing its prefix), then the
+# previously extracted dump files.
+my @stale_archives = glob("$LOCAL_FILENAME*");
+unlink @stale_archives;
+
+my @stale_dumps = glob("*.dmp");
+unlink @stale_dumps;
+
+=item Download the tar.gz file.
+
+=cut
+
+say "Beginning download of NCBI files from $TAXDUMP_URL ...";
+
+# The trailing space in the agent string is deliberate: LWP appends its own
+# default "libwww-perl/x.y" identifier to an agent string that ends in a space.
+my $ua = LWP::UserAgent->new(
+    agent => "NCBITaxonomyDownload/$VERSION "
+);
+
+# mirror() writes the document to $LOCAL_FILENAME and returns the response
+# object, whose Last-Modified header is used below to identify this download.
+my $response = $ua->mirror($TAXDUMP_URL, $LOCAL_FILENAME);
+unless ($response->is_success) {
+    # Include the server's status line so the cause (DNS failure, 404,
+    # timeout, ...) is visible instead of just "check the URL".
+    die "Could not download file; please check the URL '$TAXDUMP_URL' ("
+        . $response->status_line . ").";
+}
+
+=item Let's identify this download by its last modified time.
+
+=cut
+
+# last_modified() yields the Last-Modified header as epoch seconds, or undef
+# when the server did not send one. Guard against undef: gmtime(undef) would
+# otherwise quietly report 1 January 1970 as the version.
+my $last_modified_epoch = $response->headers->last_modified;
+my $last_modified = defined $last_modified_epoch
+    ? scalar(gmtime $last_modified_epoch) . " GMT"
+    : "an unknown date";
+
+# Record the version next to the data. Use a lexical handle and three-argument
+# open, and check close() because buffered write errors only surface there.
+open(my $version_fh, '>', 'version.txt')
+    or die "Could not open version.txt: $!";
+say {$version_fh} "Downloaded file $LOCAL_FILENAME last modified on $last_modified.";
+close($version_fh)
+    or die "Could not write version.txt: $!";
+
+say "Downloaded version number: last modified $last_modified";
+
+=item Uncompress the tar.gz file.
+
+=cut
+
+say "Uncompressing $LOCAL_FILENAME.";
+
+# Archive::Tar->new() returns undef when the archive cannot be read (for
+# example a truncated download), so fail here with the library's own error
+# rather than on a method call against undef.
+my $tar = Archive::Tar->new($LOCAL_FILENAME)
+    or die "Could not read $LOCAL_FILENAME: " . Archive::Tar->error;
+
+# extract() with no arguments unpacks every entry into the current directory
+# and returns the extracted entries; an empty result means it failed.
+my @files_extracted = map { $_->full_path } $tar->extract();
+die "Could not extract any files from $LOCAL_FILENAME: " . $tar->error
+    unless @files_extracted;
+
+say "Files extracted:\n\t" . join("\n\t", @files_extracted);
+
+=item Check the files we've got.
+
+=cut
+
+# Every file the NCBI taxonomy dump is expected to contain.
+my @expected_files = qw(
+    citations.dmp
+    delnodes.dmp
+    division.dmp
+    gc.prt
+    gencode.dmp
+    merged.dmp
+    names.dmp
+    nodes.dmp
+    readme.txt
+);
+
+print "Checking files: ";
+my @missing_files = grep { !-r $_ } @expected_files;
+if (@missing_files) {
+    # Finish the "Checking files: " line, then stop: the taxonomy load that
+    # follows cannot work without these files, so continuing only hides the
+    # real problem behind a later, more confusing failure.
+    say "failed.";
+    die "ERROR: one or more expected files are not present ("
+        . join(', ', @missing_files)
+        . ")! Please try again.\n";
+}
+say "all present.";
+
+=item Final check: try loading this up in Bio::DB::Taxonomy.
+
+=cut
+
+LAST_TEST:
+
+# Load the extracted dump files through the flat-file backend. The nodes and
+# names files are read from the current directory; -directory is the location
+# handed to the backend for its own index files.
+my $taxonomy = Bio::DB::Taxonomy->new(
+    -source    => 'flatfile',
+    -directory => '/tmp',
+    -nodesfile => 'nodes.dmp',
+    -namesfile => 'names.dmp'
+);
+
+die "ERROR: Could not initialize Bio::DB::Taxonomy" unless($taxonomy);
+
+# get_taxon() treats a lone argument as a taxon ID, so a lookup by scientific
+# name must be requested explicitly with -name.
+my $felis_catus = $taxonomy->get_taxon(-name => 'Felis catus');
+die "ERROR: Could not find 'Felis catus' in the taxonomy"
+    unless defined $felis_catus;
+say "Got: $felis_catus";
+
+# Walk from the species up to the root of the taxonomy, collecting the
+# scientific name of every node visited along the way.
+my @lineage;
+my $taxon = $felis_catus;
+while (defined $taxon) {
+    my $name = $taxon->scientific_name;
+    say "Checking: $name";
+    push @lineage, $name;
+
+    # Step up from the *current* node. Asking $felis_catus for its ancestor
+    # every time would return the same parent forever and never terminate.
+    $taxon = $taxon->ancestor;
+}
+
+say "Felis catus is: " . join(' ', @lineage);
+
+=back
+
+=cut
View
1  root/data/ncbi-taxonomy/version.txt
@@ -0,0 +1 @@
+Downloaded file taxdump.tar.gz last modified on Mon Nov 22 21:20:25 2010 GMT.
Please sign in to comment.
Something went wrong with that request. Please try again.