Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Tree: 12f4581ac7
Fetching contributors…

Cannot retrieve contributors at this time

480 lines (423 sloc) 16.66 kB
module Biomart
# Class representation for a biomart dataset.
# Can belong to a Biomart::Database and a Biomart::Server.
class Dataset
include Biomart
attr_reader :name, :display_name, :visible
# Builds a Biomart::Dataset from a server URL and a hash of dataset details.
#
# @param [String] url The URL location of the biomart server.
# @param [Hash] args An arguments hash giving details of the dataset.
#
# Recognised keys (each is looked up under its String form first and
# then under its Symbol form):
#
#   "name"        / :name          # stored as the dataset name
#   "displayName" / :display_name  # stored as the display name
#   "visible"     / :visible       # any truthy value marks the dataset visible
#
# @raise [ArgumentError] Raised when url is nil or false
def initialize( url, args )
  @url = url
  raise ArgumentError, "must pass :url" unless @url

  # Point at the martservice endpoint unless the URL already mentions it.
  @url = @url + "/martservice" unless @url =~ /martservice/

  @name         = args["name"]        || args[:name]
  @display_name = args["displayName"] || args[:display_name]
  @visible      = !!( args["visible"] || args[:visible] )

  # All four containers start out empty.
  @filters, @attributes, @importables, @exportables = {}, {}, {}, {}
end
# Gives access to every Biomart::Filter object held for this dataset,
# keyed by the biomart 'internal_name' of the filter. The dataset
# configuration is fetched first whenever no filters are held yet.
#
# @return [Hash] A hash of Biomart::Filter objects keyed by 'internal_name'
def filters
  fetch_configuration if @filters.empty?
  @filters
end
# Lists the names (biomart 'internal_name') of the filters held for this
# dataset. The dataset configuration is fetched first whenever no filters
# are held yet.
#
# @return [Array] An array of filters (their 'internal_name's)
def list_filters
  fetch_configuration if @filters.empty?
  @filters.keys
end
# Gives access to every Biomart::Attribute object held for this dataset,
# keyed by the biomart 'internal_name' of the attribute. The dataset
# configuration is fetched first whenever no attributes are held yet.
#
# @return [Hash] A hash of Biomart::Attribute objects keyed by 'internal_name'
def attributes
  fetch_configuration if @attributes.empty?
  @attributes
end
# Lists the names (biomart 'internal_name') of the attributes held for
# this dataset. The dataset configuration is fetched first whenever no
# attributes are held yet.
#
# @return [Array] An array of attributes (their 'internal_name's)
def list_attributes
  fetch_configuration if @attributes.empty?
  @attributes.keys
end
# Runs a Biomart count query and hands the result back as an integer.
#
# arguments:
#
#   {
#     :timeout => integer, # set a timeout length for the request (secs) - optional
#     :filters => {}       # hash of key-value pairs (filter => search term) - optional
#   }
#
# An :attributes entry, when present, is forwarded to #generate_xml as well.
#
# @param [Hash] args The arguments hash
# @return [Integer] The response of the request converted with #to_i
# @raise Biomart::ArgumentError Raised when un-supported arguments are passed
def count( args={} )
  raise Biomart::ArgumentError, "You cannot federate a count query." if args[:federate]
  raise Biomart::ArgumentError, "The :required_attributes option is not allowed on count queries." if args[:required_attributes]

  count_query = generate_xml(
    :filters    => args[:filters],
    :attributes => args[:attributes],
    :count      => "1"
  )
  response = request(
    :method  => 'post',
    :url     => @url,
    :timeout => args[:timeout],
    :query   => count_query
  )
  response.to_i
end
# Runs a Biomart search.
#
# optional arguments:
#
#   {
#     :process_results     => true/false, # convert search results to object
#     :timeout             => integer,    # set a timeout length for the request (secs)
#     :filters             => {},         # hash of key-value pairs (filter => search term)
#     :attributes          => [],         # array of attributes to retrieve
#     :required_attributes => [],         # array of attributes that are required
#     :federate            => [
#       {
#         :dataset    => Biomart::Dataset, # A dataset object to federate with
#         :filters    => {},               # hash of key-value pairs (filter => search term)
#         :attributes => []                # array of attributes to retrieve
#       }
#     ]
#   }
#
# When no :filters or :attributes are passed, the defaults for the dataset
# are used.
#
# The :required_attributes option applies AND logic: a row is only kept
# when it has data in every one of the listed attributes.
#
# The default return value is a hash:
#
#   {
#     :headers => [], # array of headers
#     :data    => []  # array of arrays containing search results
#   }
#
# With the :process_results option the return value is instead an array of
# hashes, one per result row, keyed by the attribute name.
#
# @param [Hash] args The arguments hash
# @return [Hash/Array] Will return a hash by default (of unprocessed data), or will return an array of hashes
# @raise Biomart::ArgumentError Raised if incorrect arguments are passed
def search( args={} )
  required = args[:required_attributes]
  if required && !required.is_a?(Array)
    raise Biomart::ArgumentError, "The :required_attributes option must be passed as an array."
  end

  query_xml = generate_xml( process_xml_args(args) )
  tsv = request(
    :method  => 'post',
    :url     => @url,
    :timeout => args[:timeout],
    :query   => query_xml
  )

  # Each optional post-processing step replaces the previous result.
  outcome = process_tsv( args, tsv )
  outcome = filter_data_rows( args, outcome ) if args[:required_attributes]
  outcome = conv_results_to_a_of_h( outcome ) if args[:process_results]
  outcome
end
# Builds the Biomart query XML - used by #count and #search.
#
# The document is written through a Builder::XmlMarkup into a string, which
# is returned. It holds one Query element containing a Dataset section for
# this dataset, followed by one Dataset section per entry of args[:federate].
#
# @param [Hash] args :filters, :attributes, :count and :federate are read
# @return [String] The string the XML was written into
# @raise Biomart::ArgumentError Raised when a :federate entry's :dataset is not a Biomart::Dataset
# @see #count
# @see #search
def generate_xml( args={} )
  buffer = ""
  markup = Builder::XmlMarkup.new( :target => buffer, :indent => 2 )
  markup.instruct!
  markup.declare!( :DOCTYPE, :Query )

  markup.Query(
    :virtualSchemaName    => "default",
    :formatter            => "TSV",
    :header               => "0",
    :uniqueRows           => "1",
    :count                => args[:count],
    :datasetConfigVersion => "0.6"
  ) do
    dataset_xml( markup, self, { :filters => args[:filters], :attributes => args[:attributes] } )

    ( args[:federate] || [] ).each do |joined|
      unless joined[:dataset].is_a?(Biomart::Dataset)
        raise Biomart::ArgumentError, "You must pass a Biomart::Dataset object to the :federate[:dataset] option."
      end
      dataset_xml( markup, joined[:dataset], { :filters => joined[:filters], :attributes => joined[:attributes] } )
    end
  end

  buffer
end
# Heartbeat check: builds a Biomart::Server for this dataset's URL and
# returns whatever its #alive? reports.
#
# @return [Boolean] true/false
def alive?
  Biomart::Server.new( @url ).alive?
end
private
# Retrieves the configuration xml for this dataset and fills @filters and
# @attributes from it.
#
# Filters are read from FilterDescription elements and from their Option
# children:
# * an element with a displayType is stored under its internalName, unless
#   the displayType is "container", in which case it is skipped;
# * an element without a displayType but with a pointerFilter is stored
#   twice, under the filter's name and under its pointer_filter.
#
# Every AttributeDescription element is stored under its internalName.
def fetch_configuration
  config_url = @url + "?type=configuration&dataset=#{@name}"
  document   = REXML::Document.new( request( :url => config_url ) )

  # Filters...
  ['//FilterDescription','//FilterDescription/Option'].each do |filter_xpath|
    REXML::XPath.each( document, filter_xpath ) do |node|
      attrs = node.attributes
      if attrs["displayType"]
        next if attrs["displayType"] == "container"
        @filters[ attrs["internalName"] ] = Filter.new( attrs )
      elsif attrs["pointerFilter"]
        pointer = Filter.new( attrs )
        @filters[ pointer.name ]           = pointer
        @filters[ pointer.pointer_filter ] = pointer
      end
    end
  end

  # Attributes are much simpler...
  REXML::XPath.each( document, '//AttributeDescription' ) do |node|
    @attributes[ node.attributes["internalName"] ] = Attribute.new( node.attributes )
  end
end
# Picks out and checks the arguments that are handed on for the xml query.
#
# :filters and :attributes are always copied across. :federate is copied
# only when given, and it must then be an array holding exactly one entry.
#
# @param [Hash] args The arguments hash
# @return [Hash] A hash with :filters, :attributes and (when given) :federate
# @raise Biomart::ArgumentError Raised when :federate is not an array of size 1
def process_xml_args( args={} )
  federation = args[:federate]
  prepared   = { :filters => args[:filters], :attributes => args[:attributes] }
  return prepared unless federation

  unless federation.is_a?(Array)
    raise Biomart::ArgumentError, "The :federate option must be passed as an array."
  end
  if federation.size != 1
    raise Biomart::ArgumentError, "Sorry, we can only federate two datasets at present. This limitation shall be lifted in version 0.8 of biomart."
  end

  prepared[:federate] = federation
  prepared
end
# Writes the Dataset portion of the biomart query xml for one dataset.
#
# Filters: when args[:filters] is given, one Filter element is written per
# entry. Boolean filters get an 'excluded' flag ('0' for true/'included'/
# 'only', '1' for false/'excluded'; strings are compared in lower case),
# all other filters get a 'value' (arrays are joined with ","). Without
# args[:filters], the dataset's default filters are written instead.
#
# Attributes: skipped entirely when args[:count] is set. Otherwise the
# names in args[:attributes] are written, or the dataset's default
# attributes when none are given.
#
# @param xml The markup object the elements are written to
# @param dataset The dataset whose name, filters and attributes are used
# @param [Hash] args :filters, :attributes and :count are read
# @raise Biomart::ArgumentError Raised for an unknown filter name or an unusable boolean filter value
def dataset_xml( xml, dataset, args )
  xml.Dataset( :name => dataset.name, :interface => "default" ) do
    if args[:filters]
      args[:filters].each do |name, value|
        raise Biomart::ArgumentError, "The filter '#{name}' does not exist" if dataset.filters[name].nil?

        unless dataset.filters[name].type == 'boolean'
          value = value.join(",") if value.is_a? Array
          xml.Filter( :name => name, :value => value )
          next
        end

        value = value.downcase if value.is_a? String
        case value
        when true, 'included', 'only'
          xml.Filter( :name => name, :excluded => '0' )
        when false, 'excluded'
          xml.Filter( :name => name, :excluded => '1' )
        else
          raise Biomart::ArgumentError, "The boolean filter '#{name}' can only accept 'true/included/only' or 'false/excluded' arguments."
        end
      end
    else
      dataset.filters.each do |name, filter|
        next unless filter.default?
        option = filter.type == 'boolean' ? :excluded : :value
        xml.Filter( :name => name, option => filter.default_value )
      end
    end

    unless args[:count]
      if args[:attributes]
        args[:attributes].each { |name| xml.Attribute( :name => name ) }
      else
        dataset.attributes.each do |name, attribute|
          xml.Attribute( :name => name ) if attribute.default?
        end
      end
    end
  end
end
# Transforms the tab-separated data retrieved from a Biomart search query
# into a ruby object.
#
# Headers are collected first: this dataset's attributes, followed by the
# attributes of every dataset listed in args[:federate]. The data is then
# parsed in one go with CSV; when CSV rejects it as malformed, it is parsed
# again via #parse_tsv_line_by_line.
#
# @param [Hash] args The search arguments (:attributes and :federate are read)
# @param [String] tsv The tab-separated response body
# @return [Hash] { :headers => [], :data => [] }
def process_tsv( args, tsv )
  headers = []
  append_header_attributes_for_tsv( headers, self, args[:attributes] )
  if args[:federate]
    args[:federate].each do |joined_dataset|
      append_header_attributes_for_tsv( headers, joined_dataset[:dataset], joined_dataset[:attributes] )
    end
  end

  parsed_data = []
  if CSV.const_defined? :Reader
    # Ruby < 1.9 CSV code
    begin
      parsed_data = CSV.parse( tsv, "\t" )
    rescue CSV::IllegalFormatError
      parsed_data = parse_tsv_line_by_line( headers.size, tsv )
    end
  else
    # Ruby >= 1.9 CSV code
    begin
      # The options are passed brace-less on purpose: Ruby 1.9/2.x receive
      # them as an options hash, Ruby 3+ as keyword arguments. A literal
      # { ... } hash is a second positional argument on Ruby 3+ and raises
      # ArgumentError there.
      parsed_data = CSV.parse( tsv, :col_sep => "\t", :skip_blanks => true )
    rescue CSV::MalformedCSVError
      parsed_data = parse_tsv_line_by_line( headers.size, tsv )
    end
  end

  {
    :headers => headers,
    :data    => parsed_data
  }
end
# Appends attribute names to the 'headers' array used when processing the
# returned results: the given attributes when there are any, otherwise the
# names of the dataset's default attributes.
#
# @param [Array] headers The array that is appended to (modified in place)
# @param dataset The dataset whose default attributes are the fallback
# @param [Array, nil] attributes The attribute names that were requested
def append_header_attributes_for_tsv( headers, dataset, attributes )
  return attributes.each { |requested| headers << requested } if attributes

  dataset.attributes.each do |name, attribute|
    headers << name if attribute.default?
  end
end
# Processes TSV formatted data that raised errors when parsed as a whole.
# (Biomart has a habit of serving out this...) Each line is first handed to
# the CSV module's 'parse_line' function; when that yields nothing, the line
# is split on tabs instead to recover the data.
#
# Rows recovered with split are only kept when they have exactly
# expected_row_size elements; rows that CSV parsed are always kept.
#
# @param [Integer] expected_row_size The number of columns a recovered row must have
# @param [String] tsv The tab-separated data
# @return [Array] An array of rows, each an array of values (nil for blanks)
def parse_tsv_line_by_line( expected_row_size, tsv )
  parsed_data = []

  tsv.split("\n").each do |line|
    elements = []
    if CSV.const_defined? :Reader
      # Ruby < 1.9 CSV code
      elements = CSV::parse_line( line, "\t" ) || []
    else
      # Ruby >= 1.9 CSV code
      begin
        # Brace-less options: an options hash on Ruby 1.9/2.x, keyword
        # arguments on Ruby 3+. A literal { ... } hash raises ArgumentError
        # on Ruby 3+, which the rescue below would not catch.
        elements = CSV.parse_line( line, :col_sep => "\t" ) || []
      rescue CSV::MalformedCSVError
        elements = []
      end
    end

    if elements.empty?
      # This is a bad line (causing the above Exception), try and use split
      # to recover. The -1 limit keeps trailing empty fields, so a row whose
      # last attributes are empty retains its full width. Blank strings are
      # substituted with nils.
      recovered = line.split("\t", -1).map { |elem| elem == "" ? nil : elem }

      # Add a safety clause...
      parsed_data.push(recovered) if recovered.size == expected_row_size
    else
      parsed_data.push(elements)
    end
  end

  parsed_data
end
# Converts a search result into an array of hashes, one per data row, keyed
# by the attribute name, for easier processing. This is not done by default
# on all searches as it can cause a large overhead on big data returns.
#
# @param [Hash] search_results A hash with :headers and :data
# @return [Array] An array of hashes (header => value), one per row
def conv_results_to_a_of_h( search_results )
  search_results[:data].map do |row|
    record = {}
    row.each_with_index do |value, position|
      record[ search_results[:headers][position] ] = value
    end
    record
  end
end
# Removes data rows from a search result that do not include the
# :required_attributes.
#
# The column positions are taken from args[:attributes], or from this
# dataset's default attributes when that list is missing or empty. A row is
# kept only when none of the required positions holds nil.
#
# @param [Hash] args The search arguments (:attributes and :required_attributes are read)
# @param [Hash] result A hash with :headers and :data
# @return [Hash] A new hash with the same :headers and the filtered :data
def filter_data_rows( args, result )
  # Get the list of attributes searched for. Work on a copy so that the
  # defaults pushed below never end up in the caller's own array.
  searched = ( args[:attributes] || [] ).dup
  if searched.empty?
    # 'attributes' here is this dataset's #attributes method.
    attributes.each do |name, attribute|
      searched.push(name) if attribute.default?
    end
  end

  # Work out which attribute positions we need to test...
  required = args[:required_attributes]
  positions_to_test = []
  searched.each_index do |index|
    positions_to_test.push(index) if required.include?(searched[index])
  end

  # Now go through the results and filter out the unwanted data...
  filtered_data = result[:data].select do |data_row|
    positions_to_test.all? { |position| !data_row[position].nil? }
  end

  {
    :headers => result[:headers],
    :data    => filtered_data
  }
end
end
end
Jump to Line
Something went wrong with that request. Please try again.