# Version 1.0
- Uses a set of attribtues to check for the sex of the organism described in the biosample
- Matched the value of the tag to a set of possible values 
    - Values were taken from the BioSamples documentation 
    - Additional values were added 

## Issues
- Does not match to values in brackets 
- Multiple values with 'and' or punctuation are not matched
- Some terms not included (asexual morph)

In [1]:
import pandas as pd
import xml.sax
import re

In [2]:
class BioSamplesSexHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in BioSamples information from a given set of attribute tags 
        - Reads and stores the attribute information in a dictionary
    '''
    def __init__(self, sample_dict, attribute_set) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_set = attribute_set
        self.attribute_name = ''
        self.biosample_id = ''
        self.attribute_value = ''

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
        

    def characters(self, content):
        if self.attribute_name in self.attribute_set:
            self.attribute_value = content

        self.attribute_name = ''

    def endElement(self, name):
        if name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_value
            self.biosample_id = ''
            self.attribute_value = ''
    
    def endDocument(self):
        print('Finished parsing BioSamples XML file')    


In [3]:
sample_dict = {} 
attribute_set = {'animal_sex', 'host_sex', 'sex', 'plant_sex'}
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

parser = xml.sax.make_parser()
handler = BioSamplesSexHandler(sample_dict, attribute_set)
parser.setContentHandler(handler)
parser.parse(biosamples_path)

Finished parsing BioSamples XML file


In [14]:
def preprocess(input_str):
    '''
    Preprocesses the input string to remove any non-alphanumeric characters and convert to lowercase
    '''
    return re.sub(r'\ *[_&<>:-]+\ *', ' ', input_str).lower()

In [39]:
# values (up to unisexual) taken from the NCBI BioSamples documentation
sex_values = {'male', 'female', 'pooled male and female', 'male and female', 'neuter', 'hermaphrodite', 'intersex', 'androdioecious', 'androecious', 'androgynous', 'androgynomonoecious', 'andromonoecious', 'bisexual', 'dichogamous', 'diclinous', 'dioecious', 'gynodioecious', 'gynoecious', 'gynomonoecious', 'hermaphroditic', 'imperfect', 'monoclinous', 'monoecious', 'perfect', 'polygamodioecious', 'polygamomonoecious', 'polygamous', 'protandrous', 'protogynous', 'subandroecious', 'subdioecious', 'subgynoecious', 'synoecious', 'trimonoecious', 'trioecious', 'unisexual', 'mixed', 'mixed sex', 'm', 'f'}

matches_dict = {}
# negative_values stores all the attribute values that were not matched to the above list
negative_values = set()

for biosample_id, content in sample_dict.items():
    value = preprocess(content)
    if content != '' and value in sex_values:
        matches_dict[biosample_id] = value
    else:
        negative_values.add(value)
        matches_dict[biosample_id] = 'na'


In [49]:
unknown_values = {'na', '?', 'n/a', 'restricted access', 'missing', 'not determined', 'not collected', ' ', 'not provided', 'not applicable', 'not specified', 'not recorded', 'not available', 'unknown', 'unknown sex', 'missing not provided'}

# prints that negative values that could be false negatives
print([v for v in negative_values if v not in unknown_values])

positive_matches = [biosample_id for biosample_id, match in matches_dict.items() if match != 'na']
negative_matches = [biosample_id for biosample_id, match in matches_dict.items() if match == 'na']
print('Number of positive matches:',len(positive_matches))

['', 'females and males', 'sf', 'hermaphrodite and monoecious', 'mating type a', '47', 'mating type h ', 'female (f)', '48', 'mating type a/mating type alpha', 'asexual morph']
Number of positive matches: 3460
