# Version 1.1
- Added more attribute values to the set of possible attributes
- Added a more general classification for the different values

## Issues
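- Values written with a bracketed suffix are not matched (e.g. 'female (f)')
- Multiple values joined by 'and' or punctuation are not matched (e.g. 'females and males')
- Some terms are not included in the set of accepted values (e.g. 'asexual morph')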

In [2]:
import pandas as pd
import xml.sax
import re

In [49]:
class BioSamplesSexHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in BioSamples information from a given set of attribute tags
        - Reads and stores the attribute information in a dictionary

    For every ``BioSample`` element, the text of each ``Attribute`` element whose
    name (``harmonized_name`` if present, otherwise ``attribute_name``) is in
    ``attribute_set`` is collected. When the ``BioSample`` element closes, the
    collected ``{attribute name: text}`` mapping is stored in ``sample_dict``
    under the sample's ``accession``.

    Parameters:
        sample_dict: mapping that is filled in place with one entry per BioSample
        attribute_set: container of attribute names whose values should be kept
    '''
    def __init__(self, sample_dict, attribute_set) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_set = attribute_set
        self.attribute_name = ''
        self.biosample_id = ''
        self.attribute_dict = {}
        # Text chunks of the Attribute currently being read; None while we are
        # not inside an Attribute element that should be recorded.
        self._text_parts = None

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
            # Only start buffering text for attributes we were asked to keep.
            if self.attribute_name in self.attribute_set:
                self._text_parts = []
            else:
                self._text_parts = None

    def characters(self, content):
        # A SAX parser may deliver the text of one element in several chunks,
        # so the chunks are accumulated here and joined in endElement instead
        # of treating the first chunk as the whole value.
        if self._text_parts is not None:
            self._text_parts.append(content)

    def endElement(self, name):
        if name == 'Attribute':
            if self._text_parts is not None:
                # An empty element yields '' rather than picking up the
                # whitespace that follows the closing tag.
                self.attribute_dict[self.attribute_name] = ''.join(self._text_parts)
            self.attribute_name = ''
            self._text_parts = None
        elif name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_dict
            # Reset per-sample state for the next BioSample element.
            self.biosample_id = ''
            self.attribute_dict = {}

    def endDocument(self):
        print('Finished parsing BioSamples XML file')


In [68]:
# Filled in place by the handler: one entry per BioSample accession.
sample_dict = {}

# Attribute names whose values are collected for every sample.
attribute_set = {
    'animal_sex',
    'host_sex',
    'sex',
    'plant_sex',
    'vioscreen_gender',
    'offspring gender',
    'animal_gender',
    'pm_gender',
    'demographics_gender',
}
# Narrower alternative used previously:
# attribute_set = {'animal_sex', 'host_sex', 'sex', 'plant_sex'}

biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

# Wire the handler into a SAX parser and stream the XML file through it.
handler = BioSamplesSexHandler(sample_dict, attribute_set)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

Finished parsing BioSamples XML file


In [69]:
# Optional inspection helpers (disabled): dump everything / count non-empty entries.
# print(sample_dict)
# len(list(s for i, s in sample_dict.items() if s != ''))

# Spot check: show the attributes collected for a single sample accession.
print(sample_dict['SAMN06317982'])

{'offspring gender': 'male'}


In [54]:
def preprocess_str(input_str):
    '''
    Normalises a single attribute value for matching
        - Each run of the characters _ & < > : - (together with any spaces directly
          around it) is replaced by a single space
        - The result is converted to lowercase
    '''
    separator_run = re.compile(r'\ *[_&<>:-]+\ *')
    spaced = separator_run.sub(' ', input_str)
    return spaced.lower()

def preprocess(input_dict):
    '''
    Applies preprocess_str to every value of the input dictionary
        - Keys are kept unchanged
        - Returns a new dictionary; the input dictionary is not modified
    '''
    cleaned = {}
    for name, raw_value in input_dict.items():
        cleaned[name] = preprocess_str(raw_value)
    return cleaned

In [70]:
# Accepted (already lower-cased) attribute values.
# The values up to and including 'unisexual' are taken from the NCBI BioSamples
# documentation; the remaining ones are additions.
sex_values = {
    'male', 'female', 'pooled male and female', 'male and female', 'neuter',
    'hermaphrodite', 'intersex',
    'androdioecious', 'androecious', 'androgynous', 'androgynomonoecious',
    'andromonoecious', 'bisexual', 'dichogamous', 'diclinous', 'dioecious',
    'gynodioecious', 'gynoecious', 'gynomonoecious', 'hermaphroditic',
    'imperfect', 'monoclinous', 'monoecious', 'perfect',
    'polygamodioecious', 'polygamomonoecious', 'polygamous',
    'protandrous', 'protogynous',
    'subandroecious', 'subdioecious', 'subgynoecious', 'synoecious',
    'trimonoecious', 'trioecious', 'unisexual',
    # additions beyond the documented list
    'mixed', 'mixed sex', 'm', 'f',
}

# Spellings that denote a male / female sample respectively.
male_sex = {'male', 'm'}
female_sex = {'female', 'f'}


# Per-sample result: the list of recognised values, or the string 'na' if none matched.
matches_dict = {}
# negative_values stores all the attribute values that were not matched to the above list
negative_values = set()

for biosample_id, content in sample_dict.items():
    positive_values = []
    # Only the normalised values matter here; the attribute names are not used.
    for cleaned in preprocess(content).values():
        if cleaned and cleaned in sex_values:
            positive_values.append(cleaned)
        else:
            negative_values.add(cleaned)

    matches_dict[biosample_id] = positive_values if positive_values else 'na'

In [71]:
# Values that explicitly say "no information"; these are expected non-matches.
unknown_values = {
    'na', '?', 'n/a', ' ',
    'restricted access', 'missing', 'missing not provided',
    'not determined', 'not collected', 'not provided', 'not applicable',
    'not specified', 'not recorded', 'not available',
    'unknown', 'unknown sex',
}

# prints the negative values that could be false negatives
possible_false_negatives = [v for v in negative_values if v not in unknown_values]
print(possible_false_negatives)

# Split the sample ids by whether at least one value was recognised.
positive_matches = []
negative_matches = []
for sample_id, match in matches_dict.items():
    if match == 'na':
        negative_matches.append(sample_id)
    else:
        positive_matches.append(sample_id)
print('Number of positive matches:', len(positive_matches))

['asexual morph', 'unspecified', 'sf', 'hermaphrodite and monoecious', 'females and males', '47', 'mating type h ', 'mating type a/mating type alpha', '48', 'labcontrol test', 'female (f)', 'mating type a']
Number of positive matches: 3462
