# Version 1.0


In [1]:
import xml.sax
import re


In [20]:
class BioSamplesAgeHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in BioSamples information from a given set of attribute tags 
        - Reads and stores the attribute information in a dictionary
    '''
    def __init__(self, sample_dict, attribute_set) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.attribute_set = attribute_set
        self.attribute_name = ''
        self.biosample_id = ''
        self.attribute_value = ''

    def startElement(self, name, attrs):
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
        

    def characters(self, content):
        if self.attribute_name in self.attribute_set:
            self.attribute_value = content

        self.attribute_name = ''

    def endElement(self, name):
        if name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.attribute_value
            self.biosample_id = ''
            self.attribute_value = ''
    
    def endDocument(self):
        print('Finished parsing BioSamples XML file')    

In [21]:
sample_dict = {} 
attribute_set = {'age', 'host_age'}
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'

parser = xml.sax.make_parser()
handler = BioSamplesAgeHandler(sample_dict, attribute_set)
parser.setContentHandler(handler)
parser.parse(biosamples_path)

>
Finished parsing BioSamples XML file


In [6]:
def preprocess(input_str):
    '''
    Preprocesses the input string to remove any non-alphanumeric characters and convert to lowercase
    '''
    return re.sub(r'\ *[_&<>:-]+\ *', ' ', input_str).lower()

In [26]:
import pprint 

# filter the dictionary for values that are not empty strings
positive_sample_dict = {k: v for k, v in sample_dict.items() if v != ''}

pprint.pprint(positive_sample_dict)

{'SAMD00133768': '12 wk',
 'SAMD00134506': '56',
 'SAMD00142065': 'missing',
 'SAMD00176343': '2',
 'SAMD00209188': '47',
 'SAMD00243312': '24 weeks',
 'SAMD00244426': '25',
 'SAMD00334798': '>',
 'SAMD00439565': 'DAT72',
 'SAMD00505775': 'missing',
 'SAMD00529505': '12 months',
 'SAMD00549931': 'missing',
 'SAMEA10313618': '37',
 'SAMEA103921040': '175',
 'SAMEA104140940': 'adult',
 'SAMEA104181591': '3.0',
 'SAMEA104229780': '15',
 'SAMEA104255718': '31.6',
 'SAMEA104263357': '53',
 'SAMEA104270891': '62',
 'SAMEA104306271': 'Missing: Not provided',
 'SAMEA104322278': '6 to 14',
 'SAMEA104323318': 'not available',
 'SAMEA104382892': '20 to 24',
 'SAMEA10793065': 'not collected',
 'SAMEA10805127': '14',
 'SAMEA110043299': 'not collected',
 'SAMEA110133226': '7',
 'SAMEA110178717': '6 to 8',
 'SAMEA110226702': '31-40',
 'SAMEA110371077': '9',
 'SAMEA110406299': '50',
 'SAMEA110447250': '31-40',
 'SAMEA110584194': '5-9',
 'SAMEA110645315': '25-29',
 'SAMEA110648547': '15-19',
 'SAMEA110