# Version 1.1

- This version doesn't look at unique attribute names for location information; only the attribute names listed in `attribute_type_dict` are matched.


In [5]:
import xml.sax
import json


In [14]:
class BioSamplesLocationHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in geographical information from a BioSamples XML file

    For every <BioSample> element the handler collects the text of each
    <Attribute> whose name (``harmonized_name`` if present, otherwise
    ``attribute_name``) is a key of ``attribute_type_dict``.

    * Samples with at least one such attribute are appended to ``output_file``
      as one JSON object per line: ``{"biosample_id": ..., <name>: <text>, ...}``.
    * Samples without any are appended to ``tmp_file`` as a bare accession,
      one per line.

    Both files are truncated when parsing starts.

    Parameters:
        tmp_file: path of the file receiving accessions without location data.
        output_file: path of the JSON-lines file receiving location data.
        attribute_type_dict: maps an attribute name to the list of location
            kinds it may carry (e.g. ``{'lat_lon': ['lat', 'lon']}``).
    '''

    def __init__(self, tmp_file, output_file, attribute_type_dict) -> None:
        super().__init__()
        self.tmp_file = tmp_file
        self.output_file = output_file
        self.attribute_type_dict = attribute_type_dict
        self.biosample_id = ''
        self.is_sra = False
        self.sra_id = ''
        # Initialised here so the attribute always exists, even before the
        # first <Attribute> element has been seen.
        self.attribute_name = ''
        self.location_type = self._blank_location_type()
        self.cur_dict = {}

        self.count = 0

        # Text chunks of the element currently being captured.  SAX is free
        # to split one element's text over several characters() calls, so the
        # chunks are buffered and joined when the element ends.
        self._text_parts = []

    @staticmethod
    def _blank_location_type():
        '''Return a fresh "nothing being captured" flag dictionary.'''
        return {'lat': False, 'lon': False, 'location': False}

    def startDocument(self):
        '''Truncate (or create) both result files before any record is written.'''
        for path in (self.tmp_file, self.output_file):
            with open(path, 'w', encoding='utf-8'):
                pass

    def startElement(self, name, attrs):
        '''Record the sample accession and decide whether to capture text.'''
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Id':
            if attrs.get('db') == 'SRA':
                self.is_sra = True
                self._text_parts = []
        elif name == 'Attribute':
            # Prefer the harmonized name; fall back to the submitter's name.
            # An attribute carrying neither cannot match and is ignored
            # instead of aborting the whole parse with a KeyError.
            if 'harmonized_name' in attrs:
                self.attribute_name = attrs['harmonized_name']
            else:
                self.attribute_name = attrs.get('attribute_name', '')

            self.location_type = self._blank_location_type()
            self._text_parts = []

            # Check if the attribute name could be a lat, lon or location
            if self.attribute_name in self.attribute_type_dict:
                for key in self.attribute_type_dict[self.attribute_name]:
                    self.location_type[key] = True

    def characters(self, content):
        '''Buffer a text chunk if the current element is being captured.'''
        if self.is_sra or any(self.location_type.values()):
            self._text_parts.append(content)

    def endElement(self, name):
        '''Commit buffered text and, at the end of a sample, write its record.'''
        if name == 'Id':
            if self.is_sra:
                self.sra_id = ''.join(self._text_parts)
                self.is_sra = False
                self._text_parts = []
        elif name == 'Attribute':
            if any(self.location_type.values()):
                text = ''.join(self._text_parts)
                # An empty attribute carries no location information.
                if text:
                    self.cur_dict[self.attribute_name] = text
            # Always stop capturing here so that whitespace between elements
            # is never mistaken for an attribute value.
            self.location_type = self._blank_location_type()
            self._text_parts = []
        elif name == 'BioSample':
            self.count += 1
            if self.count % 10000 == 0:
                print(self.count)

            if not self.cur_dict:
                with open(self.tmp_file, 'a', encoding='utf-8') as f:
                    f.write(self.biosample_id + '\n')
            else:
                record = {'biosample_id': self.biosample_id, **self.cur_dict}
                with open(self.output_file, 'a', encoding='utf-8') as f:
                    json.dump(record, f)
                    f.write('\n')

            # Reset all per-sample state for the next <BioSample>.
            self.cur_dict = {}
            self.location_type = self._blank_location_type()
            self.is_sra = False
            self._text_parts = []
            self.biosample_id = ''
            self.sra_id = ''

    def endDocument(self):
        '''Report that the whole document has been processed.'''
        print('Finished parsing BioSamples XML file')




In [15]:
# Lookup table: attribute name -> list of location kinds that attribute can
# hold ('lat', 'lon' and/or 'location').
attribute_type_dict = {
    'birth_location': ['location'],
    'geo_loc_name': ['location'],
    'geo_loc_exposure': ['location'],
    'host_recent_travel_loc': ['location'],
    'lat_lon': ['lat', 'lon'],
    'lat': ['lat'],
    'lon': ['lon'],
}

# Build the handler first, then attach it to a freshly created SAX parser.
handler = BioSamplesLocationHandler(
    tmp_file='/home/ec2-user/workspace/data/results/location_tmp.jsonl',
    output_file='/home/ec2-user/workspace/data/results/location_output.jsonl',
    attribute_type_dict=attribute_type_dict,
)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Stream the sampled BioSamples dump through the handler.
parser.parse('/home/ec2-user/workspace/data/biosample_random_samples.xml')

10000
Finished parsing BioSamples XML file
