In [1]:
import xml.sax


In [2]:
class BioSamplesLocationHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in geographical information from a BioSamples XML file.
        - uses a set of attribute names to identify the relevant XML elements
        - geographical information is stored in the provided dictionary,
          keyed by the BioSample accession

    Parameters:
        sample_dict: dictionary that is filled in place; after parsing it maps
            each BioSample accession to a dict of {attribute name: text} for
            the relevant attributes found in that sample (empty dict if none).
        relevant_attribtues: container of attribute names to collect. An
            <Attribute> element is matched by its 'harmonized_name' if present,
            otherwise by its 'attribute_name'.
    '''

    def __init__(self, sample_dict, relevant_attribtues) -> None:
        super().__init__()
        self.sample_dict = sample_dict
        self.relevant_attribtues = relevant_attribtues
        self.biosample_id = ''
        self.content_dict = {}
        self.attribute_name = ''
        # Text fragments of the <Attribute> element currently being read.
        self._text_chunks = []

    def startElement(self, name, attrs):
        '''Remember the current sample accession / attribute name.'''
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
            self._text_chunks = []

    def characters(self, content):
        '''Buffer the text of a relevant attribute.

        The parser may deliver a single text node in several calls, so the
        fragments are collected here and only joined in endElement.
        '''
        if self.attribute_name in self.relevant_attribtues:
            self._text_chunks.append(content)

    def endElement(self, name):
        '''Store the finished attribute value / the finished sample.'''
        if name == 'BioSample':
            self.sample_dict[self.biosample_id] = self.content_dict
            self.content_dict = {}
        elif name == 'Attribute':
            # Only record the attribute if any text was actually received,
            # so elements without text leave no entry.
            if self._text_chunks:
                self.content_dict[self.attribute_name] = ''.join(self._text_chunks)
            self._text_chunks = []
            self.attribute_name = ''

    def endDocument(self):
        print('Finished parsing BioSamples XML file')

In [13]:
# Input file and the attribute names that count as location information.
biosamples_path = '../../data/biosamples/biosample_random_samples.xml'
location_attributes = {
    'birth_location',
    'food_dis_point',
    'food_dis_point_city',
    'food_origin',
    'geo_loc_exposure',
    'geo_loc_name',
    'host_recent_travel_loc',
    'lat_lon',
    'latitude',
    'longitude',
}

# The handler fills this dictionary in place while the file is parsed.
sample_dict = {}
handler = BioSamplesLocationHandler(sample_dict, location_attributes)

parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

print('Number of samples:', len(sample_dict))

Finished parsing BioSamples XML file
Number of samples: 10000


In [19]:
# Split the parsed samples by whether any location attribute was captured.
# Each value is a dict, so an empty dict (falsy) means "no location information".
positive_samples = {
    accession: location_info
    for accession, location_info in sample_dict.items()
    if location_info
}
negative_samples = {
    accession: location_info
    for accession, location_info in sample_dict.items()
    if not location_info
}

print('Number of samples with location information:', len(positive_samples))
print('Number of samples without location information:', len(negative_samples))

Number of samples with location information: 5410
Number of samples without location information: 4590


In [34]:
import random
import csv

# Number of samples drawn from each group for manual evaluation.
N_EVAL_SAMPLES = 50

# Seed the generator so the draw is reproducible for identical input.
random.seed(42)

# Each entry is a (biosample_id, location_info) pair.
random_positive_samples = random.sample(list(positive_samples.items()), N_EVAL_SAMPLES)
random_negative_samples = random.sample(list(negative_samples.items()), N_EVAL_SAMPLES)

# CSV export of the drawn samples (disabled).
# NOTE(review): presumably disabled so a re-run does not overwrite files that
# were annotated by hand afterwards - confirm before re-enabling.
# Compared with a naive version, the loops below:
#   - use `location_info` as loop variable, so the global `sample_dict`
#     holding the parse result is not overwritten,
#   - open the files with newline='' as the csv module expects,
#   - write an empty 'eval' field so every row matches the 4-column header.

# with open('../../data/biosamples/random_positive_location_samples.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(['sample_id', 'url', 'sample_dict', 'eval'])
#     for biosample_id, location_info in random_positive_samples:
#         url = f'https://www.ncbi.nlm.nih.gov/biosample/{biosample_id}'
#         writer.writerow([biosample_id, url, location_info, ''])


# with open('../../data/biosamples/random_negative_location_samples.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerow(['sample_id', 'url', 'sample_dict', 'eval'])
#     for biosample_id, location_info in random_negative_samples:
#         url = f'https://www.ncbi.nlm.nih.gov/biosample/{biosample_id}'
#         writer.writerow([biosample_id, url, location_info, ''])

- The 50 random negative samples were manually annotated and compared with the results found by the parser.

## False Negatives
Missed attributes:
- environmental_marine_region
- sampling event, latitude, end
- sampling event, latitude, start
- sampling event, longitude, end
- sampling event, longitude, start
- sampling site (a code that corresponds to specific coordinates)

## Results
True negatives: 49

False negatives: 1