# Version 1.2
- Extracts values from the stored set of attributes
- Labels each value as male, female, or other
    - If a value is similar to 'male and female', it is labelled as both male and female

In [83]:
import pandas as pd
import xml.sax
import re
import json

In [93]:
def preprocess(text):
    '''
    Normalises text for keyword matching.

    The text is lower-cased and every punctuation character (including
    the underscore) is replaced by a single space.

    Punctuation is replaced rather than deleted so that words joined by
    it stay separate tokens: 'Male/Female' becomes 'male female', not
    'malefemale', and word-boundary patterns such as r'\\bmale\\b' can
    still match each word. Existing whitespace is left untouched.

    Parameters:
        text (str): raw attribute value.

    Returns:
        str: the normalised text.
    '''
    text = text.lower()
    # '_' is listed explicitly because \w treats it as a word character,
    # so r'[^\w\s]' alone would leave 'male_female' as a single token.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return text

class BioSamplesHandler(xml.sax.ContentHandler):
    '''
    SAX handler to read in information from a BioSamples XML file.
    - Collects the text of every <Attribute> whose harmonized_name (or,
      failing that, attribute_name) is in positive_attributes
    - Uses regular expressions to flag each value as male, female, or other
    - Appends one JSON line per flagged <BioSample> to output_file with the
      keys biosample_id, sra_id, male, female and other

    A flag is True when set and None otherwise (written as null in JSON).

    Text is buffered per element and only evaluated when the element
    closes, because a SAX parser may deliver the text of one element in
    several characters() calls.

    Parameters:
        positive_attributes: container of attribute names to inspect
            (only membership tests are performed on it).
        output_file: path of the JSON-lines file to write; it is truncated
            when parsing starts.
    '''

    def __init__(self, positive_attributes, output_file):
        super().__init__()
        self.positive_attributes = positive_attributes
        self.output_file = output_file
        self.biosample_id = ''
        self.attribute_name = ''
        self.cur_dict = {}
        self.is_sra = False
        self.sra_id = ''
        self.count = 0
        self.male_pattern = r'\bmale\b'
        self.female_pattern = r'\bfemale\b'
        self.is_male = None
        self.is_female = None
        self.is_other = None
        # Text chunks of the element currently being captured (a positive
        # <Attribute> or an SRA <Id>); joined when that element closes.
        self._text_parts = []

    def startDocument(self):
        '''Truncates the output file so results of earlier runs are dropped.'''
        with open(self.output_file, 'w'):
            pass
        print('Started parsing...')

    def startElement(self, name, attrs):
        '''Records which sample / attribute / id element has just been opened.'''
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
            self.cur_dict = {}
        elif name == 'Attribute':
            # Prefer the harmonized name and fall back to the raw one; an
            # element carrying neither gets '' instead of raising KeyError.
            self.attribute_name = attrs.get(
                'harmonized_name', attrs.get('attribute_name', ''))
            self._text_parts = []
        elif name == 'Id':
            if 'db' in attrs and attrs['db'] == 'SRA':
                self.is_sra = True
                self._text_parts = []

    def characters(self, content):
        '''Buffers text belonging to a positive attribute or to an SRA id.'''
        if self.is_sra or self.attribute_name in self.positive_attributes:
            self._text_parts.append(content)

    def endElement(self, name):
        '''Evaluates buffered text and, at the end of a sample, writes the result.'''
        if name == 'Attribute':
            if self.attribute_name in self.positive_attributes:
                self._classify(''.join(self._text_parts))
            # Clear the name so that text following this element (whitespace,
            # other elements, the next sample) is never classified.
            self.attribute_name = ''
            self._text_parts = []
        elif name == 'Id':
            if self.is_sra:
                self.sra_id = ''.join(self._text_parts)
                self.is_sra = False
                self._text_parts = []
        elif name == 'BioSample':
            # only write to file if there is a value for at least one of the attributes
            if any([self.is_male, self.is_female, self.is_other]):
                self.cur_dict = {
                    'biosample_id': self.biosample_id,
                    'sra_id': self.sra_id,
                    'male': self.is_male,
                    'female': self.is_female,
                    'other': self.is_other,
                }
                # Appending per record keeps everything written so far on
                # disk if parsing stops part-way through the input.
                with open(self.output_file, 'a') as f:
                    json.dump(self.cur_dict, f)
                    f.write('\n')

            self.count += 1
            if self.count % 1000000 == 0:
                print('{} samples parsed'.format(self.count))

            # Reset all per-sample state
            self.biosample_id = ''
            self.cur_dict = {}
            self.sra_id = ''
            self.attribute_name = ''
            self.is_sra = False
            self._text_parts = []
            self.is_male = None
            self.is_female = None
            self.is_other = None

    def endDocument(self):
        print('Finished parsing\n')

    def _classify(self, raw_value):
        '''
        Updates the male / female / other flags from one complete attribute value.

        Blank values are ignored, so a sample is only written when an
        attribute actually carries a value.
        '''
        if not raw_value.strip():
            return
        value = preprocess(raw_value)
        if re.search(self.male_pattern, value):
            self.is_male = True
        if re.search(self.female_pattern, value):
            self.is_female = True
        # NOTE(review): 'other' is only set while neither male nor female has
        # been flagged for the current sample, so with several sex attributes
        # the result depends on their order - confirm this is intended.
        if not (self.is_female or self.is_male):
            self.is_other = True

In [94]:
# Attribute names whose values are scanned for a sex/gender label.
positive_attributes = {
    'sex',
    'host_sex',
    'animal_sex',
    'plant_sex',
    'animal_gender',
    'pm_gender',
    'vioscreen_gender',
    'demographics_gender',
    'offspring gender',
}

# Input: the BioSamples XML dump to stream through the SAX handler.
biosamples_path = '/home/ec2-user/workspace/data/biosample_set.xml'

# The handler writes its JSON-lines result to the given output file.
handler = BioSamplesHandler(
    positive_attributes=positive_attributes,
    output_file='/home/ec2-user/workspace/data/results/sex_output_1.jsonl',
)

parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)


Started parsing...
1000000 samples parsed
2000000 samples parsed
3000000 samples parsed
4000000 samples parsed
5000000 samples parsed
6000000 samples parsed
7000000 samples parsed
8000000 samples parsed
9000000 samples parsed
10000000 samples parsed
11000000 samples parsed
12000000 samples parsed
13000000 samples parsed
14000000 samples parsed
15000000 samples parsed
16000000 samples parsed
17000000 samples parsed
18000000 samples parsed
19000000 samples parsed
20000000 samples parsed
21000000 samples parsed
22000000 samples parsed
23000000 samples parsed
24000000 samples parsed
25000000 samples parsed
26000000 samples parsed
27000000 samples parsed
28000000 samples parsed
29000000 samples parsed
30000000 samples parsed
31000000 samples parsed
32000000 samples parsed
33000000 samples parsed
34000000 samples parsed
Finished parsing

