In [2]:
import xml.sax
import csv
import pandas as pd


In [5]:
biosamples_attributes_path = "data/biosamples/biosample_attributes.xml"


# Total of 938 attributes as of 11/5/2023
class BioSampleAttributesHandler(xml.sax.ContentHandler):
    def __init__(self) -> None:
        self.attribute_dict = {}
        self.is_harmonized_name = False
        self.is_synonym = False
        self.is_description = False
        self.harmonized_name = ""
        # all synonyms stored as a semicolon (;) separated string
        self.synonyms = ""
        self.description = ""
        super().__init__()


    def startElement(self, name, attrs):
        if name == "HarmonizedName":
            self.is_harmonized_name = True
        elif name == "Synonym":
            self.is_synonym = True
        elif name == "Description":
            self.is_description = True



    def characters(self, content):
        if self.is_harmonized_name:
            self.harmonized_name = content
            self.is_harmonized_name = False
        elif self.is_synonym:
            if self.synonyms == "":
                self.synonyms = content
            else:
                self.synonyms += ";" + content
            self.is_synonym = False
        elif self.is_description:
            self.description = content
            self.is_description = False
            

    
    def endElement(self, name):
        if name == "Attribute":
            self.attribute_dict[self.harmonized_name] = (self.synonyms, self.description)
            self.harmonized_name = ""
            self.synonyms = ""
            self.description = ""
        pass



    def endDocument(self):
        print(self.attribute_dict)
        # Write the dictionary to a csv file
        with open("data/biosamples/biosample_attributes.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(["harmonized_name", "synonyms", "description"])
            for key, value in self.attribute_dict.items():
                writer.writerow([key, value[0], value[1]])
        pass

     
parser = xml.sax.make_parser()
handler = BioSampleAttributesHandler()
parser.setContentHandler(handler)  
parser.parse(biosamples_attributes_path)   

{'api': ('api;api gravity', 'API gravity is a measure of how heavy or light a petroleum liquid is compared to water (source: https://en.wikipedia.org/wiki/API_gravity), e.g. 31.1 API'), 'edta_inhibitor_tested': ('edta inhibitor tested', 'Was carbapenemase activity tested in the presence of EDTA? If carbapenemase activity was tested in the presence of EDTA, the response should be "yes", otherwise "no”.'), 'fao_class': ('fao class;fao classification;soil taxonomic/fao classification', 'soil classification from the FAO World Reference Database for Soil Resources'), 'food_industry_class': ('fda food industry class name;food industry class', 'The US FDA Class is the second of five elements that comprise a FDA product code. This element is directly related to an Industry and designates the food group, source, product, use, pharmacological action, category or animal species of the product. A Class code is more specific than an Industry; for example, the Fishery/Seafood products Industry may c

In [8]:
df = pd.read_csv("data/biosamples/biosample_attributes.csv")
harmonized_names = df["harmonized_name"].tolist()

total_synonyms = 0
synonyms = df["synonyms"].tolist()
for synonym_list in synonyms:
    if type(synonym_list) == str:
        total_synonyms += len(synonym_list.split(";"))


print("total harmonized names: ", len(harmonized_names))
print("total synonyms: ", total_synonyms)

['api', 'edta_inhibitor_tested', 'fao_class', 'food_industry_class', 'food_industry_code', 'food_harvest_proc', 'gisaid_accession', 'gisaid_virus_name', 'hiv_stat', 'haccp_term', 'ifsac_category', 'narms_isolate_number', 'omics_observ_id', 'population_description', 'sars_cov_2_diag_gene_name_1', 'sars_cov_2_diag_gene_name_2', 'sars_cov_2_diag_pcr_ct_value_1', 'sars_cov_2_diag_pcr_ct_value_2', 'super_population_code', 'super_population_description', 'abs_air_humidity', 'additional_info', 'address', 'adj_room', 'aero_struc', 'affection_status', 'age', 'agrochem_addition', 'air_pm_concen', 'air_temp', 'air_temp_regm', 'alkalinity', 'alkalinity_method', 'alkyl_diethers', 'altitude', 'al_sat', 'al_sat_meth', 'aminopept_act', 'ammonium', 'amniotic_fluid_color', 'amount_light', 'analyte_type', 'anamorph', 'ances_data', 'animal_env', 'animal_feed_equip', 'host_housing', 'animal_intrusion', 'anim_water_method', 'annual_season_precpt', 'annual_season_temp', 'antibiotic_regm', 'host_am', 'spikein