# In [2]:
import xml.sax
import datetime
import csv

# In [5]:
# Path of the BioSample XML file that is handed to the SAX parser.
biosamples_path = "data/biosamples/biosample_set.012.xml"

class BioSamplesDateHandler(xml.sax.ContentHandler):
    """SAX handler that extracts three dates for every ``<BioSample>`` element.

    For each sample the handler records the ``publication_date`` and
    ``submission_date`` XML attributes of the ``<BioSample>`` tag, plus the
    text of the ``<Attribute>`` child whose ``harmonized_name`` is
    ``"collection_date"``.  Results are stored in ``biosample_dict`` keyed by
    the sample's ``accession`` and written to a CSV file when the document
    ends.

    Args:
        output_path: Where ``endDocument`` writes the CSV.  Defaults to the
            location the handler has always used.
    """

    # Placeholder stored when a date is absent from the XML.
    _MISSING = "NA"

    def __init__(self, output_path="data/biosamples/results/biosample_date.csv"):
        super().__init__()
        self.output_path = output_path
        self.num_samples = 0
        self.biosample_id = ""
        self.publication_date_string = ""
        self.submission_date_string = ""
        self.collection_date_string = ""
        # True while the parser is inside a collection_date <Attribute>.
        self.is_collection_date = False
        # Text chunks received for the collection_date element being read.
        self._collection_date_chunks = []
        self.biosample_dict = {}

    def startElement(self, tag, attributes):
        """Capture sample-level attributes and spot collection_date elements."""
        if tag == "BioSample":
            self.num_samples += 1
            self.biosample_id = attributes["accession"]
            # Missing dates are tolerated the same way a missing collection
            # date is: they are reported as "NA" instead of raising KeyError.
            self.publication_date_string = attributes.get(
                "publication_date", self._MISSING
            )
            self.submission_date_string = attributes.get(
                "submission_date", self._MISSING
            )
        elif tag == "Attribute":
            if attributes.get("harmonized_name") == "collection_date":
                self.is_collection_date = True
                self._collection_date_chunks = []

    def characters(self, content):
        """Accumulate text belonging to the current collection_date element.

        A SAX parser may deliver one text node in several calls, so chunks
        are buffered here and joined when the element closes.
        """
        if self.is_collection_date:
            self._collection_date_chunks.append(content)

    def endElement(self, name):
        """Finalize a collection date or store a completed sample record."""
        if name == "Attribute":
            if self.is_collection_date:
                # Closing the flag here (not in characters) means an empty
                # element cannot leak the flag onto unrelated later text.
                self.collection_date_string = "".join(self._collection_date_chunks)
                self._collection_date_chunks = []
                self.is_collection_date = False
        elif name == "BioSample":
            self.biosample_dict[self.biosample_id] = {
                "publication_date": self.publication_date_string,
                "submission_date": self.submission_date_string,
                "collection_date": self.collection_date_string or self._MISSING,
            }
            # Reset per-sample state so nothing carries over to the next one.
            self.biosample_id = ""
            self.publication_date_string = ""
            self.submission_date_string = ""
            self.collection_date_string = ""
            self.is_collection_date = False
            self._collection_date_chunks = []

    def endDocument(self):
        """Print the number of stored samples and write them to the CSV file."""
        print("num samples: %s" % len(self.biosample_dict))

        # newline="" lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate "\n".
        with open(self.output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(
                ["biosample_id", "publication_date", "submission_date", "collection_date"]
            )
            for biosample_id, date_dict in self.biosample_dict.items():
                writer.writerow(
                    [
                        biosample_id,
                        date_dict["publication_date"],
                        date_dict["submission_date"],
                        date_dict["collection_date"],
                    ]
                )


# Create a SAX parser, attach a fresh date handler, and stream the BioSample
# XML file through it.
parser = xml.sax.make_parser()
date_handler = BioSamplesDateHandler()
parser.setContentHandler(date_handler)
parser.parse(biosamples_path)


# Output: num samples: 40608
