In [3]:
import xml.sax
import datetime
import csv

In [8]:
# Location of the BioSample XML file that is handed to the SAX parser further
# down; the path is relative to the current working directory.
biosamples_path = "data/biosamples/biosample_set.xml"

class BioSamplesDateHandler(xml.sax.ContentHandler):
    """SAX handler that writes the dates of every ``BioSample`` element to a CSV file.

    For each ``<BioSample>`` element the handler records its ``accession``,
    ``publication_date`` and ``submission_date`` XML attributes together with
    the text of the child ``<Attribute harmonized_name="collection_date">``
    element. Any value that is missing is written as ``"NA"``.

    Rows are buffered in memory and appended to the CSV file every
    ``batch_size`` samples so that very large inputs can be streamed.

    Args:
        output_path: CSV file to write. It is truncated when parsing starts.
            Its parent directory must already exist.
        batch_size: Number of samples between two flushes to disk (a progress
            line is printed at every flush).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    CSV_HEADER = ["biosample_id", "publication_date", "submission_date", "collection_date"]

    def __init__(self, output_path="data/biosamples/results/biosample_date.csv", batch_size=50000):
        super().__init__()
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.output_path = output_path
        self.batch_size = batch_size
        self.num_samples = 0
        self.biosample_id = ""
        self.publication_date_string = ""
        self.submission_date_string = ""
        self.collection_date_string = ""
        # True only while the parser is inside a collection_date <Attribute>.
        self.is_collection_date = False
        # Rows waiting to be written. A list (not a dict keyed by accession)
        # so that samples sharing an id, e.g. several "NA", are all kept.
        self.pending_rows = []

    def _flush(self):
        """Append all buffered rows to the CSV file and empty the buffer."""
        if not self.pending_rows:
            return
        # newline="" is required by the csv module to avoid blank lines on
        # platforms that translate "\n"; utf-8 keeps non-ASCII values writable.
        with open(self.output_path, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(self.pending_rows)
        self.pending_rows = []

    def startDocument(self):
        """Create (or truncate) the CSV file and write the header row."""
        with open(self.output_path, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(self.CSV_HEADER)

    def startElement(self, tag, attributes):
        """Read the sample-level dates, or start capturing a collection date."""
        if tag == "BioSample":
            self.num_samples += 1
            self.publication_date_string = attributes.get("publication_date", "NA")
            self.submission_date_string = attributes.get("submission_date", "NA")
            self.biosample_id = attributes.get("accession", "NA")
        elif tag == "Attribute":
            if attributes.get("harmonized_name") == "collection_date":
                # Start from scratch so the last collection_date of a sample wins.
                self.collection_date_string = ""
                self.is_collection_date = True

    def characters(self, content):
        """Accumulate the text of the current collection_date attribute.

        SAX parsers may deliver one text node in several chunks, so the
        chunks are concatenated instead of keeping only the first one.
        """
        if self.is_collection_date:
            self.collection_date_string += content

    def endElement(self, name):
        """Stop capturing text, or buffer the finished sample as a CSV row."""
        if name == "Attribute":
            # Always leave capture mode here; otherwise an empty
            # collection_date element would swallow the next unrelated text.
            self.is_collection_date = False
        elif name == "BioSample":
            self.pending_rows.append([
                self.biosample_id,
                self.publication_date_string,
                self.submission_date_string,
                self.collection_date_string or "NA",
            ])
            self.biosample_id = ""
            self.publication_date_string = ""
            self.submission_date_string = ""
            self.collection_date_string = ""

            if self.num_samples % self.batch_size == 0:
                print("num samples: %s" % self.num_samples)
                # Write the batch to disk and release the memory it used.
                self._flush()

    def endDocument(self):
        """Report the total number of samples and write the last partial batch."""
        print("num samples: %s" % self.num_samples)
        self._flush()



In [9]:
# Stream the BioSample XML file through the date-extracting handler; the
# handler itself takes care of writing the CSV output while parsing.
handler = BioSamplesDateHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse(biosamples_path)

num samples: 50000
num samples: 100000
num samples: 150000
num samples: 200000
num samples: 250000
num samples: 300000
num samples: 350000
num samples: 400000
num samples: 450000
num samples: 500000
num samples: 550000
num samples: 600000
num samples: 650000
num samples: 700000
num samples: 750000
num samples: 800000
num samples: 850000
num samples: 900000
num samples: 950000
num samples: 1000000
num samples: 1050000
num samples: 1100000
num samples: 1150000
num samples: 1200000
num samples: 1250000
num samples: 1300000
num samples: 1350000
num samples: 1400000
num samples: 1450000
num samples: 1500000
num samples: 1550000
num samples: 1600000
num samples: 1650000
num samples: 1700000
num samples: 1750000
num samples: 1800000
num samples: 1850000
num samples: 1900000
num samples: 1950000
num samples: 2000000
num samples: 2050000
num samples: 2100000
num samples: 2150000
num samples: 2200000
num samples: 2250000
num samples: 2300000
num samples: 2350000
num samples: 2400000
num samples: