# XML to JSON
Converts XML files into JSON (and CSV). Currently it only works for the URL lists provided by `getHansardUrls.ipynb`.

In [9]:
import json

In [209]:
class xmlSnippet():
    """A flat, best-effort view of one snippet of XML text.

    This is not a real XML parser: the text is cut at every "<" and each
    opening-tag fragment becomes one ``{"key", "value", "attr"}`` record in
    ``self.properties``. Nesting is ignored and closing tags are skipped.

    Known limitations: attribute values keep their surrounding quotes, and
    attribute values that contain spaces are not supported (the tag is then
    reported in ``errorList``).
    """

    def __init__(self, rawStr, selfAnalyse, **kwargs):
        """Store the cleaned text and optionally analyse it straight away.

        Args:
            rawStr: The XML text for this snippet.
            selfAnalyse: When truthy, run ``analyse()`` immediately and keep
                its result in ``self.errorList``.
            **kwargs: ``properties`` may supply an existing list of records;
                new records are appended to that same list object.
        """
        # "\\n" is the two-character sequence backslash + "n" (as left behind
        # by an escaped dump); real newline characters are kept.
        self.rawStr = rawStr.strip().replace("\\n", "")
        self.properties = kwargs.get("properties", [])
        self.errorList = []
        if selfAnalyse:
            self.errorList = self.analyse()

    def analyse(self):
        """Populate ``self.properties`` from ``self.rawStr``.

        Returns:
            A list of the raw fragments that could not be parsed (no ">" in
            the fragment, or an attribute without "=").
        """
        fragments = self.rawStr.split("<")
        # Text that starts with "<" always yields an empty first fragment;
        # that is not a parse failure, so it is dropped instead of reported.
        if not fragments[0]:
            fragments = fragments[1:]

        errorEntries = []
        for item in fragments:
            if item.startswith("/"):  # Closing tag: nothing to record
                continue
            try:
                self.properties.append(self._parseFragment(item))
            except IndexError:
                errorEntries.append(item)
        return errorEntries

    @staticmethod
    def _parseFragment(item):
        """Turn one ``tag attrs>text`` fragment into a property record.

        Raises:
            IndexError: If the fragment has no ">" or an attribute has no "=".
        """
        # Split on the first ">" only, so text containing ">" is not truncated
        pieces = item.split(">", 1)
        key = pieces[0]
        value = pieces[1]
        attributes = {}

        # If the XML has attributes associated with it
        if " " in key:
            keyList = key.split(" ")
            key = keyList[0]
            # Attributes are everything after the tag name (by position)
            for attribute in keyList[1:]:
                if not attribute:  # Produced by repeated spaces
                    continue
                # Split on the first "=" only, so values such as URLs with
                # query strings are kept whole
                attrPair = attribute.split("=", 1)
                attributes[attrPair[0]] = attrPair[1]

        return {
            "key": key,
            "value": value,
            "attr": attributes
        }

    def display(self):
        """Print a table of key, attribute count and value for each record."""
        print(f"{'Key':<15} | {'Attr Count':<15} | Value")
        print(f"{'-'*16}|{'-'*17}|{'-'*15}")
        for item in self.properties:
            print(f"{item['key']:<15} | {str(len(item['attr'])):<15} | {item['value']}")

    def toJSON(self):
        """Return a ``{key: value}`` dict of the records.

        Attributes are not included. If a key occurs more than once, the
        last occurrence wins.
        """
        return {entry["key"]: entry["value"] for entry in self.properties}

    def toCSV(self, titleList):
        """Return one CSV line with a cell for every name in ``titleList``.

        For each title the value of the first record with that key is used;
        a missing key gives an empty cell. Inside values "," is replaced by
        "-" and ";" by "X". Every cell, including the last, is followed by a
        comma, and no newline is appended.
        """
        # First occurrence of each key wins, matching a left-to-right search
        firstValue = {}
        for entry in self.properties:
            firstValue.setdefault(entry['key'], entry['value'])

        cells = (
            firstValue.get(title, '').replace(",", "-").replace(";", "X")
            for title in titleList
        )
        return "".join(cell + "," for cell in cells)

# Ad-hoc check of toCSV: no 'WAHOO' tag is expected, so that column should come out empty.
# NOTE(review): `item` is not assigned in this cell; presumably it is left over from the
# loader loop (`for item in content.split("<item>")`) in an earlier kernel run — confirm.
print(xmlSnippet(item, True).toCSV(['title', 'WAHOO', 'link', 'pubDate']))


Title Unavailable,,https://parlinfo.aph.gov.au:443/parlInfo/search/display/display.w3pXquery=Id%3A%22hansard80%2Fhansardr80%2F1903-10-07%2F0000%22,Wed- 07 Oct 1903 00:00:00 +1100,


In [211]:
# Settings and accumulator for the load step (no data is read in this cell)
pageCount = 16  # Number of list files to read: list0.txt .. list15.txt
chamber = "senate" # Either 'senate' or 'house'
xmlData = []  # Receives one xmlSnippet per "<item>" chunk from the load cell

In [213]:
# Load the Data
# Read each numbered list file for the selected chamber and wrap every
# "<item>"-delimited chunk in an analysed xmlSnippet.
# Snippets are appended to xmlData, so re-running this cell without resetting
# xmlData adds every entry a second time.
for pNumber in range(pageCount):
    with open(f"../data/{chamber}/urlLists/list{pNumber}.txt", "r") as xmlFile:
        content = xmlFile.read()
    # NOTE(review): the text before the first "<item>" also becomes a snippet — confirm this is intended.
    # `item` stays bound after the loop; the toCSV demo print reads it.
    for item in content.split("<item>"):
        xmlData.append(xmlSnippet(item, True))

In [221]:
# JSON Output
# One plain {key: value} dict per loaded snippet, in load order.
dataDict = [row.toJSON() for row in xmlData]

In [225]:
# Serialise first, then write the finished text in one call.
with open(f"../data/{chamber}/urlLists/{chamber}URLsDB.json", "w") as outputFile:
    outputFile.write(json.dumps(dataDict, indent=2))

In [215]:
# CSV Output
# Header line first, then one toCSV() line per snippet; every line ends in "\n".
titles = ['title', 'pubDate', 'link']
csvData = "".join(
    line + "\n"
    for line in [",".join(titles)] + [row.toCSV(titles) for row in xmlData]
)

In [216]:
# Write the assembled CSV text next to the source lists as <chamber>URLsSheet.csv
with open(f"../data/{chamber}/urlLists/{chamber}URLsSheet.csv", "w") as outputFile:
    outputFile.write(csvData)