<a href="https://colab.research.google.com/github/charlotter62/EU-ETS-EUTL/blob/main/A2_xml_accounts_byaccountID_PARSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parsing Accounts XML files


---


**Description**:

The following code parses account XML files from the [European Union Transaction Log](https://ec.europa.eu/clima/ets/account.do?languageCode=en) into a CSV of Accounts and a CSV of Account Holders. The files are downloaded by registry and account type by the following script: [xml-accounts-byaccountID.ipynb](https://colab.research.google.com/drive/1s3OtAiB5NIiOehEuut2z_0fadeFNDyqU?usp=sharing).

**Author**: Charlotte Rivard
**Contact**: 15crivard@gmail.com
**Date**: 1/13/2022

*Please reach out with questions and coauthorship considerations if using this script for publications*

---

In [None]:
# Mount Google Drive into the Colab runtime at /gdrive; the later cells read
# and write under /gdrive/MyDrive/... (google.colab is only importable on Colab).
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
!pip install lxml
from lxml import objectify
import pandas as pd
import numpy as np
import os



Combine to one XML file
(~30 minutes)

In [None]:
# Folder layout on the mounted Drive: `workingdir` receives the combined
# outputs, `workingdir + folder` holds the per-account XML downloads.
workingdir = "/gdrive/MyDrive/Brookings/XML_downloads/xml-accounts-byaccountID/"
folder = "XML files/"

# os.listdir() returns names in arbitrary order. The order of `files` decides
# the row order of the combined XML and of the final CSVs, so sort the names
# to make a re-run reproduce the same output.
files = sorted(name for name in os.listdir(workingdir+folder) if name.endswith("account.xml"))

# Preview the first 30 file names.
files[0:30]

In [None]:
# Spot check for account 83704. Only the last expression of a cell is
# displayed, so the list built on the next line is discarded.
[f for f in files if("83704" in f)]
# Position of that account's file in `files`; raises ValueError if it is absent.
files.index("DE_83704_account.xml")

1150

In [None]:
# Each downloaded file is expected to start with XML_HEADER and end with
# XML_FOOTER. Both are dropped wherever two files meet, so the combined
# document keeps a single <AccountDetails> root element.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>\n<AccountDetails>\n'
XML_FOOTER = '</AccountDetails>'
SAVE_EVERY = 500  # write a checkpoint of the combined file every N input files
combined_path = workingdir+'xml-accounts_byaccountID.xml'


def read_tagged_account_file(directory, filename):
  """Read one account XML file and inject its AccountID and RegistryCode.

  File names follow "<RegistryCode>_<AccountID>_account.xml" (for example
  "DE_83704_account.xml"). Both values are inserted as the first children of
  every <Account> element so they travel with the account in the merged file.

  Args:
    directory: Folder containing the file, including the trailing slash.
    filename: Name of the file inside `directory`.

  Returns:
    The file's text with the two extra elements inserted right after each
    <Account> opening line.

  Raises:
    IndexError: If `filename` contains no underscore.
  """
  name_parts = filename.split("_")
  registry_code = name_parts[0]
  account_id = name_parts[1]  # not called `id`, which would shadow the builtin
  # The files declare UTF-8 in their XML header, so decode them as UTF-8
  # instead of relying on the platform default; `with` closes every handle.
  with open(directory+filename, "r", encoding="utf-8") as fp:
    text = fp.read()
  injected = "<Account>\n\t\t\t<AccountID>"+str(account_id)+"</AccountID>\n\t\t\t<RegistryCode>"+registry_code+"</RegistryCode>\n"
  return text.replace("<Account>\n", injected)


def write_combined(pieces):
  """Join the collected text pieces and write them to the combined XML file."""
  with open(combined_path, 'w', encoding="utf-8") as fp:
    fp.write("".join(pieces))


if not files:
  raise FileNotFoundError("No *account.xml files found in "+workingdir+folder)

# Collect the pieces in a list and join once. Re-scanning the whole
# accumulated string with .replace() after every file is quadratic in the
# total size and was the reason this step took so long.
pieces = []
for i, filename in enumerate(files):
  text = read_tagged_account_file(workingdir+folder, filename)

  if pieces:
    if pieces[-1].endswith(XML_FOOTER) and text.startswith(XML_HEADER):
      # Normal junction: drop the previous footer and this file's header.
      pieces[-1] = pieces[-1][:-len(XML_FOOTER)]
      text = text[len(XML_HEADER):]
    else:
      # Unexpected layout: keep both texts as they are, separated by a newline.
      pieces.append("\n")
  pieces.append(text)

  #Save intermittently
  if i > 0 and i % SAVE_EVERY == 0:
    print("Account "+str(i)+" saving xml-accounts_byaccountID.xml")
    write_combined(pieces)

#Save at the end
write_combined(pieces)

# Keep the combined text under the variable name this cell has always left behind.
openstartfile = "".join(pieces)

Parse the single XML file...(~25 min)

In [None]:
# Parse the combined XML file and collect the <Account> elements found
# directly under the document's root element.
xml_data = objectify.parse(workingdir+'xml-accounts_byaccountID.xml')
transaction_tags = xml_data.findall("Account")
# Show only the count: echoing the list itself prints one entry per account.
len(transaction_tags)

In [None]:
def parse_account_tags(tags):
  """Split <Account> elements into an accounts table and a holder-block table.

  For each element in `tags`:
    * children that have no children of their own become the columns of one
      accounts row (only the first occurrence of a repeated tag is kept);
    * the first child that has children of its own is skipped, and every
      later such child becomes one row of the blocks table, prefixed with the
      AccountID and RegistryCode read so far from that account.

  NOTE(review): why the first nested child is skipped is not visible in this
  notebook -- confirm against the XML layout before changing it.

  Args:
    tags: Iterable of elements exposing `getchildren()`, `tag` and `text`.

  Returns:
    A tuple `(accountsdf, blocks)` of pandas DataFrames, one row per account
    and one row per recorded nested block respectively.
  """
  # Rows are collected as dicts and turned into DataFrames once at the end;
  # calling pd.concat for every row copies the whole frame each time.
  account_records = []
  block_records = []

  for tag in tags:
    acctid = ""
    regcode = ""
    seen_nested = False
    account = {}

    # getchildren() is deliberate: with lxml.objectify, len(element) and
    # iteration walk same-tag siblings rather than the element's children.
    for item in tag.getchildren():
      children = item.getchildren()
      if len(children) > 0: #If the list item has children, it is a holder block
        if seen_nested:
          block = {"AccountID": acctid, "RegistryCode": regcode}
          for child in children:
            # setdefault keeps the first value of a repeated tag and avoids
            # the duplicate column names a repeated tag would otherwise create.
            block.setdefault(child.tag, child.text)
          block_records.append(block)
        else:
          seen_nested = True
      else:
        if item.tag not in account:
          account[item.tag] = item.text
        if item.tag == "AccountID":
          acctid = item.text
        if item.tag == "RegistryCode":
          regcode = item.text

    account_records.append(account)

  return pd.DataFrame(account_records), pd.DataFrame(block_records)


accountsdf, blocks = parse_account_tags(transaction_tags)

In [None]:
# Preview the parsed accounts table.
accountsdf

Unnamed: 0,AccountID,RegistryCode,AccountHolderName,NationalAdministrator,AccountStatus,AccountOpeningDate,AccountType,RelatedInstallationAircraftOperatorID,AccountClosingDate,CommitmentPeriod
0,90005,SI,Agencija Republike Slovenije za okolje,Slovenia,open,2012-05-09 00:00:00.0,AAU Deposit Account,,,
1,96310,CZ,OSTROJ a.s.,Czech Republic,closed,2006-01-24 00:00:00.0,Operator Holding Account,203,2013-06-28 11:07:31.0,
2,91978,GR,ΦΙΛΚΕΡΑΜ JOHNSON AE,Greece,closed,2006-04-21 00:00:00.0,Operator Holding Account,53,2013-11-20 13:50:41.0,
3,91473,SE,Emisso ek.för.,Sweden,closed,2007-05-12 00:00:00.0,Person Holding Account,,2013-03-11 11:00:14.0,
4,100839,NL,Kwekerij Rimato,Netherlands,closed,2009-10-22 00:00:00.0,Operator Holding Account,431,2013-09-27 13:12:40.0,
...,...,...,...,...,...,...,...,...,...,...
33044,8503,FI,Metsä Board Oyj,Finland,open,2005-11-11 00:00:00.0,Person Account in National Registry,,,Supplementary Program Commitment Period (2005 ...
33045,6392,AT,Schretter & Cie GmbH & Co KG,Austria,closed,2005-06-16 00:00:00.0,Former Operator Holding Account,,2012-10-22 16:26:07.0,Supplementary Program Commitment Period (2005 ...
33046,13537,HU,Wienerberger Téglaipari ZRt.,Hungary,open,2006-04-10 00:00:00.0,Former Operator Holding Account,,,Supplementary Program Commitment Period (2005 ...
33047,6391,AT,Schretter & Cie GmbH & Co KG,Austria,closed,2005-06-16 00:00:00.0,Former Operator Holding Account,,2012-10-22 16:25:35.0,Supplementary Program Commitment Period (2005 ...


In [None]:
# Preview the parsed account-holder blocks table.
blocks

Unnamed: 0,AccountID,registryCode,Name,City,SecondaryAddressLine,RelationshipType,CountryCode,Country,ZipCode,MainAddressLine
0,90005,,Agencija Republike Slovenije za okolje,Ljubljana,Vojkova 1b,Account holder,SI,Slovenia,SI-1000,
1,96310,,OSTROJ a.s.,Opava,Těšínská 1586/66,Account holder,CZ,Czech Republic,74641,
2,91978,,ΦΙΛΚΕΡΑΜ JOHNSON AE,ΘΕΣΣΑΛΟΝΙΚΗ,ΛΑΕΡΤΟΥ 21-23 & 25,Account holder,GR,Greece,55102,5ο χλμ ΓΕΩΡΓΙΚΗΣ ΣΧΟΛΗΣ
3,91473,,Emisso ek.för.,HALMSTAD,Vinkelvägen 4,Account holder,SE,Sweden,30241,
4,100839,,Kwekerij Rimato,HONSELERSDIJK,Zwethlaan 5,Account holder,NL,Netherlands,2675 LB,
...,...,...,...,...,...,...,...,...,...,...
33044,8503,,Metsä Board Oyj,METSÄ,PL 20,Account holder,FI,Finland,02020,PL 20
33045,6392,,Schretter & Cie GmbH & Co KG,Vils,Bahnhofstraße 27,Account holder,AT,Austria,6682,
33046,13537,,Wienerberger Téglaipari ZRt.,Budapest,Bártfai utca,Account holder,HU,Hungary,1119,34
33047,6391,,Schretter & Cie GmbH & Co KG,Vils,Bahnhofstraße 27,Account holder,AT,Austria,6682,


In [None]:
# Write the accounts table to the working directory without the index column.
# "UTF-8-sig" puts a byte-order mark at the start of the file.
accountsdf.to_csv(workingdir+"all_Accounts_byAccountID.csv",index=False,encoding="UTF-8-sig")

In [None]:
# Write the account-holder blocks table to the working directory without the
# index column. "UTF-8-sig" puts a byte-order mark at the start of the file.
blocks.to_csv(workingdir+"all_AccountHolders_byAccountID.csv",index=False,encoding="UTF-8-sig")