<a href="https://colab.research.google.com/github/charlotter62/EU-ETS-EUTL/blob/main/T2_transaction_xmls_highvolume_days_DOWNLOAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download High Volume Transaction Days


---


**Description**:

The following code downloads individual transactions from the [European Union Transaction Log](https://ec.europa.eu/clima/ets/transaction.do), working from an input file of TransactionIDs. This script is needed for days on which a single registry had more than 3000 transactions, which is the limit for exporting XML search results. For these days, we first scrape the TransactionIDs and then download the XML files for each transaction individually. All other transactions can be downloaded in bulk using this script: [xml-byregistry-bydate.ipynb](https://colab.research.google.com/drive/1lmHfv5nGsRHqT0ce6R0OiZDq_JBmTrOe?usp=sharing)

**Author**: Charlotte Rivard
**Contact**: 15crivard@gmail.com
**Date**: 1/13/2022

*Please reach out with questions and coauthorship considerations if using this script for publications*

---


In [None]:
# Mount Google Drive at /gdrive (Colab only); the working directory used below lives under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

In [None]:
def getTransactionIDs(soup, step=15):
  """Extract the transaction IDs from one parsed search-results page.

  Collects every <span class="classictext"> element, keeps every
  `step`-th one starting with the first, and returns their stripped text.
  NOTE(review): the default of 15 presumably matches the number of such
  spans rendered per result row, with the transaction ID first -- confirm
  against the page layout if the site changes.

  Args:
    soup: a parsed page (BeautifulSoup object).
    step: stride between the spans that hold a transaction ID.

  Returns:
    list of str: the non-empty transaction IDs, in page order.
  """
  tags = soup.find_all("span", {"class": "classictext"})[::step]
  transactionIDs = []
  for tag in tags:
    # get_text() copes with spans that are empty or contain nested markup,
    # where `tag.string` would be None and `.strip()` would raise.
    text = tag.get_text(strip=True)
    if text:
      transactionIDs.append(text)
  return transactionIDs

In [None]:
# Base Drive folder for inputs and outputs. Keep the trailing slash: paths below are built by string concatenation.
workingdir = "/gdrive/MyDrive/Brookings/XML_downloads/xml-byregistry-bydate/"
# Table of high-volume days; the scraping loop reads year, month, day and registry code from its first four columns.
hvdays = pd.read_csv(workingdir + "HighVolumeDays.csv")

In [None]:
# Show the high-volume days that will be scraped.
hvdays

Unnamed: 0,Year,Month,Day,Code
0,2014,2,28,EU
1,2015,2,27,EU


In [None]:
# Sanity check for one day (28/02/2014, originating registry EU): fetch the first page of
# search results and read the page count that the site reports.
URL ="https://ec.europa.eu/clima/ets/transaction.do?search=search&suppTransactionType=-1&endDate=28%2F02%2F2014&transactionStatus=4&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&languageCode=en&originatingAccountHolder=&destinationAccountIdentifier=&exportAction=transaction&transactionID=&transactionType=-1&destinationAccountType=-1&form=transaction&toCompletionDate=&exportType=1&originatingRegistry=EU&destinationAccountHolder=&fromCompletionDate=&destinationRegistry=-1&startDate=28%2F02%2F2014"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
# The value of the form input named "resultList.lastPageNumber" is used as the number of result pages.
numpgs = int(soup.find("input", {"name":"resultList.lastPageNumber"})["value"])
numpgs

In [None]:
# Number of high-volume days to process.
len(hvdays.index)

In [None]:
# Collect the TransactionIDs listed on every search-result page of each high-volume day.
REQUEST_TIMEOUT = 60  # seconds; keeps a stalled connection from hanging the notebook indefinitely

allIDs = []
# The first four columns are read by position as year, month, day and registry code.
for year, month, day, treg in hvdays.iloc[:, :4].itertuples(index=False, name=None):
  # Dates go into the query string as dd/mm/yyyy with the slashes percent-encoded as %2F.
  date = "{:02d}".format(int(day)) + "%2F" + "{:02d}".format(int(month)) + "%2F" + str(int(year))
  # Use the registry code from the table instead of a hard-coded "EU".
  treg = str(treg)

  # First request: the search itself, used only to learn how many result pages there are.
  URL = (
      "https://ec.europa.eu/clima/ets/transaction.do?search=search&suppTransactionType=-1&endDate=" + date
      + "&transactionStatus=4&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&languageCode=en&originatingAccountHolder=&destinationAccountIdentifier=&exportAction=transaction&transactionID=&transactionType=-1&destinationAccountType=-1&form=transaction&toCompletionDate=&exportType=1&originatingRegistry=" + treg
      + "&destinationAccountHolder=&fromCompletionDate=&destinationRegistry=-1&startDate=" + date
  )
  print(URL)
  page = requests.get(URL, timeout=REQUEST_TIMEOUT)
  page.raise_for_status()
  soup = BeautifulSoup(page.content, "html.parser")
  last_page_input = soup.find("input", {"name":"resultList.lastPageNumber"})
  if last_page_input is None:
    # Fail with a readable message instead of "'NoneType' object is not subscriptable".
    raise RuntimeError("No page count found in search results for " + URL)
  numpgs = int(last_page_input["value"])

  dateIDs = []
  # A distinct loop variable avoids clobbering the outer loop's state.
  for pg in range(numpgs):
    # NOTE(review): currentPageNumber=pg combined with nextList=Next> presumably asks for
    # the page after `pg`, i.e. page pg+1 as printed below -- confirm against the site.
    URL = (
        "https://ec.europa.eu/clima/ets/transaction.do?languageCode=en&startDate=" + date
        + "&endDate=" + date
        + "&transactionStatus=4&fromCompletionDate=&toCompletionDate=&transactionID=&transactionType=-1&suppTransactionType=-1&originatingRegistry=" + treg
        + "&destinationRegistry=-1&originatingAccountType=-1&destinationAccountType=-1&originatingAccountIdentifier=&destinationAccountIdentifier=&originatingAccountHolder=&destinationAccountHolder=&currentSortSettings=&resultList.currentPageNumber=" + str(pg)
        + "&nextList=Next%3E"
    )
    print("Page "+str(pg+1)+" of "+str(numpgs)+": "+URL)
    page = requests.get(URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    dateIDs.extend(getTransactionIDs(soup))

  allIDs.extend(dateIDs)

https://ec.europa.eu/clima/ets/transaction.do?search=search&suppTransactionType=-1&endDate=28%2F02%2F2014&transactionStatus=4&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&languageCode=en&originatingAccountHolder=&destinationAccountIdentifier=&exportAction=transaction&transactionID=&transactionType=-1&destinationAccountType=-1&form=transaction&toCompletionDate=&exportType=1&originatingRegistry=EU&destinationAccountHolder=&fromCompletionDate=&destinationRegistry=-1&startDate=28%2F02%2F2014
Page 1 of 165: https://ec.europa.eu/clima/ets/transaction.do?languageCode=en&startDate=28%2F02%2F2014&endDate=28%2F02%2F2014&transactionStatus=4&fromCompletionDate=&toCompletionDate=&transactionID=&transactionType=-1&suppTransactionType=-1&originatingRegistry=EU&destinationRegistry=-1&originatingAccountType=-1&destinationAccountType=-1&originatingAccountIdentifier=&destinationAccountIdentifier=&originatingAccountHolder=&destinationAccountHolder=&currentSortSettings=&resu

In [None]:
# Inspect the collected TransactionIDs.
allIDs

In [None]:
# Save the scraped IDs to Drive; the download section reloads this file, so scraping need not be repeated.
pd.DataFrame(allIDs).to_csv(workingdir+"HighVolumeDays_TransactionIDs.csv",index=False)

Downloading these transactions by ID

In [None]:
!pip install wget
import wget
from socket import error as SocketError
import errno
import time
import os

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=ef684f55f2ae6bb1c3003724986661eb33b4cf4714b8ea9b02469d86a9293ae6
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
def isXML(filepath):
  """Return True when the first line of the file contains an XML declaration.

  Only the first line is inspected, so a file whose "<?xml" marker appears
  later (or an empty file) is reported as not XML.

  Args:
    filepath: path of the file to inspect.

  Returns:
    bool: True if "<?xml" occurs in the first line, otherwise False.

  Raises:
    OSError: if the file cannot be opened.
  """
  # `with` closes the handle even on error; errors="replace" makes a binary or
  # mis-encoded download read as "not XML" instead of raising UnicodeDecodeError.
  with open(filepath, "r", errors="replace") as f:
    return "<?xml" in f.readline()

In [None]:
def patientDownload(link, savename, retry_delay=10, max_attempts=None):
  """Download `link` to `savename`, retrying until the saved file is XML.

  After each download the file is checked with isXML(); a non-XML result
  is deleted and the download is retried. Every failed attempt, whatever
  the cause, is reported and followed by a pause of `retry_delay` seconds.

  Args:
    link: URL to download.
    savename: destination file path.
    retry_delay: seconds to wait between attempts.
    max_attempts: give up after this many attempts; None retries forever.

  Raises:
    RuntimeError: if `max_attempts` is set and every attempt failed.
  """
  # Imported here because the notebook's import cells never bring HTTPError into scope.
  from urllib.error import HTTPError

  attempt = 0
  while True:
    attempt += 1
    try:
      # wget.download returns the path it actually wrote, which can differ from
      # `savename` when a file already exists there; validate that file.
      saved = wget.download(link,savename) or savename
      if isXML(saved):
        if saved != savename:
          os.replace(saved, savename)
        return
      print("Download failed, attempting again")
      os.remove(saved)
    except HTTPError as e:
      # Must precede the socket/OSError clause: HTTPError is itself an OSError subclass.
      print("HTTP error handling: " + str(e))
    except SocketError as e:
      if e.errno == errno.ECONNRESET:
        print("Connection reset by server error handling")
      else:
        print("Connection error handling: " + str(e))
    except Exception as e:
      # Report the error so a persistent failure is visible, then retry.
      print("Unexpected error handling: " + repr(e))

    if max_attempts is not None and attempt >= max_attempts:
      raise RuntimeError("Giving up on " + link + " after " + str(attempt) + " attempts")
    time.sleep(retry_delay)

In [None]:
def getXMLbyTransaction(transID,folder):
  """Download the two XML exports for one transaction.

  Saves <folder>/TransactionsBasic/<transID>_TransactionsBasic.xml from the
  "transaction" export and <folder>/DetailsAll/<transID>_DetailsAll.xml from
  the "transactionAll" export, printing each URL and destination path.

  Args:
    transID: transaction identifier as a string, e.g. "EU161579".
    folder: output directory; works with or without a trailing slash.
  """
  link = "https://ec.europa.eu/clima/ets/exportEntry.do?form=transaction&endDate=&transactionStatus=4&suppTransactionType=-1&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&languageCode=en&originatingAccountHolder=&destinationAccountIdentifier=&transactionID="+transID+"&transactionType=-1&destinationAccountType=-1&toCompletionDate=&originatingRegistry=-1&destinationAccountHolder=&fromCompletionDate=&destinationRegistry=-1&startDate=&exportType=1&exportAction=transaction&exportOK=exportOK"
  print(link)
  # os.path.join gives the same path whether or not `folder` ends with a slash.
  savepath = os.path.join(folder, "TransactionsBasic", transID + "_TransactionsBasic.xml")
  print(savepath)
  # Create the subfolder if needed: a missing directory would otherwise make the
  # retrying downloader fail on every attempt.
  os.makedirs(os.path.dirname(savepath), exist_ok=True)
  patientDownload(link,savepath)

  detailslink = "https://ec.europa.eu/clima/ets/exportEntry.do?form=transactionAll&originatingAccountNumber=&suppTransactionType=-1&endDate=&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&originatingAccountHolder=&destinationAccountIdentifier=&transactionID="+transID+"&transferringEsdRegistryCode=&toCompletionDate=&destinationRegistry=-1&transactionStatus=4&currentPageNumberCZ9833=1&transferringEsdYear=&destinationAccountNumber=&languageCode=en&transactionType=-1&destinationAccountType=-1&acquiringEsdYear=&originatingRegistry=-1&acquiringEsdRegistryCode=&destinationAccountHolder=&fromCompletionDate=&startDate=&primaryKey=" + transID + "&exportType=1&exportAction=transaction&exportOK=exportOK"
  print(detailslink)
  detailspath = os.path.join(folder, "DetailsAll", transID + "_DetailsAll.xml")
  print(detailspath)
  os.makedirs(os.path.dirname(detailspath), exist_ok=True)
  patientDownload(detailslink,detailspath)


In [None]:
# transID = "EU161579"
# getXMLbyTransaction(transID,workingdir+"HighVolumeDays/")

https://ec.europa.eu/clima/ets/exportEntry.do?form=transaction&endDate=&transactionStatus=4&suppTransactionType=-1&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&languageCode=en&originatingAccountHolder=&destinationAccountIdentifier=&transactionID=EU161579&transactionType=-1&destinationAccountType=-1&toCompletionDate=&originatingRegistry=-1&destinationAccountHolder=&fromCompletionDate=&destinationRegistry=-1&startDate=&exportType=1&exportAction=transaction&exportOK=exportOK
/gdrive/MyDrive/Brookings/XML_downloads/xml-byregistry-bydate/HighVolumeDays/TransactionsBasic/EU161579_TransactionsBasic.xml
https://ec.europa.eu/clima/ets/exportEntry.do?form=transactionAll&originatingAccountNumber=&suppTransactionType=-1&endDate=&currentSortSettings=&originatingAccountType=-1&originatingAccountIdentifier=&originatingAccountHolder=&destinationAccountIdentifier=&transactionID=EU161579&transferringEsdRegistryCode=&toCompletionDate=&destinationRegistry=-1&transactionStat

In [None]:
folder = workingdir + "HighVolumeDays"
# Make sure the output folder and its two subfolders exist, reporting each one that had to be created.
for newdir in (folder, folder+"/TransactionsBasic", folder+"/DetailsAll"):
  if os.path.isdir(newdir):
    continue
  os.makedirs(newdir)
  print("created folder : ", newdir)

created folder :  /gdrive/MyDrive/Brookings/XML_downloads/xml-byregistry-bydate/HighVolumeDays/DetailsAll


In [None]:
# Reload the saved TransactionIDs (first column of the CSV) so the downloads can start from the file.
allIDs = pd.read_csv(workingdir+"HighVolumeDays_TransactionIDs.csv").iloc[:,0]
allIDs

0       EU162304
1       EU162245
2       EU160286
3       EU160210
4       EU160226
          ...   
6534    EU249291
6535    EU249289
6536    EU249287
6537    EU249284
6538    EU249275
Name: 0, Length: 6539, dtype: object

In [None]:
# Download both XML exports for every TransactionID in the list.
# The loop variable is not called `id`, which would shadow the builtin for the rest of the session.
for transID in allIDs:
  getXMLbyTransaction(transID,workingdir+"HighVolumeDays/")