# Parsing the bulk download facility table of contents

### The output Excel file is used as an input to the Power BI application

In [4]:

import datetime
import re

import lxml
import lxml.etree
import pandas as pd




### Parse the contents file

* Download table_of_contents.xml from 'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml' and put it in the same folder as this Jupyter notebook.

In [5]:
# Parse the downloaded table of contents into an element tree.
# The path is relative, so it is resolved against the current working directory.
xml = lxml.etree.parse('table_of_contents.xml')

* Get the nodes with the codes (_elements_codes_), the English titles (_elements_names_) and the download links in .TSV format (_elements_downloads_); the dataset codes extracted from those links are stored in _elements_files_.

In [6]:
# Namespace prefix -> URI mapping shared by every XPath query below.
NAVTREE_NS = {'nt': 'urn:eu.europa.ec.eurostat.navtree'}

# Code nodes and English title nodes, each in document order.
elements_codes = xml.xpath(r'//nt:code', namespaces=NAVTREE_NS)
elements_names = xml.xpath(r'//nt:title[@language="en"]', namespaces=NAVTREE_NS)

# The two lists are paired by position further down, so a length mismatch
# would silently attach titles to the wrong codes. Fail early instead.
if len(elements_codes) != len(elements_names):
    raise ValueError(
        'Found {} code nodes but {} English title nodes; they cannot be paired by position.'
        .format(len(elements_codes), len(elements_names))
    )

# Keep only the text (the URL string) of every TSV download link.
elements_downloads = [
    link.text
    for link in xml.xpath(r'//nt:downloadLink[@format="tsv"]', namespaces=NAVTREE_NS)
]
# File code of a link: the '.tsv.gz' marker removed, then the last '/'-separated segment.
elements_files = [re.sub(r'\.tsv\.gz', '', url).split('/')[-1] for url in elements_downloads]

In [7]:
# Preview the text of the first five code nodes and the first five title nodes.
print([node.text for node in elements_codes[:5]])
print([node.text for node in elements_names[:5]])

['data', 'general', 'euroind', 'ei_bcs', 'ei_bcs_cs']
['Database by themes', 'General and regional statistics', 'European and national indicators for short-term analysis', 'Business and consumer surveys (source: DG ECFIN)', 'Consumer surveys (source: DG ECFIN)']


* Create lists _crumbs_codes_, _crumbs_names_, _numbers_, _numbered_crumbs_ and _crumbs_files_.
* _crumbs_codes_ and _crumbs_names_ will **accumulate** the codes and names.
* _numbers_ will contain a hierarchical outline numbering (e.g. 1.1.2).
* _numbered_crumbs_ will put the above in tuples.
* Whenever the current code is the same as the corresponding part of a download link, this code is added to the tuple in _numbered_crumbs_. It is then easy to reconstruct the complete link from the code.

In [8]:
def get_depth(element):
    """Return the number of ancestors of ``element`` (0 when it has no parent).

    The element only needs a ``getparent()`` method that returns the parent
    node, or ``None`` once the top of the tree is reached.
    """
    steps = 0
    node = element
    while True:
        node = node.getparent()
        if node is None:
            return steps
        steps += 1

def reduce_by_depth(elements_codes, elements_names, file_codes=None):
    """Build breadcrumb trails and outline numbers for every entry.

    Entries are visited in order. Each entry overwrites the slot that belongs
    to its tree depth and discards every deeper slot, so the non-empty slots
    always spell the path from the top level down to the current entry. The
    slot lists grow on demand, so there is no limit on the nesting depth.

    Parameters
    ----------
    elements_codes : sequence
        Code nodes; each must provide ``.text`` and ``.getparent()``.
    elements_names : sequence
        Title nodes paired with ``elements_codes`` by position; only
        ``.text`` is read.
    file_codes : iterable of str, optional
        Codes for which a download file exists. When omitted, the notebook
        globals ``elements_files`` (membership test) and
        ``elements_downloads`` (expected number of matches) are used.

    Returns
    -------
    tuple
        ``(crumbs_codes, crumbs_names, numbers, numbered_crumbs, crumbs_files)``.
        All five lists have one item per entry: the ';'-joined code trail,
        the ';'-joined name trail, the '.'-joined outline number, the tuple
        ``(number, code trail, name trail, file code)`` and the file code
        (``''`` when the entry's code is not a file code).

    Raises
    ------
    ValueError
        If a file code is not the tail of its own code trail (error A), or if
        the number of entries matched to a file differs from the number of
        download files (error B).
    """
    if file_codes is None:
        # Backward-compatible default: fall back to the notebook-level lists.
        file_codes = elements_files
        expected_matches = len(elements_downloads)
    else:
        file_codes = list(file_codes)
        expected_matches = len(file_codes)
    known_files = set(file_codes)  # constant-time membership instead of a list scan

    crumbs_codes, crumbs_names, numbers, numbered_crumbs, crumbs_files = [], [], [], [], []
    # Per-depth slots; '' (or 0 for the counters) marks an unused depth.
    crumb_c, crumb_n, numbering = [], [], []
    count = 0

    for i, elem_code in enumerate(elements_codes):
        elem_name = elements_names[i]
        depth = get_depth(elem_code)

        # Drop everything deeper than the current entry and pad up to its depth.
        for slots, blank in ((crumb_c, ''), (crumb_n, ''), (numbering, 0)):
            del slots[depth + 1:]
            slots.extend([blank] * (depth + 1 - len(slots)))

        crumb_c[depth] = elem_code.text
        crumb_n[depth] = elem_name.text
        numbering[depth] += 1  # next sibling number at this depth

        if elem_code.text in known_files:
            count += 1
            file_code = elem_code.text
        else:
            file_code = ''

        code_trail = ';'.join(part for part in crumb_c if part)
        name_trail = ';'.join(part for part in crumb_n if part)
        number = '.'.join(str(n) for n in numbering if n)

        # Consistency check: a file code must be the last code of its own trail.
        if file_code != '' and not code_trail.endswith(file_code):
            raise ValueError(
                '*** Error A! code trail {!r} does not end with file code {!r}'
                .format(code_trail, file_code)
            )

        crumbs_files.append(file_code)
        crumbs_codes.append(code_trail)
        crumbs_names.append(name_trail)
        numbers.append(number)
        numbered_crumbs.append((number, code_trail, name_trail, file_code))

    if count != expected_matches:
        raise ValueError(
            '*** Error B! matched {} entries to files but expected {}'
            .format(count, expected_matches)
        )
    return (crumbs_codes, crumbs_names, numbers, numbered_crumbs, crumbs_files)

# Build the breadcrumb lists; the second returned item (the name trails) is bound to `crumb2`.
(crumbs_codes,
 crumb2,
 numbers,
 numbered_crumbs,
 crumbs_files) = reduce_by_depth(elements_codes, elements_names)


### Convert to dataframe

* And export to Excel.

In [9]:


# Output name carries the current month, day, hour and minute (not zero-padded).
current_time = datetime.datetime.now()
outfile = 'Crumbs_{0.month}_{0.day}_{0.hour}_{0.minute}.xlsx'.format(current_time)

crumbsDF = pd.DataFrame(numbered_crumbs, columns=['Numbers', 'Codes', 'Names', 'Files'])
# The level is the number of dots in the outline number, so top-level rows are level 0.
crumbsDF['Level'] = crumbsDF['Numbers'].apply(lambda x: x.count('.'))

# Expand every non-empty file code into its full download URL in a single
# vectorized assignment instead of a row-by-row loop.
has_file = crumbsDF['Files'] != ''
crumbsDF.loc[has_file, 'Files'] = (
    'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/'
    + crumbsDF.loc[has_file, 'Files']
    + '.tsv.gz'
)

crumbsDF.to_excel(outfile, index=False)
crumbsDF

Unnamed: 0,Numbers,Codes,Names,Files,Level
0,1,data,Database by themes,,0
1,1.1,data;general,Database by themes;General and regional statis...,,1
2,1.1.1,data;general;euroind,Database by themes;General and regional statis...,,2
3,1.1.1.1,data;general;euroind;ei_bcs,Database by themes;General and regional statis...,,3
4,1.1.1.1.1,data;general;euroind;ei_bcs;ei_bcs_cs,Database by themes;General and regional statis...,,4
...,...,...,...,...,...
10466,4.8.6.4.1,cc;sks;sks_dev;sks_devict;isoc_ske_ittn2,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4
10467,4.8.6.5,cc;sks;sks_dev;sks_devcvt,Cross cutting topics;Skills-related statistics...,,3
10468,4.8.6.5.1,cc;sks;sks_dev;sks_devcvt;trng_cvt_01s,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4
10469,4.8.6.5.2,cc;sks;sks_dev;sks_devcvt;trng_cvt_12s,Cross cutting topics;Skills-related statistics...,https://ec.europa.eu/eurostat/estat-navtree-po...,4
