**Libraries**

In [1]:
from requests_html import HTMLSession
import re, os, shutil, zipfile, io

**Functions created**

In [2]:
## To download and unzip the zip files
def download_unzip(names, webs, path):
    """Download one zip archive from the DANE catalog and extract it into `path`.

    Parameters
    ----------
    names : str
        Archive name, e.g. ``"Nacidos_vivos.zip"`` or ``"Defun1992.zip"``.
        It is printed as progress information and decides how the archive
        is unpacked (see Notes).
    webs : str
        Text containing the download URL; the first substring of the form
        ``https://microdatos.dane.gov.co//catalog/<id>/download/<id>`` is used.
    path : str
        Directory that receives the extracted files.

    Raises
    ------
    IndexError
        If `webs` contains no download URL of the expected form.

    Notes
    -----
    The HTTP request goes through the module-level ``session``.

    When `names` contains a digit the archive is simply extracted.  When it
    does not, the function additionally looks for a folder inside `path`
    named after the archive (``.zip`` dropped, underscores turned into
    spaces) and extracts every ``*.zip`` file found there into `path` too.
    """
    print('       Downloading "{}"'.format(names))
    zip_link = re.findall('https://microdatos.dane.gov.co//catalog/[0-9]+/download/[0-9]+', webs)[0]

    # Both kinds of archive are fetched and extracted the same way; the
    # context manager makes sure the in-memory archive is closed afterwards.
    zip_doc = session.get(zip_link, stream=True)
    with zipfile.ZipFile(io.BytesIO(zip_doc.content)) as archive:
        archive.extractall(path)

    # A name that carries a digit needs no further unpacking.
    if re.match(".*[0-9]+.*", names) is not None:
        return

    # Folder the outer archive is expected to have produced.
    one_path = os.path.join(path, names.replace('.zip', '').replace('_', ' '))
    if not os.path.isdir(one_path):
        # Nothing was unpacked under that name, so there are no nested zips.
        return

    # Identify the nested zip files and extract each of them into `path`.
    for element in os.listdir(one_path):
        if re.search(r'\.zip$', element):
            other_path = os.path.join(one_path, element)
            with zipfile.ZipFile(other_path, 'r') as nested:
                nested.extractall(path)

In [3]:
## Removing unneeded files
def remove_unneeded(path):
    """Flatten `path` and delete everything that is not a wanted data file.

    For each sub-directory found directly under `path`, its ``.txt`` and
    ``.csv`` files are moved up into `path`; the sub-directory and whatever
    is left inside it are then deleted.

    For each file that was directly under `path` when the function started:

    * ``.txt`` files are kept;
    * ``.csv`` files are kept only if `path` holds no file with the same
      name and a ``.txt`` extension;
    * every other file is deleted.

    Files moved up from a sub-directory during this call are not examined
    again.  A confirmation message is printed at the end.

    Parameters
    ----------
    path : str
        Directory to clean.
    """
    for doc in os.listdir(path):
        doc_path = os.path.join(path, doc)

        if os.path.isdir(doc_path):
            # Rescue the data files, then drop the folder.
            for element in os.listdir(doc_path):
                if re.search(r'\.txt$|\.csv$', element):
                    os.rename(os.path.join(doc_path, element),
                              os.path.join(path, element))
            try:
                shutil.rmtree(doc_path)
            except OSError:
                # Fall back to a plain rmdir; only filesystem errors are
                # handled here so that e.g. KeyboardInterrupt still propagates.
                os.rmdir(doc_path)
            continue

        if re.search(r'\.txt$', doc):
            continue

        # The folder is listed again on purpose: a .txt moved up from a
        # sub-directory earlier in this loop must count as a counterpart.
        is_csv = re.search(r'\.csv$', doc)
        if is_csv and re.sub(r'\.csv$', '.txt', doc) not in os.listdir(path):
            continue

        os.remove(doc_path)

    print('Unneded files removed')

In [4]:
# To rename files
def rename_files(path, identity):
    """Rename every entry of `path` to `identity` + its numeric suffix.

    The suffix is the part of the current name that starts at the first run
    of digits and extends to the end (e.g. ``"nac2010.txt"`` -> ``"2010.txt"``),
    so ``rename_files(path, "Births_")`` yields ``"Births_2010.txt"``.
    Entries whose name has no such suffix are left untouched instead of
    aborting the loop half-way through.

    Parameters
    ----------
    path : str
        Directory whose entries are renamed in place.
    identity : str
        Prefix put in front of the numeric suffix.
    """
    for doc in os.listdir(path):
        suffix = re.search(r'\d+.+', doc)
        if suffix is None:
            # No digits followed by more text: nothing to build a name from.
            continue

        new_name = identity + suffix.group(0)
        os.rename(os.path.join(path, doc), os.path.join(path, new_name))

**Getting the pages' URLs**

In [5]:
# Shared requests_html session: the cells below and download_unzip() all
# issue their HTTP requests through this one object.
session = HTMLSession()

In [6]:
# Landing page of collection 22 in the DANE microdata catalog; the
# `per_page` query parameter is deliberately left without a value.
url = 'https://microdatos.dane.gov.co/catalog/MICRODATOS/about_collection/22/?per_page='

In [7]:
# Fetch the collection's landing page.
response = session.get(url)

In [8]:
# All hyperlinks found on the landing page, in absolute form.
links = response.html.absolute_links

In [9]:
# Regex recognising the listing's own pages: the landing URL taken literally
# (re.escape neutralises '?' and '.') followed by an optional run of digits.
url_1 = re.escape(url) + '[0-9]*'
# The landing page itself is always the first page to visit.
links_list = [url]

In [10]:
# Keep every link that points at a page of the collection listing.  URLs that
# are already collected are skipped, so the list stays free of duplicates even
# if the landing page links to itself or this cell is run more than once.
for link in links:
    if re.search(url_1, link) and link not in links_list:
        links_list.append(link)

**Getting the zip files' links**

In [11]:
# Visit each listing page and collect every "get_microdata" link it exposes.
microdata_link = re.compile('https://microdatos.dane.gov.co//catalog/[0-9]+/get_microdata')
files = []

for page in links_list:
    response = session.get(page)
    webs = response.html.absolute_links

    for web in webs:
        if microdata_link.search(web) is None:
            continue
        files.append(web)

print('Urls to scrape, got', '======================', sep='\n')

Urls to scrape, got


**Downloading files**

In [12]:
## Creating directories
path_raw = os.path.join(os.getcwd(), 'raw_data')
path_births = os.path.join(path_raw, 'births')
path_deaths = os.path.join(path_raw, 'deaths')
path_fetals = os.path.join(path_raw, 'fetals')

# makedirs(exist_ok=True) also creates raw_data/ on the way and does nothing
# when a folder is already there, so this cell can be re-run safely.
for data_dir in (path_births, path_deaths, path_fetals):
    os.makedirs(data_dir, exist_ok=True)

# Names of the files already present in each folder, with the data
# extensions (.txt, .sav, .csv) stripped out.
data_ext = re.compile(r'\.txt|\.sav|\.csv')
files_birth = [data_ext.sub('', doc) for doc in os.listdir(path_births)]
files_death = [data_ext.sub('', doc) for doc in os.listdir(path_deaths)]
files_fetals = [data_ext.sub('', doc) for doc in os.listdir(path_fetals)]

In [13]:
## Downloading and decompressing zips
zips = list()

# Names already present in ANY of the three folders (births, deaths, fetals);
# an archive whose name, minus ".zip", appears here is not fetched again.
already_downloaded = set(files_birth) | set(files_death) | set(files_fetals)

for file in files:
    response = session.get(file)
    webpages = response.html.find('input[alt=zip]')
    title   = response.html.find('h1[itemprop=name]')[0].text
    print('\nRetrieving "{}" from: \n{}'.format(title, file))

    for webpage in webpages:
        zip_webs  = webpage.attrs['onclick']
        zip_names = re.sub(' ', '_', webpage.attrs['title'])

        if zip_names.replace('.zip', '') in already_downloaded:
            continue

        # Route by case-insensitive name: "Nac…" goes to the births folder,
        # "Def…" / "No…" to the deaths folder, anything else to the fetals folder.
        lowered = zip_names.casefold()
        if re.search('nac', lowered):
            target = path_births
        elif re.search('def|no', lowered):
            target = path_deaths
        else:
            target = path_fetals
        download_unzip(zip_names, zip_webs, target)

print('Zip files downloaded')


Retrieving "COLOMBIA - Estadísticas Vitales - EEVV - 2012-2013" from: 
https://microdatos.dane.gov.co//catalog/377/get_microdata
       Downloading "Defunciones.zip"
       Downloading "Defunciones_txt.zip"
       Downloading "Nacidos_vivos.zip"
       Downloading "Nacidos_vivos_txt.zip"
       Downloading "fetal2012.zip"
       Downloading "fetal2013.zip"

Retrieving "COLOMBIA - Estadìsticas Vitales - EEVV - 1992 -1996" from: 
https://microdatos.dane.gov.co//catalog/397/get_microdata
       Downloading "Defun1992.zip"
       Downloading "Defun1993.zip"
       Downloading "Defun1994.zip"
       Downloading "Defun1995.zip"
       Downloading "Defun1996.zip"

Retrieving "COLOMBIA - Estadìsticas Vitales - EEVV - 1998 -2007" from: 
https://microdatos.dane.gov.co//catalog/366/get_microdata
       Downloading "Nacimientos_1998.zip"
       Downloading "Defunciones_1998.zip"
       Downloading "Nacimientos_1999.zip"
       Downloading "Defunciones_1999.zip"
       Downloading "Nacimientos_200

**Removing unneeded files**

In [14]:
# Clean the three download folders one after the other.
for data_folder in (path_births, path_deaths, path_fetals):
    remove_unneeded(data_folder)

Unneded files removed
Unneded files removed
Unneded files removed


**Renaming the files**

In [15]:
# Give the files of each folder a uniform prefix, folder by folder.
for data_folder, prefix in (
    (path_births, 'Births_'),
    (path_deaths, 'Deaths_'),
    (path_fetals, 'Fetals_'),
):
    rename_files(data_folder, prefix)