**1. Libraries**

In [1]:
from requests_html import HTMLSession
import pandas as pd
import re, os, urllib, shutil

**2. Directories**

In [2]:
## Creating directories
# Everything is stored under ./Projections (relative to the current working
# directory), with one sub-folder per projection level for the downloaded workbooks.
path_raw = os.path.join(os.getcwd(), 'Projections')
path_national = os.path.join(path_raw, 'National')
path_departmental = os.path.join(path_raw, 'Departmental')
path_municipal = os.path.join(path_raw, 'Municipal')
# makedirs(exist_ok=True) creates the parent folder when needed and does nothing
# when the folder is already there, so this cell can be re-run safely.
for _level_dir in (path_national, path_departmental, path_municipal):
    os.makedirs(_level_dir, exist_ok=True)

**3. Connecting**

In [3]:
# HTTP session from requests_html; it is used below to request the page.
session = HTMLSession()

In [4]:
# Page on dane.gov.co whose links are scanned below for the projection workbooks (.xlsx).
url = 'https://www.dane.gov.co/index.php/estadisticas-por-tema/demografia-y-poblacion/proyecciones-de-poblacion'

In [5]:
# Fetch the page. The timeout keeps the cell from hanging forever on an unresponsive
# server, and raise_for_status() stops here on an HTTP error status instead of
# letting the following cells scrape an error page.
response = session.get(url, timeout=60)
response.raise_for_status()

**3.1 Scraping**

In [6]:
# All links found on the page, as absolute URLs.
# NOTE(review): presumably an unordered set -- confirm against the requests_html docs.
links = response.html.absolute_links

In [7]:
# Bucket the workbook URLs by projection level, based on the file name at the end of
# each link (e.g. '...proypoblacion-Nac-1950-2019.xlsx').
National = []
Departmental = []
Municipal = []
# sorted(): iterate in a fixed order so the workbooks are later downloaded and
# concatenated in the same order on every run.
for link in sorted(links):
    # Raw strings with an escaped dot: only a literal '.xlsx' extension matches.
    if re.search(r'DCD-area-sexo-edad-proypoblacion-Nac-[0-9]{4}-[0-9]{4}\.xlsx$', link):
        National.append(link)
    elif re.search(r'DCD-area-sexo-edad-proypoblacion-dep-[0-9]{4}-[0-9]{4}\.xlsx$', link):
        Departmental.append(link)
    elif re.search(r'DCD-area-sexo-edad-proypoblacion-Mun-[0-9]{4}-[0-9]{4}\.xlsx$', link):
        Municipal.append(link)

**3.2 Download and export**

In [8]:
def get_csv(links, pattern, path_level, csv_name, skiprows=11):
    """Download the workbooks in ``links``, stack them, reshape to long format, save as CSV.

    For each URL the local file name is the first match of ``pattern`` in the URL.
    The workbook is downloaded into ``path_level`` unless a file with that name is
    already there. All workbooks are read with ``pd.read_excel`` and concatenated.
    Every column from ``Hombres_0`` onwards is melted into rows (``Sexo_edad`` /
    ``No_personas``), keeping the columns up to ``ÁREA GEOGRÁFICA`` as identifiers.
    ``Sexo_edad`` is split on its first underscore into ``Sexo`` and ``Edad`` and is
    then dropped before the result is written to ``csv_name``.

    Parameters
    ----------
    links : iterable of str
        URLs of the Excel workbooks.
    pattern : str
        Regular expression that extracts the file name from each URL.
    path_level : str
        Existing folder where the downloaded workbooks are stored.
    csv_name : str
        Path of the CSV file to write (encoded as utf-8-sig, without the index).
    skiprows : int, default 11
        Passed to ``pd.read_excel``: rows to skip at the top of each sheet.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``links`` is empty.
    IndexError
        If ``pattern`` does not match one of the URLs.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded.
    from urllib.request import urlretrieve

    links = list(links)
    if not links:
        raise ValueError('get_csv received no links to download')

    frames = []
    for web in links:
        name = re.findall(pattern, web)[0]
        path = os.path.join(path_level, name)

        if not os.path.isfile(path):
            print('Reading', path)
            urlretrieve(web, path)
            # Only reported when a download actually happened.
            print('File downloaded')
        current = pd.read_excel(path, skiprows=skiprows)
        print(current.head(2))
        frames.append(current)
        print(name, 'read')

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_all = pd.concat(frames, ignore_index=True)

    cols_to_split = list(df_all.loc[:, 'Hombres_0':].columns)
    ids = list(df_all.loc[:, :'ÁREA GEOGRÁFICA'].columns)
    print('ids and split identified')
    new_all = pd.melt(df_all, id_vars=ids, value_vars=cols_to_split,
                      var_name='Sexo_edad', value_name='No_personas')

    print('col to row')
    # NOTE: melted column names without an underscore (e.g. 'Total Hombres') get a
    # missing value in 'Edad'.
    new_all[["Sexo", "Edad"]] = new_all.Sexo_edad.str.split(pat='_', n=1, expand=True)
    print('sex and age splited')
    # drop() returns a new frame; the result must be kept for the column to go away.
    new_all = new_all.drop(columns=['Sexo_edad'])

    new_all.to_csv(csv_name, encoding='utf-8-sig', index=False)

In [9]:
# Build the output path with os.path.join so it does not depend on the Windows
# path separator; the dot before 'xlsx' is escaped so it only matches a literal dot.
get_csv(National, r"Nac-[0-9]{4}-[0-9]{4}\.xlsx$", path_national, os.path.join(path_raw, 'National.csv'))

Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\National\Nac-1950-2019.xlsx
File downloaded
    DP     DPNOM     AÑO                    ÁREA GEOGRÁFICA  Hombres_0  \
0  0.0  Nacional  1950.0                           Cabecera    78665.0   
1  0.0  Nacional  1950.0  Centros Poblados y Rural Disperso   184737.0   

   Hombres_1  Hombres_2  Hombres_3  Hombres_4  Hombres_5  ...  Total_94  \
0    73308.0    70757.0    68626.0    66636.0    64726.0  ...     121.0   
1   170191.0   162832.0   156622.0   150955.0   145789.0  ...     234.0   

   Total_95  Total_96  Total_97  Total_98  Total_99  Total_100  Total Hombres  \
0      57.0      24.0       9.0       3.0       1.0      873.0      2433730.0   
1     109.0      45.0      18.0       6.0       3.0     1044.0      4931650.0   

   Total Mujeres      Total  
0      2757803.0  5191533.0  
1      3447760.0  8379410.0  

[2 rows x 310 columns]
Nac-1950-2019.xlsx read
Reading C:\Users\romer\Documents\Projects\2_Demograp

In [10]:
# Build the output path with os.path.join so it does not depend on the Windows
# path separator; the dot before 'xlsx' is escaped so it only matches a literal dot.
get_csv(Departmental, r"dep-[0-9]{4}-[0-9]{4}\.xlsx$", path_departmental, os.path.join(path_raw, 'Departmental.csv'))

Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\Departmental\dep-2005-2019.xlsx
File downloaded
   DP      DPNOM   AÑO                    ÁREA GEOGRÁFICA  Hombres_0  \
0   5  Antioquia  2005                           Cabecera      31263   
1   5  Antioquia  2005  Centros Poblados y Rural Disperso      17346   

   Hombres_1  Hombres_2  Hombres_3  Hombres_4  Hombres_5  ...  Total_94  \
0      32006      32827      33689      34587      35538  ...       725   
1      17392      17518      17661      17781      17931  ...       221   

   Total_95  Total_96  Total_97  Total_98  Total_99  Total_100  Total Hombres  \
0       580       451       363       303       244        700        1893416   
1       164       136       105        76        70        155         737371   

   Total Mujeres  Total general  
0        2143679        4037095  
1         686377        1423748  

[2 rows x 310 columns]
dep-2005-2019.xlsx read
Reading C:\Users\romer\Documents\Projects\

In [11]:
# Build the output path with os.path.join so it does not depend on the Windows
# path separator; the dot before 'xlsx' is escaped so it only matches a literal dot.
get_csv(Municipal, r"Mun-[0-9]{4}-[0-9]{4}\.xlsx$", path_municipal, os.path.join(path_raw, 'Municipal.csv'))

Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\Municipal\Mun-2005-2019.xlsx
File downloaded
   DP      DPNOM      DPMP  MPIO   AÑO                    ÁREA GEOGRÁFICA  \
0   5  Antioquia  Medellín  5001  2005                 Cabecera Municipal   
1   5  Antioquia  Medellín  5001  2005  Centros Poblados y Rural Disperso   

   Hombres_0  Hombres_1  Hombres_2  Hombres_3  ...  Total_79  Total_80  \
0      13812      14233      14680      15148  ...      4352      3944   
1        489        493        499        503  ...        83        76   

   Total_81  Total_82  Total_83  Total_84  Total_85 y más  Total Hombres  \
0      3547      3132      2776      2540           12870         927942   
1        67        60        52        48             231          23924   

   Total Mujeres  Total General  
0        1069821        1997763  
1          24654          48578  

[2 rows x 267 columns]
Mun-2005-2019.xlsx read
Reading C:\Users\romer\Documents\Projects\2_Demo

**4. Remove directories**

In [12]:
# Delete the three folders holding the downloaded workbooks; path_raw itself is
# left in place.
for workbook_dir in (path_national, path_departmental, path_municipal):
    # Guard so that re-running this cell after the folders are gone does not raise
    # FileNotFoundError.
    if os.path.isdir(workbook_dir):
        shutil.rmtree(workbook_dir)