**1. Libraries**

In [1]:
from requests_html import HTMLSession
import pandas as pd
import re, os, urllib, shutil

**2. Directories**

In [2]:
## Creating directories
# Downloads are stored under ./Projections, with one sub-folder per geographic level.
path_raw = os.path.join(os.getcwd(), 'Projections')
path_national = os.path.join(path_raw, 'National')
path_departmental = os.path.join(path_raw, 'Departmental')
path_municipal = os.path.join(path_raw, 'Municipal')
# makedirs(exist_ok=True) creates any missing parent and does nothing for a folder
# that already exists, so this cell can be re-run safely and there is no gap
# between "check" and "create".
for folder in (path_raw, path_national, path_departmental, path_municipal):
    os.makedirs(folder, exist_ok=True)

**3. Connecting**

In [3]:
# Open a requests_html session; the page request below goes through it.
session = HTMLSession()

In [4]:
# Population-projections page on dane.gov.co; the workbook links are scraped from it.
url = 'https://www.dane.gov.co/index.php/estadisticas-por-tema/demografia-y-poblacion/proyecciones-de-poblacion'

In [5]:
# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error instead
# of silently scraping an error page.
response = session.get(url, timeout=60)
response.raise_for_status()

**3.1 Scraping**

In [6]:
# Links found on the page, as exposed by requests_html's `absolute_links`
# (presumably a set of absolute URLs — confirm against the library docs).
links = response.html.absolute_links

In [7]:
# Sort the scraped links into one list per geographic level, based on the
# workbook file name at the end of each URL.
National = []
Departmental = []
Municipal = []
# Raw strings with an escaped dot, so that only a literal ".xlsx" suffix matches.
level_patterns = (
    (re.compile(r'anexo-(area-sexo-edad-)*proyecciones-poblacion-Nacional[0-9]{4}_[0-9]{4}\.xlsx$'), National),
    (re.compile(r'anexo-(area-sexo-edad-)*proyecciones-poblacion-departamental_[0-9]{4}-[0-9]{4}\.xlsx$'), Departmental),
    (re.compile(r'anexo-(area-sexo-edad-)*proyecciones-poblacion-Municipal_[0-9]{4}-[0-9]{4}\.xlsx$'), Municipal),
)
# Iterate in sorted order so each list (and therefore the order in which the
# workbooks are later concatenated) is the same on every run, whatever order
# `links` happens to be in.
for link in sorted(links):
    for level_pattern, bucket in level_patterns:
        if level_pattern.search(link):
            bucket.append(link)
            break  # a link belongs to at most one level

**3.2 Download and export**

In [8]:
def get_csv(links, pattern, path_level,csv_name):
    """Download the workbooks in `links`, reshape them to long format and write one CSV.

    Each workbook is read with `pd.read_excel(..., skiprows=11)`, all workbooks are
    stacked, and every column from 'Hombres_0' onwards is melted into rows. The
    melted column name is split on the first '_' into 'Sexo' and 'Edad'.

    Parameters
    ----------
    links : iterable of str
        URLs of the Excel workbooks to process.
    pattern : str
        Regular expression; its first match inside each URL is used as the
        local file name.
    path_level : str
        Folder where the workbooks are stored. A workbook already present
        there is not downloaded again.
    csv_name : str
        Path of the CSV file to write (UTF-8 with BOM, no index column).

    Returns
    -------
    None
        The result is only written to `csv_name`.

    Raises
    ------
    ValueError
        If `links` is empty or a URL does not match `pattern`.
    """
    # Imported here so the submodule is guaranteed to be loaded: a bare
    # `import urllib` does not import `urllib.request`.
    from urllib.request import urlretrieve

    frames = []
    for web in links:
        found = re.findall(pattern, web)
        if not found:
            raise ValueError('link does not match pattern {!r}: {}'.format(pattern, web))
        name = found[0]
        path = os.path.join(path_level, name)

        # Only hit the network when the workbook is not already on disk.
        if not os.path.isfile(path):
            print('Reading', path)
            urlretrieve(web, path)
            print('File downloaded')
        # assumes the column headers sit on the 12th row of every workbook — TODO confirm
        current = pd.read_excel(path, skiprows=11)
        print(current.head(2))
        frames.append(current)
        print(name, 'read')

    if not frames:
        raise ValueError('no links to process')

    # Concatenate once instead of growing a frame inside the loop.
    df_all = pd.concat(frames, ignore_index=True)

    # Identifier columns run up to 'ÁREA GEOGRÁFICA'; everything from
    # 'Hombres_0' onwards is a value column to be turned into rows.
    cols_to_split = df_all.loc[:, 'Hombres_0':].columns.tolist()
    ids = df_all.loc[:, :'ÁREA GEOGRÁFICA'].columns.tolist()
    print('ids and split identified')
    new_all = pd.melt(df_all, id_vars=ids, value_vars=cols_to_split,
                      var_name='Sexo_edad', value_name='No_personas')

    print('col to row')
    # NOTE(review): column names without '_' (e.g. 'Total Hombres' in the
    # workbooks) end up with an empty 'Edad' — confirm that this is intended.
    new_all[["Sexo", "Edad"]] = new_all['Sexo_edad'].str.split(pat='_', n=1, expand=True)
    print('sex and age splited')
    # drop() returns a new frame; the result has to be kept, otherwise
    # 'Sexo_edad' is still written to the CSV.
    new_all = new_all.drop(columns=['Sexo_edad'])

    new_all.to_csv(csv_name, encoding='utf-8-sig', index=False)

In [25]:
# Build the output path with os.path.join so it does not depend on the Windows '\\' separator.
get_csv(National, "Nacional[0-9]{4}_[0-9]{4}.xlsx$", path_national, os.path.join(path_raw, 'National.csv'))

Reading C:\Users\david\Documents\Projects\1_Own_Projects\Demographic\Projections\National\Nacional1950_1984.xlsx
File downloaded
   DP     DPNOM   AÑO                    ÁREA GEOGRÁFICA  Hombres_0  \
0   0  Nacional  1950                           Cabecera      78665   
1   0  Nacional  1950  Centros Poblados y Rural Disperso     184737   

   Hombres_1  Hombres_2  Hombres_3  Hombres_4  Hombres_5  ...  Total_94  \
0      73308      70757      68626      66636      64726  ...       121   
1     170191     162832     156622     150955     145789  ...       234   

   Total_95  Total_96  Total_97  Total_98  Total_99  Total_100  Total Hombres  \
0        57        24         9         3         1        873        2433730   
1       109        45        18         6         3       1044        4931650   

   Total Mujeres    Total  
0        2757803  5191533  
1        3447760  8379410  

[2 rows x 310 columns]
Nacional1950_1984.xlsx read
Reading C:\Users\david\Documents\Projects\1_Own_Pro

In [9]:
# Build the output path with os.path.join so it does not depend on the Windows '\\' separator.
get_csv(Departmental, "departamental_[0-9]{4}-[0-9]{4}.xlsx$", path_departmental, os.path.join(path_raw, 'Departmental.csv'))

Reading C:\Users\david\Documents\Projects\1_Own_Projects\Demographic\Projections\Departmental\departamental_2005-2017.xlsx
File downloaded
   DP      DPNOM   AÑO                    ÁREA GEOGRÁFICA  Hombres_0  \
0   5  Antioquia  2005                 Cabecera Municipal      31263   
1   5  Antioquia  2005  Centros Poblados y Rural Disperso      17346   

   Hombres_1  Hombres_2  Hombres_3  Hombres_4  Hombres_5  ...  Total_94  \
0      32006      32827      33689      34587      35538  ...       725   
1      17392      17518      17661      17781      17931  ...       221   

   Total_95  Total_96  Total_97  Total_98  Total_99  Total_100 y más  \
0       580       451       363       303       244              700   
1       164       136       105        76        70              155   

   Total Hombres  Total Mujeres    Total  
0        1893416        2143679  4037095  
1         737371         686377  1423748  

[2 rows x 310 columns]
departamental_2005-2017.xlsx read
Reading C:\Use

In [None]:
# Build the output path with os.path.join so it does not depend on the Windows '\\' separator.
get_csv(Municipal, "Municipal_[0-9]{4}-[0-9]{4}.xlsx$", path_municipal, os.path.join(path_raw, 'Municipal.csv'))

Reading C:\Users\david\Documents\Projects\1_Own_Projects\Demographic\Projections\Municipal\Municipal_1985-1994.xlsx
File downloaded
   DP      DPNOM  DPMP      MPIO   AÑO                    ÁREA GEOGRÁFICA  \
0   5  Antioquia  5001  Medellín  2027                 Cabecera Municipal   
1   5  Antioquia  5001  Medellín  2027  Centros Poblados y Rural Disperso   

   Hombres_0  Hombres_1  Hombres_2  Hombres_3  ...  Total_94  Total_95  \
0      14542      14744      14916      15081  ...      1022       822   
1        270        269        270        271  ...        11         8   

   Total_96  Total_97  Total_98  Total_99  Total_100 y más  Total Hombres  \
0       649       515       401       313             1055        1314478   
1         6         5         4         3                7          21908   

   Total Mujeres    Total  
0        1469827  2784305  
1          21272    43180  

[2 rows x 312 columns]
Municipal_2027-2035.xlsx read
Reading C:\Users\david\Documents\Projects\1

**4. Remove directories**

In [10]:
# Delete the per-level download folders; path_raw itself is left in place.
# The isdir guard lets the cell be re-run without failing on a folder that is
# already gone.
for folder in (path_national, path_departmental, path_municipal):
    if os.path.isdir(folder):
        shutil.rmtree(folder)