**1. Libraries**

In [1]:
from requests_html import HTMLSession
import pandas as pd
import re, os, urllib, shutil

**2. Directories**

In [2]:
## Creating directories
path_raw = os.path.join(os.getcwd(), 'Projections')
path_national = os.path.join(path_raw, 'National')
path_departmental = os.path.join(path_raw, 'Departmental')
path_municipal = os.path.join(path_raw, 'Municipal')
if os.path.exists(path_raw) == False: 
    os.mkdir(path_raw)
if os.path.exists(path_national) == False: 
    os.mkdir(path_national)
if os.path.exists(path_departmental) == False: 
    os.mkdir(path_departmental)
if os.path.exists(path_municipal) == False: 
    os.mkdir(path_municipal)

**3. Connecting**

In [3]:
session = HTMLSession()

In [4]:
url = 'https://www.dane.gov.co/index.php/estadisticas-por-tema/demografia-y-poblacion/proyecciones-de-poblacion'

In [5]:
response = session.get(url)

**3.1 Scrapping**

In [6]:
links = response.html.absolute_links

In [7]:
National = list()
Departmental = list()
Municipal = list()
for link in links:
    if re.search('DCD-area-sexo-edad-proypoblacion-Nac-[0-9]{4}-[0-9]{4}.xlsx$', link):
        National.append(link)
        continue
    if re.search('DCD-area-sexo-edad-proye*poblacion-dep-[0-9]{4}-[0-9]{4}.*\.xlsx$', link):
        Departmental.append(link)
        continue
    if re.search('DCD-area-sexo-edad-proye*poblacion-Mun-[0-9]{4}-[0-9]{4}.*\.xlsx$', link):
        Municipal.append(link)
        continue

**3.2 Download and export**

In [8]:
def get_csv(links, pattern, path_level,csv_name):
    df_all = pd.DataFrame()
    for web in links:
            name = re.findall(pattern, web)[0]
            path = os.path.join(path_level, name)
            
            if os.path.isfile(path) == False:
                print('Reading', path)
                urllib.request.urlretrieve(web, path)
            print('File downloaded')
            current = pd.read_excel(path, skiprows=11, dtype=str)
            # Find the index of the row where the table ends
            end_index = None
            for i, row in current.iterrows():
                if pd.isnull(row[0]):  # Assuming the first column is used to determine the end of the table
                    end_index = i
                    break

            # Filter out the unwanted rows
            if end_index is not None:
                current = current.iloc[:end_index]
            print(current.head(2))
            df_all = pd.concat([df_all, current]).reset_index(drop=True)
            print(name, 'read')
    
    cols_to_split = df_all.loc[:,'Hombres_0':].columns.values
    ids = df_all.loc[:,:'ÁREA GEOGRÁFICA'].columns.values
    print('ids and split identified')
    new_all = pd.melt(df_all, id_vars =ids, value_vars =cols_to_split, var_name ='Sexo_edad', value_name ='No_personas')
    new_all = new_all.rename(columns={'ÁREA GEOGRÁFICA': 'Area'})

    print('col to row')
    new_all[["Sexo", "Edad"]] = new_all.Sexo_edad.str.split(pat='_', n = 1, expand = True)
    print('sex and age splited')
    new_all = new_all.drop(columns=['DPNOM','Sexo_edad'])
    
    new_all.to_csv(csv_name, encoding='utf-8-sig', index=False)

In [9]:
get_csv(National, "Nac-[0-9]{4}-[0-9]{4}.xlsx$", path_national, 'Projections\\National.csv')

Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\National\Nac-2020-2070.xlsx
File downloaded
   DP           DPNOM   AÑO                    ÁREA GEOGRÁFICA Hombres_0  \
0  00  Total Nacional  2020                           Cabecera    270528   
1  00  Total Nacional  2020  Centros Poblados y Rural Disperso    117791   

  Hombres_1 Hombres_2 Hombres_3 Hombres_4 Hombres_5  ... Total_94 Total_95  \
0    273271    277416    279302    280802    281974  ...    14749    12496   
1    117443    117932    117746    117428    116950  ...     6167     5084   

  Total_96 Total_97 Total_98 Total_99 Total_100 y más Total Hombres  \
0    10252     8304     6834     5878           18782      18312462   
1     4206     3346     2304     2045            5084       6315052   

  Total Mujeres     Total  
0      19922767  38235229  
1       5857366  12172418  

[2 rows x 310 columns]
Nac-2020-2070.xlsx read
Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\

In [10]:
get_csv(Departmental, "dep-[0-9]{4}-[0-9]{4}.*\.xlsx$", path_departmental, 'Projections\\Departmental.csv')

Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\Departmental\dep-1985-1992.xlsx
File downloaded
   DP      DPNOM   AÑO                    ÁREA GEOGRÁFICA Hombres_0 Hombres_1  \
0  05  Antioquia  1985                           Cabecera     26387     26250   
1  05  Antioquia  1985  Centros Poblados y Rural Disperso     22738     22154   

  Hombres_2 Hombres_3 Hombres_4 Hombres_5  ... Total_94 Total_95 Total_96  \
0     26238     26189     25979     25463  ...      995      872      707   
1     21745     21324     20742     19954  ...      504      407      351   

  Total_97 Total_98 Total_99 Total_100 y más Total Hombres Total Mujeres  \
0      567      439      271             581       1232924       1324913   
1      270      177      125             209        744430        692717   

     Total  
0  2557837  
1  1437147  

[2 rows x 310 columns]
dep-1985-1992.xlsx read
Reading C:\Users\romer\Documents\Projects\2_Demographic_data\Projections\Departmental\d

In [12]:
get_csv(Municipal, "Mun-[0-9]{4}-[0-9]{4}.*\.xlsx$", path_municipal,'Projections\\Municipal.csv')

File downloaded
   DP      DPNOM   DPMP      MPIO   AÑO                    ÁREA GEOGRÁFICA  \
0  05  Antioquia  05001  Medellín  1995                 Cabecera Municipal   
1  05  Antioquia  05001  Medellín  1995  Centros Poblados y Rural Disperso   

  Hombres_0 Hombres_1 Hombres_2 Hombres_3  ... Total_79 Total_80 Total_81  \
0     16228     16363     16592     16586  ...     2789     2535     2309   
1       681       663       653       636  ...       69       63       58   

  Total_82 Total_83 Total_84 Total_85 y más Total Hombres Total Mujeres  \
0     2068     1810     1556           8346        789154        915073   
1       53       48       43            280         25859         26675   

  Total General  
0       1704227  
1         52534  

[2 rows x 267 columns]
Mun-1995-2004.xlsx read
File downloaded
   DP      DPNOM      DPMP   MPIO   AÑO                    ÁREA GEOGRÁFICA  \
0  05  Antioquia  Medellín  05001  2005                 Cabecera Municipal   
1  05  Antioquia 

KeyError: 0

**4. Remove directories**

In [None]:
shutil.rmtree(path_national)
shutil.rmtree(path_departmental)
shutil.rmtree(path_municipal)