# BCRP Web Scraping

We will create a web scraper for the BCRP statistics database located at https://estadisticas.bcrp.gob.pe/estadisticas/series/

We will define three main functions:

```bcrp_search()```

```bcrp_scrapper()```

```download_graph()```

### Author: Esteban Cabrera (esteban.cabrera@pucp.edu.pe)
### December 2023

# Libraries

In [1]:
# We import all necessary libraries

# Selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Options driver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select

# Dataframes
import pandas as pd
import itertools
import os
from io import StringIO
import time
import requests

# Simulating human behavior
import time
from time import sleep
import random

# Clear data
import unidecode

# Json files
import json
import re
import numpy as np
import itertools
from pandas import json_normalize

# To use explicit waits
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Download files
import urllib.request
import requests
from openpyxl import Workbook

# # pytesseract
# from PIL import Image
# from io import BytesIO
# import pytesseract
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Code

### ```bcrp_search()```
We first define a function that allows us to search for different series in the BCRP Database website

In [2]:
def bcrp_search( series, frequency=None):
    '''
    Objective:
        Basic searcher for the BCRP Database website. It types the given text into the site's search box,
        reads the results table and returns it as a dataframe, optionally filtered by frequency.

    Input:
        series (str)    : The text to search for, i.e: 'Inflación', 'PBI', 'Tasa de referencia', etc.

        frequency (str) : Optional. When given, only the rows whose 'Frecuencia' column is exactly equal to
                          this value are kept, i.e: 'Mensual', 'Anual', 'Trimestral', etc. With the default
                          (None) no filter is applied.

    Output:
        A dataframe with the series listed by the site for the search. It contains three columns:
        Código      = Code of the series
        Descripción = Name of the series
        Frecuencia  = The frequency of the series
    '''

    url     = 'https://estadisticas.bcrp.gob.pe/estadisticas/series/'
    options = Options()
    options.add_argument( '--headless' )
    driver  = webdriver.Chrome( options = options )

    # The browser is always closed in the finally clause; otherwise every call
    # (including the ones that fail half-way) would leave a headless Chrome running.
    try:
        driver.get( url )
        driver.maximize_window()
        wait = WebDriverWait( driver, 10 )

        # Type the query into the search box and submit it with Enter
        search_box = wait.until( EC.element_to_be_clickable( ( By.XPATH, '//*[@id="txtbuscador"]' ) ) )
        search_box.send_keys( series )
        search_box.send_keys( Keys.RETURN )

        # Only the HTML of the results table is needed; once it is copied into a
        # string the browser is no longer required.
        table_element = wait.until( EC.element_to_be_clickable( ( By.XPATH, '//*[@id="consultadata"]' ) ) )
        table_html    = table_element.get_attribute( 'outerHTML' )
    finally:
        driver.quit()

    table_df = pd.read_html( StringIO( table_html ) )[ 0 ]

    # Remove the unnamed first column parsed from the table
    table_df = table_df.drop( columns=['Unnamed: 0'] )
    # The last parsed row is discarded (presumably a footer row of the table - TODO confirm)
    table_df = table_df[:-1]

    # If we specify the frequency, we keep only the rows that match the desired frequency
    if frequency is not None:
        table_df = table_df[table_df['Frecuencia'] == frequency]

    return table_df

First, we test the function

In [3]:
# Search for 'PBI' without any frequency filter
bcrp_search( series='PBI' )

Unnamed: 0,Código,Descripción,Frecuencia
0,PN39868FQ,Flujos macroeconómicos (porcentaje del PBI) - ...,Trimestral
1,PN39867FQ,Flujos macroeconómicos (porcentaje del PBI) - ...,Trimestral
2,PN39866FQ,Flujos macroeconómicos (porcentaje del PBI) - ...,Trimestral
3,PN39524FM,Operaciones del sector público no financiero (...,Mensual
4,PN39351BQ,Posición de activos y pasivos externos (porcen...,Trimestral
...,...,...,...
965,CD10438DA,"PIB nominal y real, agregado y per cápita, 193...",Anual
966,CD10437DA,"PIB nominal y real, agregado y per cápita, 193...",Anual
967,CD10435DA,"PIB nominal y real, agregado y per cápita, 193...",Anual
968,CD10434DA,"PIB nominal y real, agregado y per cápita, 193...",Anual


In [4]:
# Search for 'Inflación' without any frequency filter
bcrp_search( series='Inflación' )

Unnamed: 0,Código,Descripción,Frecuencia
0,PN09830PM,Inflación de socios comerciales - IPC Externo ...,Mensual
1,PN09828PM,Inflación de socios comerciales - IPC Externo ...,Mensual
2,PN01311PM,Índice de precios al consumidor Lima Metropoli...,Mensual
3,PN01310PM,Índice de precios al consumidor Lima Metropoli...,Mensual
4,PN01309PM,Índice de precios al consumidor Lima Metropoli...,Mensual
5,PN01308PM,Índice de precios al consumidor Lima Metropoli...,Mensual
6,PN01306PM,Índice de precios al consumidor Lima Metropoli...,Mensual
7,PN01305PM,Índice de precios al consumidor Lima Metropoli...,Mensual
8,PN01304PM,Índice de precios al consumidor Lima Metropoli...,Mensual
9,PN01303PM,Índice de precios al consumidor Lima Metropoli...,Mensual


We test how long the function takes to search for some variables, with and without specifying the frequency

In [5]:
# time.perf_counter() is the clock intended for measuring elapsed time: unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
bcrp_search('Inflación')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 8.395594358444214 seconds


In [6]:
# Same measurement, now with a frequency filter. time.perf_counter() is used
# because it is monotonic and unaffected by system clock adjustments.
st = time.perf_counter()
bcrp_search('Inflacion', 'Mensual')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 8.212276458740234 seconds


In [7]:
# Elapsed time of an unfiltered search, measured with the monotonic
# time.perf_counter() clock instead of the adjustable wall clock.
st = time.perf_counter()
bcrp_search('PBI')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 8.72754979133606 seconds


In [8]:
# Elapsed time of a search filtered by frequency, measured with the monotonic
# time.perf_counter() clock instead of the adjustable wall clock.
st = time.perf_counter()
bcrp_search('PBI', 'Trimestral')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 9.573150396347046 seconds


### ```bcrp_scrapper()```

We then define a function that allows us to do web scraping in the BCRP Database. We will call this function bcrp_scrapper. It will call one of four other functions: scrapper_diario, scrapper_mensual, scrapper_trimestral, scrapper_anual, which are used to scrape the series of the corresponding frequency.

In [9]:
def scrapper_diario( driver ):

    '''
    Objective:
        This function is called within bcrp_scrapper(). It is used to scrape daily series.

    Input:
        driver : The selenium driver created inside bcrp_scrapper(), already positioned on the page
                 that holds the daily table.

    Output:
        It returns a dataframe with the series, indexed by the parsed 'Fecha' column (datetime).
    '''

    # Spanish month abbreviation -> '-mm-'. Once the abbreviation is replaced, the date is expected
    # to read 'dd-mm-yy'. September is accepted both as 'Set' and as 'Sep', so that this function
    # agrees with scrapper_mensual() whichever spelling the page uses.
    month_map = {
        'Ene': '-01-', 'Feb': '-02-', 'Mar': '-03-', 'Abr': '-04-',
        'May': '-05-', 'Jun': '-06-', 'Jul': '-07-', 'Ago': '-08-',
        'Set': '-09-', 'Sep': '-09-', 'Oct': '-10-', 'Nov': '-11-',
        'Dic': '-12-',
    }

    table_element = driver.find_element(By.XPATH, '//*[@id="frmDiarias"]/div[3]/table')
    table_html    = table_element.get_attribute( 'outerHTML' )
    table_df      = pd.read_html( StringIO( table_html ) )[ 0 ]

    # Literal (non-regex) replacement of every month abbreviation
    fecha = table_df['Fecha']
    for abbreviation, number in month_map.items():
        fecha = fecha.str.replace(abbreviation, number, regex=False)

    table_df['Fecha'] = pd.to_datetime(fecha, format = '%d-%m-%y')

    # The date becomes the index and stops being a regular column
    table_df = table_df.set_index('Fecha')

    return table_df


In [10]:
def scrapper_mensual( driver ):

    '''
    Objective:
        This function is called within bcrp_scrapper(). It is used to scrape monthly series.

    Input:
        driver : The selenium driver created inside bcrp_scrapper(), already positioned on the page
                 that holds the monthly table.

    Output:
        It returns a dataframe with the series, indexed by the parsed 'Fecha' column (datetime).
        Every observation is dated on the first day of its month.
    '''

    # Spanish month abbreviation -> '01-mm-'. Once the abbreviation is replaced, the date is expected
    # to read '01-mm-yy'. September is accepted both as 'Sep' and as 'Set', so that this function
    # agrees with scrapper_diario() whichever spelling the page uses.
    month_map = {
        'Ene': '01-01-', 'Feb': '01-02-', 'Mar': '01-03-', 'Abr': '01-04-',
        'May': '01-05-', 'Jun': '01-06-', 'Jul': '01-07-', 'Ago': '01-08-',
        'Sep': '01-09-', 'Set': '01-09-', 'Oct': '01-10-', 'Nov': '01-11-',
        'Dic': '01-12-',
    }

    table_element = driver.find_element(By.XPATH, '//*[@id="frmMensual"]/div[3]/table')
    table_html    = table_element.get_attribute( 'outerHTML' )
    table_df      = pd.read_html( StringIO( table_html ) )[ 0 ]

    # Literal (non-regex) replacement of every month abbreviation
    fecha = table_df['Fecha']
    for abbreviation, prefix in month_map.items():
        fecha = fecha.str.replace(abbreviation, prefix, regex=False)

    table_df['Fecha'] = pd.to_datetime(fecha, format = '%d-%m-%y')

    # The date becomes the index and stops being a regular column
    table_df = table_df.set_index('Fecha')

    return table_df

In [11]:
def scrapper_trimestral( driver ):

    '''
    Objective:
        Helper of bcrp_scrapper() that reads the table of quarterly series from the page
        currently loaded in the driver.

    Input:
        driver : The selenium driver created inside bcrp_scrapper().

    Output:
        A dataframe with the series, indexed by the parsed 'Fecha' column (datetime).
        Quarters T1, T2, T3 and T4 are dated on the first day of March, June, September
        and December respectively.
    '''

    # Quarter label -> 'dd-mm-' prefix; the rest of the cell is parsed as a two-digit year
    quarter_prefix = {
        'T1': '01-03-',
        'T2': '01-06-',
        'T3': '01-09-',
        'T4': '01-12-',
    }

    html  = driver.find_element(By.XPATH, '//*[@id="frmTrimestral"]/div[3]/table').get_attribute( 'outerHTML' )
    frame = pd.read_html( StringIO( html ) )[ 0 ]

    dates = frame['Fecha']
    for label, prefix in quarter_prefix.items():
        dates = dates.str.replace(label, prefix)

    frame['Fecha'] = pd.to_datetime(dates, format = '%d-%m-%y')

    # Move 'Fecha' from the columns to the index
    return frame.set_index('Fecha')

In [12]:
def scrapper_anual( driver ):

    '''
    Objective:
        Helper of bcrp_scrapper() that reads the table of annual series from the page
        currently loaded in the driver.

    Input:
        driver : The selenium driver created inside bcrp_scrapper().

    Output:
        A dataframe with the series, indexed by the 'Fecha' column parsed as a four-digit
        year (datetime).
    '''

    html  = driver.find_element(By.XPATH, '//*[@id="frmAnual"]/div[3]/table').get_attribute( 'outerHTML' )
    frame = pd.read_html( StringIO( html ) )[ 0 ]

    # The cells of 'Fecha' only hold the year
    frame['Fecha'] = pd.to_datetime(frame['Fecha'], format="%Y")

    # Move 'Fecha' from the columns to the index
    return frame.set_index('Fecha')

In [13]:
def bcrp_scrapper( series , start_date , end_date , freq ):

    '''
    Objective:
        This function scrapes series from the BCRP Database and gives us a dataframe with the series.

    Input:
        series (str/list) : The code of the series we want to scrape, i.e: 'PN38705PM'. In case we want to scrape many series, we enter a list
                            with the codes, up to 10 codes, i.e: ['PN38706PM', 'PN38707PM', 'PN38708PM', 'PN38708PM']

        start_date (str)  : The starting date of the series. For daily series it must follow the pattern 'yyyy-mm-dd'. For other frequencies it must
                            follow 'yyyy-mm'. For annual series it can be specified just as 'yyyy'.

        end_date (str)    : The ending date of the series. For daily series it must follow the pattern 'yyyy-mm-dd'. For other frequencies it must
                            follow 'yyyy-mm'. For annual series it can be specified just as 'yyyy'.

        freq (str)        : The frequency of the series. It accepts one of the following values: 'Diario' (or 'Diaria'), 'Mensual', 'Trimestral',
                            'Anual'. Any other value is treated as 'Anual'. It is important that freq matches the frequency of the code/codes
                            in series.

    Output:
        It returns a dataframe with the series, indexed by date.
    '''

    base = 'https://estadisticas.bcrp.gob.pe/estadisticas/series/api/'

    # Several codes are requested at once by joining them with '-'
    if isinstance( series , list ):
        serie = '-'.join( series )
    else:
        serie = series

    url = base + serie + '/' + 'html' + '/' + start_date + '/' + end_date

    # Choose the scraper before opening the browser. Both spellings of the daily
    # frequency are accepted: 'Diario' is the value this function always matched and
    # 'Diaria' is the one that was documented. Unknown values keep falling back to
    # the annual scraper, as before.
    if freq in ( 'Diario' , 'Diaria' ):
        scrapper = scrapper_diario
    elif freq == 'Mensual':
        scrapper = scrapper_mensual
    elif freq == 'Trimestral':
        scrapper = scrapper_trimestral
    else:
        scrapper = scrapper_anual

    options = Options()
    options.add_argument( '--headless' )
    driver  = webdriver.Chrome( options = options )

    # The browser is always closed in the finally clause; otherwise every call
    # (including the ones that fail) would leave a headless Chrome running.
    try:
        driver.get( url )
        driver.maximize_window()
        table_df = scrapper( driver )
    finally:
        driver.quit()

    return table_df
        
    

We test the function with all four frequencies, giving each one a list of codes to scrape.

In [14]:
# Daily series: first rows of two series requested together
bcrp_scrapper( series=['PD04637PD', 'PD04638PD'] , start_date='2012-03-12' , end_date='2022-05-30' , freq='Diario' ).head()

Unnamed: 0_level_0,Tipo de cambio - TC Interbancario (S/ por US$) - Compra,Tipo de cambio - TC Interbancario (S/ por US$) - Venta
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-03-12,2.668,2.669
2012-03-13,2.669,2.67
2012-03-14,2.669,2.67
2012-03-15,2.671,2.673
2012-03-16,2.67,2.671


In [15]:
# Monthly series: first rows of three series requested together
bcrp_scrapper( series=['PN01288PM', 'PN01218PM', 'PN01219PM'] , start_date='2009-06' , end_date='2020-03' , freq='Mensual' ).head()

Unnamed: 0_level_0,Tipo de cambio - fin de periodo (S/ por US$) - Bancario - Venta,Tipo de cambio - fin de periodo (S/ por US$) - Bancario - Promedio,Índice de precios Lima Metropolitana (índice 2009 = 100) (descontinuada) - IPC Sin Alimentos
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-06-01,3.011,3.01,99.6
2009-07-01,2.987,2.986,99.75
2009-08-01,2.948,2.947,99.61
2009-09-01,2.885,2.884,99.65
2009-10-01,2.906,2.904,99.54


In [16]:
# Quarterly series: first rows of two series requested together
bcrp_scrapper( series=['PN39030BQ', 'PD37942PQ'] , start_date='2020-03' , end_date='2022-06' , freq='Trimestral' ).head()

Unnamed: 0_level_0,"Departamentos (Sector Alto): Barranco, La Molina, Miraflores, San Borja, San Isidro y Surco - Precios por m2 (S/ Constantes del 2009)",Balanza comercial - valores FOB (millones US$) - Exportaciones
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,4901,10298
2020-06-01,5095,6779
2020-09-01,5180,11689
2020-12-01,5278,14060
2021-03-01,5242,13594


In [17]:
# Same quarterly series, requested with a different date range
bcrp_scrapper( series=['PN39030BQ', 'PD37942PQ'] , start_date='2020-04' , end_date='2022-04' , freq='Trimestral' ).head()

Unnamed: 0_level_0,"Departamentos (Sector Alto): Barranco, La Molina, Miraflores, San Borja, San Isidro y Surco - Precios por m2 (S/ Constantes del 2009)",Balanza comercial - valores FOB (millones US$) - Exportaciones
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-01,4901,10298
2020-06-01,5095,6779
2020-09-01,5180,11689
2020-12-01,5278,14060
2021-03-01,5242,13594


In [18]:
# Annual series: first rows of two series requested together
bcrp_scrapper( series=['PD10021MA', 'PM05279PA'] , start_date='2009-01' , end_date='2023-01' , freq='Anual' ).head()

Unnamed: 0_level_0,Indicadores monetarios - Velocidad de Circulación (var. %),Índice del tipo de cambio - variaciones porcentuales (base 2009 = 100) - Compra
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-01-01,49,1000
2010-01-01,-96,938
2011-01-01,-69,915
2012-01-01,-121,876
2013-01-01,-99,897


Finally, we test how long the function takes

In [19]:
# Elapsed time of a daily request, measured with time.perf_counter(): unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
bcrp_scrapper( ['PD04637PD', 'PD04638PD'] , '2012-03-12' , '2022-05-30' , 'Diario' )
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 6.931246519088745 seconds


In [20]:
# Elapsed time of a monthly request, measured with time.perf_counter(): unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
bcrp_scrapper(['PN01288PM', 'PN01218PM', 'PN01219PM'], '2009-06', '2020-03', 'Mensual' )
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 8.25431203842163 seconds


In [21]:
# Elapsed time of a quarterly request, measured with time.perf_counter(): unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
bcrp_scrapper(['PN39030BQ', 'PD37942PQ'], '2020-04', '2022-04', 'Trimestral')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 6.855939626693726 seconds


In [22]:
# Elapsed time of an annual request, measured with time.perf_counter(): unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
bcrp_scrapper(['PD10021MA', 'PM05279PA'], '2009-01', '2023-01', 'Anual')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 6.683593511581421 seconds


### ```download_graph()```

We now create a function that allows us to download a graph of the series in three different formats: png, jpg and pdf.

In [23]:
def download_graph( series , start_date , end_date , format= 'png'):
    '''
    Objective:
        This function opens the graph of one or more series of the BCRP Database and downloads it in the given format.

    Input:
        series (str/list) : The code of the series we want to graph, i.e: 'PN38705PM'. In case we want many series, we enter a list
                            with the codes, up to 10 codes, i.e: ['PN38706PM', 'PN38707PM', 'PN38708PM', 'PN38708PM']

        start_date (str)  : The starting date of the series. For daily series it must follow the pattern 'yyyy-mm-dd'. For other frequencies it must
                            follow 'yyyy-mm'. For annual series it can be specified just as 'yyyy'.

        end_date (str)    : The ending date of the series. For daily series it must follow the pattern 'yyyy-mm-dd'. For other frequencies it must
                            follow 'yyyy-mm'. For annual series it can be specified just as 'yyyy'.

        format (str)      : It tells us the format of the download. It can be 'jpg', 'png', 'pdf'. By default it downloads a png picture.

    Output:
        Nothing is returned. The file is downloaded by the browser in the given format.

    Raises:
        ValueError : If format is not one of 'jpg', 'png', 'pdf'.
    '''

    # Download button of each supported format inside the chart window
    download_buttons = {
        'jpg': '//*[@id="chart-selector"]/li[1]/img',
        'png': '//*[@id="chart-selector"]/li[2]/img',
        'pdf': '//*[@id="chart-selector"]/li[3]/img',
    }

    # Reject unsupported formats before a browser is launched, instead of
    # silently downloading nothing.
    if format not in download_buttons:
        raise ValueError( "format must be one of 'jpg', 'png', 'pdf'; got " + repr( format ) )

    base = 'https://estadisticas.bcrp.gob.pe/estadisticas/series/api/'

    # Several codes are requested at once by joining them with '-'
    if isinstance( series , list ):
        serie = '-'.join( series )
    else:
        serie = series

    url = base + serie + '/' + 'html' + '/' + start_date + '/' + end_date

    options = Options()
    options.add_argument( "--headless=new" )
    driver  = webdriver.Chrome( options = options )

    # The browser is always closed in the finally clause; otherwise every call
    # (including the ones that fail) would leave a headless Chrome running.
    try:
        driver.get( url )
        driver.maximize_window()

        driver.find_element(By.XPATH, '//*[@id="btnGrafico"]').click()

        # The graph opens in a second window. We wait until that window exists
        # before switching, instead of indexing window_handles right after the click.
        WebDriverWait( driver , 10 ).until( EC.number_of_windows_to_be( 2 ) )
        driver.switch_to.window( driver.window_handles[1] )

        driver.find_element(By.XPATH, download_buttons[format]).click()

        # Fixed pause so the download can finish before the browser is closed
        time.sleep(4)
    finally:
        driver.quit()

    return
           

We now test the function with different parameters

In [24]:
# Download the graph as a jpg image
download_graph( series='PN01288PM' , start_date='2020-04' , end_date='2023-01' , format='jpg' )

In [25]:
# Download the graph as a pdf file
download_graph( series='PN01288PM' , start_date='2020-04' , end_date='2023-01' , format='pdf' )

In [26]:
# Download the graph as a png image
download_graph( series='PN01288PM' , start_date='2020-04' , end_date='2023-01' , format='png' )

We test how long the function takes

In [27]:
# Elapsed time of a pdf download, measured with time.perf_counter(): unlike
# time.time() it is monotonic and is not affected by system clock adjustments.
st = time.perf_counter()
download_graph( 'PN01288PM' , '2020-04' , '2023-01' , 'pdf')
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 12.376088380813599 seconds
