# Procurement

In [5]:
import pandas as pd
import requests
import re
from lxml import html

## Offices

In [6]:
def normalize_acronym(acron):
    """Convert an institution acronym into a lowercase, hyphen-separated key.

    Each non-word character is first replaced by a space, then every run of
    whitespace is collapsed into a single '-'. Word characters (letters,
    digits, underscore) are kept as they are, and the result is lower-cased.
    """
    non_word = re.compile(r'[^\w]')
    blank_run = re.compile(r'\s+')
    spaced = non_word.sub(' ', acron)
    return blank_run.sub('-', spaced).lower()

In [7]:
# Load the institution catalogue and attach a normalized key derived from
# each acronym, then display the three columns of interest.
offices = pd.read_json('https://www.transparencia.gob.sv/api/v1/institutions.json')
offices = offices.assign(key=offices['acronym'].map(normalize_acronym))
offices.loc[:, ['name', 'acronym', 'key']]

Unnamed: 0,name,acronym,key
0,Registro Nacional de las Personas Naturales,RNPN,rnpn
1,Instituto Nacional de los Deportes de El Salvador,INDES,indes
2,Instituto Salvadoreño para el Desarrollo de la...,ISDEMU,isdemu
3,Fondo de Inversión Social para el Desarrollo L...,FISDL / FINET,fisdl-finet
4,Fondo Solidario para la Familia Microempresaria,FOSOFAMILIA,fosofamilia
...,...,...,...
95,Alcaldía Municipal de El Porvenir,"El Porvenir, Santa Ana",el-porvenir-santa-ana
96,Alcaldía Municipal de Candelaria,"Candelaria, Cuscatlán",candelaria-cuscatlán
97,Alcaldía Municipal de San Pedro Masahuat,"San Pedro Masahuat, La Paz",san-pedro-masahuat-la-paz
98,Alcaldía Municipal de Santiago Texancuangos,"Santiago Texancuangos, San Salvador",santiago-texancuangos-san-salvador


## Compras y contrataciones

In [8]:
def contract_detail(url):
    """Scrape one contract detail page into a flat dict.

    The detail card is read as alternating divs: a label div (whose
    ``<strong>`` text becomes the key) followed by a value div (whose first
    text node, stripped, becomes the value).

    Parameters
    ----------
    url : str
        Address of the contract detail page.

    Returns
    -------
    dict
        Label -> value pairs, plus the key 'Archivo adjunto' holding the
        attachment href, or None when the page has no attachment link.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    ValueError
        When the page contains none of the expected detail divs.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl;
    # raise_for_status stops an error page from being parsed as a contract.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = html.fromstring(response.text)

    sections = page.xpath('/html/body/main/div/div/div[2]/div[2]/div/div/div')
    if not sections:
        raise ValueError('no contract detail fields found at %s' % url)

    card = {}
    # Pair label/value divs with zip so a trailing unpaired div is ignored
    # instead of raising IndexError.
    for label_div, value_div in zip(sections[0::2], sections[1::2]):
        label = label_div.xpath('strong/text()')
        if not label:
            # A row without a label cannot be keyed; skip it.
            continue
        value = value_div.xpath('text()')
        card[label[0]] = value[0].strip() if value else ''

    # Keep the record even when there is no attachment link.
    attachment = page.xpath('/html/body/main/div/div/div[2]/div[2]/div/div[12]/div[2]/a/@href')
    card['Archivo adjunto'] = attachment[0] if attachment else None
    return card

In [9]:
def contract_list(office):
    """Collect the contract detail links published for one office.

    Fetches the office's contracts listing, reads the last page number from
    the pagination links, then walks pages 2..last and accumulates every
    contract href found.

    Parameters
    ----------
    office : str
        Office key as used in the site URL (see ``normalize_acronym``).

    Returns
    -------
    list
        Contract hrefs in page order; empty when the listing has none.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    """
    link_xpath = '//div[@class="columns small-24 medium-16 large-12"]/a/@href'
    url = 'https://www.transparencia.gob.sv/institutions/{}/contracts'.format(office)

    def fetch(page_url):
        # The timeout prevents an indefinite hang; raise_for_status stops an
        # HTTP error page from being parsed as an empty listing.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        return html.fromstring(response.text)

    page = fetch(url)
    contracts = list(page.xpath(link_xpath))

    # The last page is the largest numeric pagination link. This does not
    # depend on a 'Siguiente' link being present or spelled exactly that way,
    # and it yields 1 (no extra pages) when there is no pagination at all.
    pagination = page.xpath('//div[@class="pagination"]/a/text()')
    page_numbers = [int(text) for text in pagination if text.strip().isdecimal()]
    last_page = max(page_numbers, default=1)

    for i in range(2, last_page + 1):
        contracts += fetch(url + '?page=%d' % i).xpath(link_xpath)
    return contracts

In [10]:
def build_list():
    """Build the table of contract links for every listed institution.

    Downloads the institution catalogue, derives each office key with
    ``normalize_acronym``, writes the key/name pairs to 'offices.csv', and
    calls ``contract_list`` for every key. One progress line is printed per
    office: the number of links found, or the failure reason.

    Returns
    -------
    pandas.DataFrame
        Columns 'office' and 'list' (one row per contract link). The frames
        are concatenated as they are, so the index restarts for each office.
        Empty, with those two columns, when no office could be scraped.
    """
    offices = pd.read_json('https://www.transparencia.gob.sv/api/v1/institutions.json')
    offices['key'] = offices['acronym'].apply(normalize_acronym)
    offices[['key', 'name']].to_csv('offices.csv', index=False)
    dfs = []
    for k in offices['key'].values:
        try:
            cl = contract_list(k)
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) can still stop a long crawl; report why
            # the office failed instead of hiding it.
            print(k, ': failed', repr(exc))
            continue
        dfs.append(pd.DataFrame({'office': k, 'list': cl}))
        print(k, ':', len(cl))
    if not dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=['office', 'list'])
    return pd.concat(dfs)

In [None]:
# Crawl every office's contract listing (network-bound); build_list prints
# one progress line per office while it runs.
cl = build_list()

rnpn : failed
indes : failed
isdemu : 0
fisdl-finet : 0
fosofamilia : 383
mh : 0
lnb : 286


In [10]:
# Persist the (office, contract link) table without the index column; a later
# cell reloads it from this file.
cl.to_csv('list.csv', index=False)


In [11]:
# Reload the saved link table from disk and show how many rows it holds.
cl = pd.read_csv('list.csv')
len(cl)

83130

In [14]:
# Scrape the detail page of the first 100 contract links.
contracts = []
failed_contracts = []  # (url, error) pairs for pages that could not be scraped
for i, row in cl[:100].iterrows():
    # Read from the iterated row: `i` is an index label, so cl.iloc[i] is only
    # the same row when the index is a default RangeIndex.
    try:
        contract = contract_detail(row['list'])
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C still interrupts
        # the loop; keep the reason instead of dropping the page silently.
        failed_contracts.append((row['list'], repr(exc)))
        continue
    contract['office'] = row['office']
    contracts.append(contract)
    print(i, contract['office'], end='\r')

if failed_contracts:
    print('\n%d of %d contract pages failed' % (
        len(failed_contracts), len(failed_contracts) + len(contracts)))

99 rnpn

In [17]:
# Write the scraped contract records to CSV. index=False is not passed here,
# so pandas also writes the row index as the first column (unlike the
# 'list.csv' and 'offices.csv' exports).
pd.DataFrame(contracts).to_csv('contracts.csv')