# Procesamiento del Estándar de Datos para las Contrataciones Abiertas

**Fecha:** 26/02/2023

* [Página "Estándar de Datos para las Contrataciones Abiertas EDCA - APF"](https://www.gob.mx/compranet/documentos/estandar-de-datos-para-las-contrataciones-abiertas-edca?fbclid=IwAR0NMR4iDF7s9qxsUwa32M3YaAksDlb4sVmmS_ouQXx1b2ivx7ayX4bXsSY)
* [Archivo contratacionesabiertas_bulk_paquetes_json.zip](https://compranetinfo.hacienda.gob.mx/dabiertos/NOVIEMBRE/contratacionesabiertas_bulk_paquetes_json.zip)

**Notas**
  * Descargar el archivo `contratacionesabiertas_bulk_paquetes_json.zip` y extraerlo en la carperta `contratacionesabiertas_bulk_paquetes_json`
  * Información relevante:
    * nombres de quienes ganaron, separados por comma - OK
    * ofertas economicas de participantes - **No Identificado**
    * arreglo de los cucops (?)
    * numeros postales - OK

    * buyers
    	- toda la info en un solo arreglo el CP
    	- codigo postal en otro

    * procuryingEntity
    	- toda la info en un solo arreglo el CP
    	- codigo postal en otro

    * parties tenderer & supplier
    	- toda la info en un solo arreglo el CP
    	- codigo postal en otro
  
    * publishedDate - OK
    * publisher.name - OK

## 1.1 Librerias

In [1]:
import os
import json
import pandas as pd
import itertools

In [2]:
# Nombre de los archivos json en la carpeta de trabajo
path_to_json = 'contratacionesabiertas_bulk_paquetes_json/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

In [17]:
# Revisa que exista la carpeta donde se ponen los json procesados
path_to_processed_json = 'processed/'
is_exist = os.path.exists(path_to_processed_json)

if not is_exist:

   # Crea el directorio si no existe
   os.makedirs(path_to_processed_json)
   print(f"Se creo el directorio: {path_to_processed_json}")
else:
   print(f"El directorio \"{path_to_processed_json}\" ya existe")

El directorio "processed/" ya existe


## 1.2 Procesamiento

In [32]:
json_name_test = "contratacionesabiertas_bulk_paquete1.json" #json_files[0]

# Lee el contenido del archivo json como diccionatio
with open(os.path.join(path_to_json, json_name_test)) as json_file:

    data = json.load(json_file)

    print(f"Archivo {json_name_test} :",len(data), "registros")

Archivo contratacionesabiertas_bulk_paquete1.json : 25000 registros


In [4]:
data[0]

{'_id': {'$oid': '61628f21f82fe09cd22d528c'},
 'extensions': ['https://raw.githubusercontent.com/CompraNet/ocds_schemeUrl_extension/master/extension.json',
  'https://raw.githubusercontent.com/compranet-dev/ocds_frameworkAgreement_extension/main/extension.json'],
 'license': 'https://datos.gob.mx/libreusomx',
 'publicationPolicy': 'https://compranetinfo.hacienda.gob.mx/descargas/politica-publicacion-EDCA-MX.pdf',
 'publishedDate': '2021-10-09T22:29:03Z',
 'publisher': {'name': 'SECRETARÍA DE HACIENDA Y CRÉDITO PÚBLICO',
  'uid': '6716',
  'uri': 'http://www.gob.mx/shcp'},
 'releases': [{'awards': [{'contractPeriod': {'endDate': '2019-06-30T23:59:00Z',
      'startDate': '2019-03-19T00:00:00Z'},
     'description': 'SERVICIOS MEDICOS SUBROGADOS DE ESPECIALIZACION (ESTEREOLITOGRAFIAS',
     'id': '2038777',
     'items': [{'classification': {'description': 'Servicio de estudios clinicos',
        'id': '33900008',
        'scheme': 'CUCOP',
        'uri': 'https://compranetinfo.hacienda.

In [40]:
def filter_info(contract: dict) -> dict:
    """
    Auxiliar para filtrar información relevante de ECDA
    """

    extraction = {}

    # extraction['published_date'] = pd.to_datetime(contract['publishedDate'])
    extraction['publisher_name'] = contract['publisher']['name']

    # ------------- Datos de los ganadores ------------- #
    ### Nombres
    try:
        awards = contract['releases'][0]['awards']

        # Extraemos datos de suppliers que fueron awards
        suppliers_aux_names = []

        for j in range(len(awards)):
            suppliers_aux_names.append(awards[j]['suppliers'])

        # Aplana las lista de supliers en una sola
        suppliers_list = list(itertools.chain(*suppliers_aux_names))

        # extract los nombres de los suppliers añadiendo comas
        extraction['awards_names'] = ', '.join(
            [ supplier['name'] for supplier in suppliers_list]
            )
    except:
        extraction['awards_names'] = 'NA'

    ### awards_id
    try:
        awards = contract['releases'][0]['awards']

        # Extraemos datos de suppliers que fueron awards
        awards_ids = []

        for j in range(len(awards)):
            awards_ids.append(awards[j]['id'])

        # extract los nombres de los suppliers añadiendo comas
        extraction['awards_ids'] = ', '.join(awards_ids)
    except:
        extraction['awards_ids'] = 'NA'


    # ------------- Datos de ids de los tenders ------------- #
    ## tenders_ids
    try:
        tenders = contract['releases'][0]['tender']

        # extract los nombres de los suppliers añadiendo comas
        extraction['tender_ids'] = tenders['id']
    except:
        extraction['tender_ids'] = 'NA'


    # ---------- Datos de los parties  ----------- #
    parties = contract['releases'][0]['parties']
    len_parties = len(parties)

    # datos de buyer, procuringEntity, tenderer & supplier
    buyers = [ x for x in parties if ('buyer' in x['roles']) ]
    procuring_entity = [ x for x in parties if ('procuringEntity' in x['roles']) ]
    tenderer_supplier = [ x for x in parties if (('tenderer' in x['roles']) or ('supplier' in x['roles']) ) ]

    ##---  Reformateando los datos para extraer codigo postal y 
    ##--- concatenar lo demas del address

    buyers_formated= []
    procuring_entity_formated = []
    tenderer_supplier_formated = []

    #### Buyers
    for buyer in buyers:

        buyer_mod = {}

        for x in buyer.keys():

            if x != 'address':
                buyer_mod[x] = buyer[x]
            else:
                if 'address' in buyer.keys():
                    if 'postalCode' not in buyer['address']:
                        buyer_mod['address_postal_code'] = 'NA'
                        buyer_mod['address_string'] = ', '.join(list(buyer['address'].values()))
                    else:
                        addres_info = [ buyer['address'][x] for x in buyer['address'].keys() if x != 'postalCode' ]
                        buyer_mod['address_string'] = ', '.join(addres_info)
                        buyer_mod['address_postal_code'] = buyer['address']['postalCode']
                else:
                    buyer_mod['address_postal_code'] = 'NA'
                    buyer_mod['address_string'] = 'NA' 
        
            buyers_formated.append(buyer_mod)

    extraction['parties_buyers'] = buyers_formated

    #### procuringEntity
    for entity in procuring_entity:

        entity_mod = {}

        for x in entity.keys():

            if x != 'address':
                entity_mod[x] = entity[x]
            else:
                if 'address' in entity.keys():
                    if 'postalCode' not in entity['address']:
                        entity_mod['address_postal_code'] = 'NA'
                        entity_mod['address_string'] = ', '.join(list(entity['address'].values()))
                    else:
                        addres_info = [ entity['address'][x] for x in entity['address'].keys() if x != 'postalCode' ]
                        entity_mod['address_string'] = ', '.join(addres_info)
                        entity_mod['address_postal_code'] = entity['address']['postalCode']
                else:
                    entity_mod['address_postal_code'] = 'NA'
                    entity_mod['address_string'] = 'NA'
        
            procuring_entity_formated.append(entity_mod)

    extraction['parties_procuring_entities'] = procuring_entity_formated

    #### tenderer & suppliers 
    for tender in tenderer_supplier:

        tender_mod = {}

        for x in tender.keys():

            if x != 'address':
                tender_mod[x] = tender[x]
            else:
                if 'address' in tender.keys():
                    if 'postalCode' not in tender['address']:
                        tender_mod['address_postal_code'] = 'NA'
                        tender_mod['address_string'] = ', '.join(list(tender['address'].values()))
                    else:
                        addres_info = [ tender['address'][x] for x in tender['address'].keys() if x != 'postalCode' ]
                        tender_mod['address_string'] = ', '.join(addres_info)
                        tender_mod['address_postal_code'] = tender['address']['postalCode']
                else:
                    tender_mod['address_postal_code'] = 'NA'
                    tender_mod['address_string'] = 'NA'

        
            tenderer_supplier_formated.append(tender_mod)

    extraction['parties_tenderers_suppliers'] = tenderer_supplier_formated

    # ---------- Extrae codigos postales de ganadores ----------- #
    tenderers_suppliers_postal_code = []
    
    for x in tenderer_supplier:
        if 'address' in x.keys():
            if 'postalCode' in x['address'].keys():
                tenderers_suppliers_postal_code.append(x['address']['postalCode'])
            else:
                tenderers_suppliers_postal_code.append('NA')
        else:
            tenderers_suppliers_postal_code.append('NA')

    extraction['parties_tenderers_suppliers_postal_codes'] = tenderers_suppliers_postal_code

    # ---------- Extrae numeros y nombres de los tenders----------- #
    tenderers_suppliers_names= []
    
    for x in tenderer_supplier:
        if 'name' in x.keys():
            tenderers_suppliers_names.append(x['name'])
        else:
            tenderers_suppliers_names.append('NA')

    
    extraction['tenderer_supplier_number'] = len(tenderer_supplier)
    extraction['tenderer_supplier_names'] = ', '.join(tenderers_suppliers_names)

    return extraction

In [41]:
filter_info(data[0])

{'publisher_name': 'SECRETARÍA DE HACIENDA Y CRÉDITO PÚBLICO',
 'awards_names': 'APLICACIONES INDUSTRIALES DE CALIDAD, S.A. DE C.V.',
 'awards_ids': '2038777',
 'tender_ids': '1892834',
 'parties_buyers': [{'address_string': 'MX, Iztapalapa, Ciudad de México, Calzada Ignacio Zaragoza No 1711 Col. Ejercito Constitucionalista',
   'address_postal_code': '09220',
   'contactPoint': {'email': 'dulce.violante@issste.gob.mx',
    'name': 'Dulce Idania Violante Medina',
    'telephone': '57165268'},
   'id': 'ISSSTE-181',
   'identifier': {'id': 'ISS6001015A3-051GYN085',
    'legalName': 'ISSSTE-Hospital Regional Gral. Ignacio Zaragoza, Coordinación de Recursos Materiales #051GYN085',
    'scheme': 'MX-RFC',
    'uri': 'https://portalsat.plataforma.sat.gob.mx/ConsultaRFC'},
   'name': 'Instituto de Seguridad y Servicios Sociales de los Trabajadores del Estado',
   'roles': ['buyer']},
  {'address_string': 'MX, Iztapalapa, Ciudad de México, Calzada Ignacio Zaragoza No 1711 Col. Ejercito Consti

In [7]:
#data[12]['releases'][0]['parties']

In [8]:
# funcion para desanidad la estructura de un json
from typing import Union

def flatten_json(y: dict):
	"""
	Funcion que desanida estructuras de diccionario y lista.
	La funcion desanidara por niveles numerando las variables
	segun la profundida del anidamiento con un entero y guion bajo
	"""
	unnested = {}

	def flatten(x: Union[dict, list], name:str =''):

		# Desanida recursivamente pares llave-valor
		# de diccionarios
		if type(x) is dict:

			for a in x:
				flatten(x[a], name + a + '_')
	
		# Desanida recursivamente pares llave-valor
		# presentes en list
		elif type(x) is list:

			i = 0

			for a in x:
				flatten(a, name + str(i) + '_')
				i += 1
		else:
			unnested[name[:-1]] = x

	flatten(y)

	return unnested

In [9]:
data_flatten = [flatten_json(filter_info(x)) for x in data]

In [11]:
with open("mydata.json", "w") as final:
   json.dump(data_flatten, final)

In [66]:
import pymongo

In [67]:
CONN_STRING = "mongodb://127.0.0.1:27017/?readPreference=primary&serverSelectionTimeoutMS=2000&directConnection=true&ssl=false"

In [68]:
myclient = pymongo.MongoClient(CONN_STRING)

In [69]:
db = myclient["edca"]["processed"]

In [70]:
db.insert_many(data_flatten)

<pymongo.results.InsertManyResult at 0x7f7b03a89400>