# Procesamiento del Estándar de Datos para las Contrataciones Abiertas

**Fecha:** 26/02/2023

* [Página "Estándar de Datos para las Contrataciones Abiertas EDCA - APF"](https://www.gob.mx/compranet/documentos/estandar-de-datos-para-las-contrataciones-abiertas-edca?fbclid=IwAR0NMR4iDF7s9qxsUwa32M3YaAksDlb4sVmmS_ouQXx1b2ivx7ayX4bXsSY)
* [Archivo contratacionesabiertas_bulk_paquetes_json.zip](https://compranetinfo.hacienda.gob.mx/dabiertos/NOVIEMBRE/contratacionesabiertas_bulk_paquetes_json.zip)

**Notas**
  * Descargar el archivo `contratacionesabiertas_bulk_paquetes_json.zip` y extraerlo en la carpeta `contratacionesabiertas_bulk_paquetes_json`
  * Adicionalmente se deben crear las carpetas
    * `processed`
    * `csv`
      * A su vez, esta debe contener una carpeta llamada `zip`
  * Información relevante:
    * nombres de quienes ganaron, separados por coma - OK
    * ofertas económicas de participantes - **No Identificado**
    * códigos postales - OK

    * buyers
    	- toda la info de la dirección en un solo campo, excepto el CP
    	- código postal en otro

    * procuringEntity
    	- toda la info de la dirección en un solo campo, excepto el CP
    	- código postal en otro

    * parties tenderer & supplier
    	- toda la info de la dirección en un solo campo, excepto el CP
    	- código postal en otro
  
    * publishedDate - OK
    * publisher.name - OK

## 1.1 Librerias

In [None]:
import os
import json
import pandas as pd
import itertools
from tqdm.notebook import tqdm

In [None]:
# Collect the names of the JSON files found in the working folder
path_to_json = 'contratacionesabiertas_bulk_paquetes_json/'
json_files = [
    file_name
    for file_name in os.listdir(path_to_json)
    if file_name.endswith('.json')
]

In [None]:
# Output folders: processed JSON files, CSV files, and the 'zip' subfolder
# that lives inside the CSV folder.
path_to_processed_json = 'processed/'
path_to_processed_csv = 'csv/'
path_to_processed_zip = 'csv/zip/'

# Make sure every output folder exists, creating the missing ones and
# reporting which of the two cases applied.
for output_dir in (
    path_to_processed_json,
    path_to_processed_csv,
    path_to_processed_zip,
):
    if os.path.exists(output_dir):
        print(f"El directorio \"{output_dir}\" ya existe")
    else:
        os.makedirs(output_dir)
        print(f"Se creo el directorio: {output_dir}")

## 1.2 Procesamiento

En esta sección vamos a procesar, uno por uno, los archivos JSON de los datos.


In [None]:
def _format_party(party: dict) -> dict:
    """
    Return a copy of *party* with its address split into two flat fields.

    Every key other than ``address`` is copied unchanged. The ``address``
    entry is replaced by ``address_string`` (the address values other than
    ``postalCode`` joined with ", ") and ``address_postal_code``. Only the
    postal code is 'NA' when the address has no ``postalCode``; both fields
    are 'NA' when the party has no address at all.
    """
    formatted = {}

    for key, value in party.items():
        if key != 'address':
            formatted[key] = value
        elif 'postalCode' in value:
            formatted['address_string'] = ', '.join(
                part for field, part in value.items() if field != 'postalCode'
            )
            formatted['address_postal_code'] = value['postalCode']
        else:
            formatted['address_postal_code'] = 'NA'
            formatted['address_string'] = ', '.join(value.values())

    # The loop above only reaches the address handling when the key exists,
    # so the placeholders for address-less parties are filled in here.
    if 'address' not in party:
        formatted['address_postal_code'] = 'NA'
        formatted['address_string'] = 'NA'

    return formatted


def filter_info(contract: dict) -> dict:
    """
    Extract the relevant fields from one EDCA record.

    Only the first element of ``contract['releases']`` is read.

    Parameters
    ----------
    contract:
        One record of a package. It must contain ``publishedDate``,
        ``publisher.name`` and ``releases[0].parties``, and every party must
        have a ``roles`` entry.

    Returns
    -------
    dict
        Keys, in this order:

        * ``published_date``, ``publisher_name``
        * ``awards_names``: supplier names of all awards joined with ", "
        * ``awards_ids``: award ids joined with ", "
        * ``tender_ids``: id of the tender
        * ``parties_buyers``, ``parties_procuring_entities``,
          ``parties_tenderers_suppliers``: one formatted dict per party with
          the matching role (see ``_format_party``)
        * ``parties_tenderers_suppliers_postal_codes``: postal code of each
          tenderer/supplier, 'NA' when absent
        * ``tenderer_supplier_number``: how many tenderers/suppliers
        * ``tenderer_supplier_names``: their names joined with ", "

        The three award/tender fields fall back to 'NA' when the data they
        need is missing or malformed.

    Raises
    ------
    KeyError, IndexError
        If one of the mandatory entries listed under *contract* is missing.
    """
    extraction = {}

    # Publication data
    extraction['published_date'] = contract['publishedDate']
    extraction['publisher_name'] = contract['publisher']['name']

    release = contract['releases'][0]

    # ------------- Award winners ------------- #
    # Names of every supplier of every award, as one comma-separated string
    try:
        suppliers = itertools.chain.from_iterable(
            award['suppliers'] for award in release['awards']
        )
        extraction['awards_names'] = ', '.join(
            supplier['name'] for supplier in suppliers
        )
    except (KeyError, TypeError):
        extraction['awards_names'] = 'NA'

    # Ids of the awards, as one comma-separated string
    try:
        extraction['awards_ids'] = ', '.join(
            award['id'] for award in release['awards']
        )
    except (KeyError, TypeError):
        extraction['awards_ids'] = 'NA'

    # ------------- Tender id ------------- #
    try:
        extraction['tender_ids'] = release['tender']['id']
    except (KeyError, TypeError):
        extraction['tender_ids'] = 'NA'

    # ---------- Parties ----------- #
    parties = release['parties']

    # Split the parties by role; a party with several roles can show up in
    # more than one group.
    buyers = [p for p in parties if 'buyer' in p['roles']]
    procuring_entity = [p for p in parties if 'procuringEntity' in p['roles']]
    tenderer_supplier = [
        p for p in parties
        if 'tenderer' in p['roles'] or 'supplier' in p['roles']
    ]

    # Exactly one formatted entry per party
    extraction['parties_buyers'] = [_format_party(p) for p in buyers]
    extraction['parties_procuring_entities'] = [
        _format_party(p) for p in procuring_entity
    ]
    extraction['parties_tenderers_suppliers'] = [
        _format_party(p) for p in tenderer_supplier
    ]

    # ---------- Postal codes of tenderers and suppliers ----------- #
    extraction['parties_tenderers_suppliers_postal_codes'] = [
        p.get('address', {}).get('postalCode', 'NA')
        for p in tenderer_supplier
    ]

    # ---------- Number and names of tenderers and suppliers ----------- #
    extraction['tenderer_supplier_number'] = len(tenderer_supplier)
    extraction['tenderer_supplier_names'] = ', '.join(
        p.get('name', 'NA') for p in tenderer_supplier
    )

    return extraction

In [None]:
# funcion para desanidad la estructura de un json
from typing import Union

def flatten_json(y: dict):
    """
    Flatten nested dictionaries and lists into a single-level dictionary.

    Each key of the result is the path to a leaf value joined with
    underscores; list elements contribute their position (0, 1, ...) to the
    path. For example ``{'a': {'b': 1}, 'c': [5, 6]}`` becomes
    ``{'a_b': 1, 'c_0': 5, 'c_1': 6}``.

    Empty dictionaries and lists contribute no keys. Subclasses of ``dict``
    and ``list`` are flattened as well, and non-string keys are converted
    with ``str``. If two paths produce the same joined key, the value
    visited last is kept.
    """
    unnested = {}

    def flatten(x: Union[dict, list], name: str = ''):
        if isinstance(x, dict):
            # Descend into every key-value pair, extending the path
            for key, value in x.items():
                flatten(value, f'{name}{key}_')
        elif isinstance(x, list):
            # Descend into every element, using its position as path part
            for position, item in enumerate(x):
                flatten(item, f'{name}{position}_')
        else:
            # Leaf value: drop the trailing underscore of the path
            unnested[name[:-1]] = x

    flatten(y)

    return unnested

### Extracción recursiva

Aquí se ejecutan los siguientes pasos:

* A. Se procesa la información para extraer los datos relevantes.
* B. El resultado se guarda en un archivo JSON.
* C. Se procesa la información de cada JSON para convertirlo a CSV (este proceso es largo).

## Pasos A y B

In [None]:
for json_name_test in json_files:

    # Load the raw package as Python objects. The encoding is stated
    # explicitly so the result does not depend on the platform's locale;
    # this assumes the downloaded packages are UTF-8, the standard JSON
    # interchange encoding.
    with open(os.path.join(path_to_json, json_name_test), encoding='utf-8') as json_file:
        data = json.load(json_file)

    print(f"Archivo {json_name_test} :", len(data), "registros")

    # Keep only the relevant fields of every record and flatten them
    data_flatten = [flatten_json(filter_info(x)) for x in data]

    # Save the result in the processed folder under the original file
    # name prefixed with 'ext_'
    saving_path = os.path.join(path_to_processed_json, 'ext_' + json_name_test)

    with open(saving_path, "w", encoding='utf-8') as final:
        json.dump(data_flatten, final)


## Paso C

In [None]:
# Names of the already-processed JSON files in the processed folder
json_files_processed = [
    file_name
    for file_name in os.listdir(path_to_processed_json)
    if file_name.endswith('.json')
]

In [None]:
for json_name_test in json_files_processed:

    # Load the content of one processed JSON file
    processed_path = os.path.join(path_to_processed_json, json_name_test)
    with open(processed_path) as json_file:
        data = json.load(json_file)

    print(f"Subiendo archivo {json_name_test} :", len(data), "registros")

    # NOTE(review): `db` is not defined anywhere in the visible part of this
    # notebook; it must be created before this cell can run. Presumably a
    # database collection exposing `insert_many` -- confirm.
    db.insert_many(data)

In [None]:
# Read only the first rows of 'ecda_processed_26022023.csv' as a sample
sample_size = 10
df_sample = pd.read_csv(
    'ecda_processed_26022023.csv',
    nrows=sample_size,
    low_memory=False,
)

In [None]:
# Show the sample rows as the cell output
df_sample

In [None]:
# Column names of the sample, kept as a plain list
cols_main = list(df_sample.columns)

In [None]:
# Show the column names as the cell output
cols_main

In [None]:
# Names of the already-processed JSON files in the processed folder
json_files_processed = [
    file_name
    for file_name in os.listdir(path_to_processed_json)
    if file_name.endswith('.json')
]

In [None]:
# Name of the first processed file without its last five characters,
# i.e. without the '.json' extension every listed file ends with
json_files_processed[0][:-5]

In [None]:
from os import path

# ------------- recursive processing

for json_name_test in tqdm(json_files_processed):

    # File name without its '.json' extension
    base_name = json_name_test[:-5]

    # A file is skipped when a zip with its name exists under ./csv/zip/.
    # NOTE(review): nothing in the visible code creates those zip files;
    # presumably the CSVs are zipped by hand -- confirm.
    # Disabled alternative check (CSV exists, or one specific package):
    #if (path.exists('./csv/'+json_name_test[:-5]+'.csv')) or (json_name_test == 'ext_contratacionesabiertas_bulk_paquete5.json'):
    if path.exists('./csv/zip/' + base_name + '.zip'):
        print(f"File {json_name_test} already processed --- Skipping")
        continue

    # Load the content of the processed JSON file
    with open(os.path.join(path_to_processed_json, json_name_test)) as json_file:
        data = json.load(json_file)

    print(f"Subiendo archivo {json_name_test} :", len(data), "registros")

    temp_data = pd.DataFrame(data)

    # Disabled: give every file the same columns as `cols_main`
    # for x in cols_main:
    #     if x not in temp_data.columns:
    #         temp_data[x] = ''
    # temp_data = temp_data[cols_main]

    temp_data.to_csv('./csv/' + base_name + '.csv')