In [1]:
import requests

import pandas as pd
import numpy as np

import camelot
import json

from glob import glob

from datetime import datetime as dt

In [2]:
import matplotlib.pyplot as plt

# Notebook-wide default: every figure is drawn on a 10 x 10 canvas.
plt.rcParams.update({'figure.figsize': [10, 10]})

## 1. OSC con Clave Única de Inscripción

In [28]:
# Load the SIRFOSC registry (OSCs holding a CLUNI) into `df_osc_cluni`.
# A cached CSV export is reused when one exists; otherwise the report is
# downloaded from the SII portal, stored as raw text and as CSV, and then loaded
# through the same read path as a cached file so both routes yield the same frame.
print(str(dt.now())[:19], '- Loading OSCs with CLUNI (Unique Id)\n')

# check if file already exist in folder
sirfosc_files = glob('./data/SIRFOSC/csv/*.csv')

if len(sirfosc_files) > 0:
    # The 19 characters before ".csv" hold the export timestamp (YYYY-MM-DD-HH-MM-SS).
    # Keep the real path next to its timestamp instead of rebuilding the name
    # from a hard-coded prefix.
    stamp_by_path = {
        path: pd.to_datetime(path[-23:-4], format='%Y-%m-%d-%H-%M-%S')
        for path in sirfosc_files
    }
    file_name = max(stamp_by_path, key=stamp_by_path.get)
    most_recent_file = stamp_by_path[file_name]
    print(str(dt.now())[:19], f'- File already exists. Retrieving most recent ({str(most_recent_file)})...\n')

# if not, download from gov site
else:
    print(str(dt.now())[:19], '- Info is not locally available. Data will be downloaded from SII portal')
    with open("./params/sirfosc_query_filters.json", "r") as  params:
        jParams = json.load(params)

    # Query-string fields in the order the portal URL lists them; each URL key is
    # the lower-cased name of the corresponding filter in the params file.
    query_fields = [
        'CLUNI', 'NOMBRE', 'ACRONIMO', 'RFC', 'STATUS_OSC', 'STATUS_SANCION',
        'FIGURA_JURIDICA', 'ESTADO', 'MUNICIPIO', 'ASENTAMIENTO', 'CP',
        'REP_NOMBRE', 'REP_APATERNO', 'REP_AMATERNO', 'NUM_NOTARIA',
        'OBJETO_SOCIAL', 'RED',
    ]
    query = '&'.join(f"{field.lower()}={jParams[field]}" for field in query_fields)
    CLUNI_URL = f"http://www.sii.gob.mx/portal/organizaciones/excel/?{query}&advanced=1"

    # todo: do i really need all this data to talk to the server?
    headers = {
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }

    print(str(dt.now())[:19], '- Requesting data to server...')
    cluni_response = requests.get(CLUNI_URL, headers=headers)
    # Fail loudly on an HTTP error instead of caching an error page as data.
    cluni_response.raise_for_status()

    CLUNI_NOW = str(dt.now())[:19].replace(' ', '-').replace(':', '-')

    print(str(dt.now())[:19], '- Saving retreived data as text file')
    # Explicit encoding so the raw dump does not depend on the platform default.
    with open(f"./data/SIRFOSC/txt/report-rfosc-{CLUNI_NOW}.txt", "w", encoding="utf-8") as f:
        f.write(cluni_response.text)

    print(str(dt.now())[:19], '- Applying some cleaning processing')
    # The payload is split by hand: one record per line, fields separated by `","`.
    df_osc_cluni = pd.DataFrame([row.split('","') for row in cluni_response.text.split('\n')])
    last_col = df_osc_cluni.shape[1] - 1
    # Strip the quote left on the first field and the `"\r` left on the last one.
    df_osc_cluni[0] = df_osc_cluni[0].str.replace('"', '', regex=False)
    df_osc_cluni[last_col] = df_osc_cluni[last_col].str.replace('"\r', '', regex=False)
    # First record is the header; the final row is dropped as well.
    df_osc_cluni = df_osc_cluni.rename(columns=df_osc_cluni.iloc[0]).iloc[1:]
    df_osc_cluni = df_osc_cluni[:-1].reset_index(drop=True)

    print(str(dt.now())[:19], '- Writing request as csv in SIRFOSC folder\n')
    file_name = f"./data/SIRFOSC/csv/report-rfosc-{CLUNI_NOW}.csv"
    df_osc_cluni.to_csv(file_name)

# Single read path for both branches: the first CSV column is the index written
# by `to_csv`, so it is dropped after loading.
df_osc_cluni = pd.read_csv(file_name, low_memory=False)
df_osc_cluni = df_osc_cluni.drop(df_osc_cluni.columns[0], axis=1)
print(str(dt.now())[:19], '- Data loaded in dataframe')

2021-02-22 07:56:57 - Loading OSC with CLUNI (Unique Id)

2021-02-22 07:56:57 - Info is not locally available. Data will be downloaded from SII portal
2021-02-22 07:56:57 - Requesting data to server...
2021-02-22 07:58:12 - Applying some cleaning processing
2021-02-22 07:58:12 - Saving retreived data as text file
2021-02-22 07:58:14 - Writing request as csv in SIRFOSC folder

2021-02-22 07:58:14 - Data loaded in dataframe


In [30]:
# Preview the first five registry rows.
df_osc_cluni.head(n=5)

Unnamed: 0,CLUNI,RAZON SOCIAL,FIGURA JURIDICA,RFC,ESTATUS,ESTATUS DE SANCION,REPRESENTANTES LEGALES,ESTATUS DE LA REPRESENTACION,FECHA DE CONSTITUCION,FECHA DE INSCRIPCION,...,INFORME 2015 PRESENTADO,INFORME 2015 EN TIEMPO,INFORME 2016 PRESENTADO,INFORME 2016 EN TIEMPO,INFORME 2017 PRESENTADO,INFORME 2017 EN TIEMPO,INFORME 2018 PRESENTADO,INFORME 2018 EN TIEMPO,INFORME 2019 PRESENTADO,INFORME 2019 EN TIEMPO
0,VCO98091405018,Voluntades por Coahuila,Asociación Civil,VCO980914457,ACTIVA,,Rodrigo Montelongo Suárez,VENCIDA,1998-09-14,2005-05-18,...,NO,,NO,,NO,,NO,,NO,
1,UEF0303170101D,Unión Estatal Femenil FEPRA,Asociación Civil,UEF0303179P9,ACTIVA,,Adriana Díaz de León Valdivia,VIGENTE,2003-03-17,2005-02-22,...,NO,,NO,,NO,,NO,,NO,
2,FOJ04100219013,Federación de Organizaciones Juveniles,Asociación Civil,FOJ041002PB3,INACTIVA,,Jorge Alberto Arrambide Montemayor,VIGENTE,2004-10-02,2005-03-09,...,NO,,NO,,NO,,NO,,NO,
3,ARP95040609019,Aguilas Reales Pro Dignidad del Discapacitado,Asociación Civil,ARP950406IF8,ACTIVA,,"David Peral Manzo,David Peral Manzo",VENCIDA,1995-04-06,2005-04-21,...,NO,,NO,,NO,,NO,,NO,
4,FIM97120122015,Formación Integral de la Mujer,Institución de Asistencia Privada,FIM971201896,INACTIVA,,"María Teresa Duarte Belloc,María Teresa Rodríg...",VIGENTE,1997-09-18,2005-04-08,...,SI,SI,SI,SI,SI,SI,SI,SI,NO,


## 2. Donatarias Autorizadas (Directorio SAT)

In [41]:
# Load the SAT "Directorio de Donatarias Autorizadas" into `df_osc_donaut_sat`.
# The newest cached .xls is reused when present; otherwise the workbook is
# downloaded first. Both routes then go through one shared read/clean step.
print(str(dt.now())[:19], '- Loading OSCs from Directorio Donatarias Autorizadas (SAT)\n')

# check if file already exist in folder
ddonaut_files = glob('./data/DonatariasAutorizadas/*.xls')

if len(ddonaut_files) > 0:
    # The 19 characters before ".xls" hold the download timestamp. Selecting the
    # globbed path itself avoids rebuilding the name with a hard-coded year.
    stamp_by_path = {
        path: pd.to_datetime(path[-23:-4], format='%Y-%m-%d-%H-%M-%S')
        for path in ddonaut_files
    }
    file_name = max(stamp_by_path, key=stamp_by_path.get)
    most_recent_file = stamp_by_path[file_name]
    print(str(dt.now())[:19], f'- File already exists. Retrieving most recent ({str(most_recent_file)})...\n')

else:

    print(str(dt.now())[:19], '- Info is not locally available. Data will be downloaded from SAT portal')
    # todo: create params to download directorio donaut
    YEAR = '2020'
    DONAUT_NOW = str(dt.now())[:19].replace(' ', '-').replace(':', '-')
    SAT_URL = f'http://omawww.sat.gob.mx/documentossat/Documents/DirectorioDonatariasAutorizadas{YEAR}.xls'

    print(str(dt.now())[:19], '- Requesting data to server...')
    response = requests.get(SAT_URL)
    # Fail loudly on an HTTP error instead of saving an error page as an .xls file.
    response.raise_for_status()

    print(str(dt.now())[:19], '- Saving retreived data as XLS file\n')
    file_name = f"./data/DonatariasAutorizadas/report-sat-{YEAR}-{DONAUT_NOW}.xls"
    with open(file_name, "wb") as f:
        f.write(response.content)

# Skip the first 27 sheet rows and keep columns A..O; the first row that is read
# as data then carries the real column titles and is promoted to the header.
df_osc_donaut_sat = pd.read_excel(
    file_name
    , skiprows = range(0, 27)
    , usecols = "A:O"
    )
df_osc_donaut_sat = df_osc_donaut_sat.rename(columns=df_osc_donaut_sat.iloc[0]).iloc[1:].reset_index(drop=True)
print(str(dt.now())[:19], '- Data loaded in dataframe')

2021-02-22 08:15:22 - Loading OSCs from Directorio Donatarias Autorizadas (SAT)

2021-02-22 08:15:22 - File already exists. Retrieving most recent (2021-02-21 11:23:35)...

2021-02-22 08:15:23 - Data loaded in dataframe


In [42]:
# Preview the first five directory rows.
df_osc_donaut_sat.head(n=5)

Unnamed: 0,ENTIDAD FEDERATIVA,ADMINISTRACIÓN DESCONCENTRADA DE SERVICIOS AL CONTRIBUYENTE,ACTIVIDAD O FIN AUTORIZADO,RFC,DENOMINACIÓN O RAZÓN SOCIAL,DOMICILIO FISCAL,OFICIO DE AUTORIZACIÓN,FECHA DE OFICIO,OBJETO SOCIAL AUTORIZADO,REPRESENTANTE LEGAL,NÚMEROS TELEFÓNICOS,E-MAIL,DOMICILIO DE ESTABLECIMIENTO,NÚMEROS TELEFÓNICOS DEL ESTABLECIMIENTO,ACREDITAMIENTO
0,AGUASCALIENTES,"AGUASCALIENTES ""1""",M,AAG150226BX0,"Autismo Aguascalientes, A.C.","Canada Núm 301, Col. El Dorado 1a Sección, C.P...",700-02-01-2019-08990,2019-09-26 00:00:00,"""CUARTA.- La Asociación Autismo Aguascalientes...",Olga Rubio Camarena,No Manifestó,No Manifestó,No Manifestó,No Manifestó,Constancia de Inscripción ante el Registro Fed...
1,AGUASCALIENTES,"AGUASCALIENTES ""1""",A,AAG150226BX0,"Autismo Aguascalientes, A.C.","Canada Núm 301, Col. El Dorado 1a Sección, C.P...",700-02-01-2019-08990,2019-09-26 00:00:00,"""CUARTA.- La Asociación Autismo Aguascalientes...",Olga Rubio Camarena,No Manifestó,No Manifestó,No Manifestó,No Manifestó,Constancia de Inscripción ante el Registro Fed...
2,AGUASCALIENTES,"AGUASCALIENTES ""1""",A,AAP010713QV5,"Asilo de Ancianos de Pabellón de Arteaga, Agua...","Independencia Núm 48, Col. Francisco Villa, C....",700-02-01-2020-05788,2020-05-22 00:00:00,"""ARTICULO SEGUNDO.- La Asociación tendrá por o...",Ma del Rosario Bueno Martínez,01465 958 10 90,No Manifestó,No Manifestó,No Manifestó,oficio DGSEA/0914/2019 de fecha 18 de diciembr...
3,AGUASCALIENTES,"AGUASCALIENTES ""1""",A,AAP7601261F0,Asociación Aguascalentense para la Promoción I...,"Olivos Núm 202, Col. Jardines de la Cruz, C.P....",600-04-05-2012-57253,2012-06-26 00:00:00,"""ARTICULO PRIMERO: OBJETO SOCIAL: La asociació...",Luis Manuel Macías López,(01449) 970-62-92,No Manifestó,No Manifestó,No Manifestó,oficio número DEIP-GF-80 de 24 de enero de 201...
4,AGUASCALIENTES,"AGUASCALIENTES ""1""",M,ACD130327SG7,Aliadas Carmelitas Descalzas de la Santísima T...,"Salvador Quezada Limón Núm 1105, Col. Curtidor...",600-04-02-2014-6778,2014-08-26 00:00:00,"""ARTÍCULO SEGUNDO.- El objeto de la asociación...",Laura Patricia Colmenares Ramírez,449168395,alidesamparados@hotmail.com,No Manifestó,No Manifestó,oficio número DGSEA-009/2014 de 7 de enero de ...


## 3. Donatarias Autorizadas (Resolución Miscelánea Fiscal)

In [43]:
# Read the DOF parsing settings; the loaded mapping is indexed by year further down.
with open("./params/dof_page_breaks.json", "r") as params:
    jParams = json.loads(params.read())

In [44]:
# Top-level keys of the settings file, in file order (one entry per DOF year).
DOF_YEARS = [year_key for year_key in jParams]

In [45]:
def fix_multiple_cols_name(df):
    """Reduce a parsed table to exactly two columns, labelled 0 and 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are addressed by the integer labels 0..n-1.

    Returns
    -------
    pandas.DataFrame
        * exactly 2 columns: ``df`` itself, unchanged;
        * more than 2 columns: a new frame holding column 0 and, as column 1,
          the string values of columns 1..n-1 joined with single spaces;
        * fewer than 2 columns: an empty frame with columns ``[0, 1]``.

    The input frame is never modified.
    """
    ncols = df.shape[1]
    if ncols == 2:
        return df
    if ncols > 2:
        # names can generate multiple columns due to bad parsing
        name_cols = list(range(1, ncols))
        # Work on a copy so the merged text is not written into the caller's table.
        merged = df.copy()
        merged[1] = merged[name_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
        return merged[[0, 1]]
    # Nothing usable (0 or 1 columns): hand back an empty two-column frame.
    return pd.DataFrame(columns=[0, 1])

In [46]:
# Build `df_osc_donaut_dof`: one row per (RFC, razon_social) parsed from the
# DOF PDF of every year listed in DOF_YEARS, tagged with that year.
df_osc_donaut_dof = pd.DataFrame(columns=['RFC', 'razon_social', 'dof_year'])


def _read_dof_section(pdf_path, section):
    """Parse one configured section of a DOF PDF with camelot's stream flavor.

    pdf_path: path of the PDF to parse.
    section: mapping with 'pages', 'top-left' and 'bottom-right' entries; the two
        corner lists are concatenated and comma-joined into one table area.
    Returns the value of ``camelot.read_pdf`` (an indexable collection of tables).
    """
    table_area = ','.join(section['top-left'] + section['bottom-right'])
    return camelot.read_pdf(
        pdf_path
        , flavor='stream'
        , table_areas=[table_area]
        , pages=section['pages']
        )


for year in DOF_YEARS:

    print(str(dt.now())[:19], f'- Getting OSC data from "Diario Oficial de la Federacion ({year})"\n')
    DOF_YEAR = year
    parse_params = jParams[DOF_YEAR]['parse']
    # Parse the PDF that belongs to this year; previously every year re-read the
    # 2021 document with its own page ranges.
    # NOTE(review): assumes each year's file is named a14-rms-<year>.pdf - confirm.
    pdf_path = f'./data/A14-RMS/a14-rms-{DOF_YEAR}.pdf'

    print(str(dt.now())[:19], "- Create dataframe to store OSC's data")
    page_frames = []

    # parsing first page where osc start (truncated)
    first = parse_params['start']['pages']
    print(str(dt.now())[:19], f"- Parsing first page ({first}) where OSC's data begins")
    first_page = _read_dof_section(pdf_path, parse_params['start'])
    page_frames.append(fix_multiple_cols_name(first_page[0].df))

    # parsing full pages where osc continue
    full = parse_params['full']['pages']
    print(str(dt.now())[:19], f"- Parsing full pages ({full}) where OSC's data is present")
    full_pages = _read_dof_section(pdf_path, parse_params['full'])
    for n in range(len(full_pages)):
        page_frames.append(fix_multiple_cols_name(full_pages[n].df))

    # parsing last page where osc end (truncated)
    last = parse_params['end']['pages']
    print(str(dt.now())[:19], f"- Parsing last page ({last}) where OSC's data ends\n")
    last_page = _read_dof_section(pdf_path, parse_params['end'])
    page_frames.append(fix_multiple_cols_name(last_page[0].df))

    # One concat per year instead of growing the frame page by page.
    df_donaut_rms = pd.concat(page_frames)
    print(str(dt.now())[:19], '- Parse process completed')

    # some cleaning over here
    print(str(dt.now())[:19], '- Applying a some cleaning to the data')
    df_donaut_rms = df_donaut_rms.rename(columns={0: 'RFC', 1: 'razon_social'})
    # remove headers when osc changes their social objective in pdf
    is_header_row = (df_donaut_rms.RFC == 'RFC') & (df_donaut_rms.razon_social == 'Denominación Social')
    df_donaut_rms = df_donaut_rms[~is_header_row].reset_index(drop=True)
    # Keep rows whose RFC cell is empty or 12 characters long; the empty-RFC rows
    # are still needed below as the two halves of a split name.
    df_donaut_rms = df_donaut_rms[df_donaut_rms.RFC.str.len().isin([0, 12])].reset_index(drop=True)

    df_donaut_rms['razon_social'] = df_donaut_rms.razon_social.str.replace('\n', ' ')

    print(str(dt.now())[:19], '- Fixing splitted OSC names')
    # check if razon social was splitted in two, row merge needed
    # Step 1: count, per row, how many direct neighbours have an empty name.
    df_donaut_rms['merge_needed'] = (
        1*(df_donaut_rms.razon_social.shift(-1).str.len() == 0)
        + 1*(df_donaut_rms.razon_social.shift(1).str.len() == 0)
    )
    # Step 2: add the counts of the rows above and below; a total of exactly 2
    # marks a row whose name is rebuilt from its two neighbours. The first and
    # last rows get NaN from the shifts and are therefore never selected.
    df_donaut_rms['merge_needed'] = (df_donaut_rms.merge_needed.shift(-1) + df_donaut_rms.merge_needed.shift(1))
    for i in df_donaut_rms[df_donaut_rms.merge_needed == 2].index:
        df_donaut_rms.loc[i, 'razon_social'] = df_donaut_rms.loc[i-1, 'razon_social'] + ' ' + df_donaut_rms.loc[i+1, 'razon_social']

    print(str(dt.now())[:19], '- Remove bad RFCs')
    # Keep RFCs that are 12 characters long, upper-case and not purely alphabetic.
    df_donaut_rms = df_donaut_rms[df_donaut_rms.RFC.str.len() == 12]
    df_donaut_rms = df_donaut_rms[df_donaut_rms.RFC.str.isupper()]
    df_donaut_rms = df_donaut_rms[~df_donaut_rms.RFC.str.isalpha()].reset_index(drop=True)
    df_donaut_rms = df_donaut_rms.drop_duplicates("RFC").reset_index(drop=True)
    df_donaut_rms = df_donaut_rms.drop(columns=['merge_needed'])

    df_donaut_rms['dof_year'] = DOF_YEAR

    print(str(dt.now())[:19], f'- Total unique OSCs retrived in {year}:', str(df_donaut_rms.shape[0]), '\n')

    # Appended inside the loop so years already parsed survive a failure in a later one.
    df_osc_donaut_dof = pd.concat([df_osc_donaut_dof, df_donaut_rms])
    

2021-02-22 08:17:27 - Getting OSC data from "Diario Oficial de la Federacion (2021)"

2021-02-22 08:17:27 - Create dataframe to store OSC's data
2021-02-22 08:17:27 - Parsing first page (3) where OSC's data begins
2021-02-22 08:17:27 - Parsing full pages (4-299) where OSC's data is present
2021-02-22 08:19:07 - Parsing last page (300) where OSC's data ends

2021-02-22 08:19:07 - Parse process completed
2021-02-22 08:19:07 - Applying a some cleaning to the data
2021-02-22 08:19:07 - Fixing splitted OSC names
2021-02-22 08:19:07 - Remove bad RFCs
2021-02-22 08:19:07 - Total unique OSCs retrived in 2021: 9556 

2021-02-22 08:19:07 - Getting OSC data from "Diario Oficial de la Federacion (2020)"

2021-02-22 08:19:07 - Create dataframe to store OSC's data
2021-02-22 08:19:07 - Parsing first page (112) where OSC's data begins
2021-02-22 08:19:07 - Parsing full pages (113-401) where OSC's data is present
2021-02-22 08:20:55 - Parsing last page (402) where OSC's data ends

2021-02-22 08:20:55 

In [None]:
# accuracies = []
# whitespaces = []

# accuracies += [full_pages[n].parsing_report['accuracy']]
# whitespaces += [full_pages[n].parsing_report['whitespace']]

# print('Average parsed accuracy', np.mean(accuracies).round(2))
# print('Average parsed whitespace', np.mean(whitespaces).round(2))

In [47]:
# Preview the first five parsed DOF rows.
df_osc_donaut_dof.head(n=5)

Unnamed: 0,RFC,razon_social,dof_year
0,APO151201EH2,"""Ameyalco Posible"", A.C.",2021
1,AAY110609KW0,"""ANEXA AYUDA"", A.C.",2021
2,DAC0505167G4,"""Destino y Apoyo a las Comunidades por los Ni...",2021
3,FSE110323T5A,"""FUNDACIÓN PARA LA SUPERACIÓN EDUCATIVA Y SOCI...",2021
4,ODE120423K39,"""ORDEN Y DESARROLLO EN CIUDAD NEZAHUALCOYOTL"",...",2021
