# Merge the Mexican Federal Budget files

In [1]:
from sys import stdout

## Data ingestion

Ingest all columns as raw strings.

In [2]:
def read_columns(file):
    with open(file, encoding='iso-8859-1') as csv:
        header = csv.readline()
        return header.replace('\n', '').split(',')

In [3]:
def force_strings(columns):
    for column in columns:
        yield column, str

In [4]:
import cchardet as chardet

def detect_encoding(years, file_format):
    for year in years:
        filename = file_format % year

        with open(filename, 'rb') as f:
            text = f.read()
            
        result = chardet.detect(text)
        print('Inspecting', filename, result)

In [5]:
from pandas import read_csv

def load_csv_files(years, file_format, encoding_):
    df = {}
    
    for year in years:
        filename = file_format % year
        columns = read_columns(filename)
        types = dict(force_strings(columns))
        print('Loading', filename, 'with encoding:', encoding_)
        stdout.flush()
        
        df[year] = read_csv(filename, encoding=encoding_, dtype=types)#, verbose=True)
    
    return df

## Cleaning

In [6]:
from sys import stdout
from numpy import nan

def strip_blanks(batch):
    for year in sorted(batch.keys()):
        for column in batch[year].columns:
            batch[year].rename(columns={column: column.strip()}, inplace=True)
            batch[year][column].apply(lambda x: x.strip() if x is not nan else x)
        print(year, 'stripped padding')
        stdout.flush()

In [7]:
def delete_empty_columns(batch):
    for year in batch.keys():
        for column in batch[year].columns:
            if 'Unnamed:' in column:
                try:
                    del batch[year][column]
                    print(year, column, 'deleted')
                    stdout.flush()
                except KeyError:
                    pass  

In [8]:
from numpy import count_nonzero
from numpy import nan

def count_empty_values(batch):
    table = []

    for column in get_union_of_columns(batch):
        row = {'Column': column}
        
        for year in batch.keys():
            if column in batch[year].columns:
                nb_empty_cells = batch[year][column].apply(lambda x: 1 if x is nan else 0).sum()
            else:
                nb_empty_cells = nan
                
            row.update({year: nb_empty_cells})

        table.append(row)
        
    ordered_columns = ['Column']
    ordered_columns.extend(sorted(batch.keys()))
    
    return DataFrame(table).reindex_axis(ordered_columns, axis=1)

## Mapping

In [9]:
def get_union_of_columns(batch):
    union = set()
    for year in batch.keys():
        union = union | set(batch[year].columns)
    return union

In [10]:
from yaml import load

def load_aliases(file):
    with open(file) as yaml:
        aliases = load(yaml.read())
        return aliases

In [11]:
def alias_columns(batch, list_of_aliases):
    for year in sorted(batch.keys()):
        for column in sorted(batch[year].columns):
            if not column in list_of_aliases:
                for reference, aliases in list_of_aliases.items():
                    if aliases:
                        if column in aliases:
                            batch[year].rename(columns={column: reference}, inplace=True)
                            print(year, column, 'replaced with', reference)
                            stdout.flush()
                            break  
                else:
                    print(year, 'NO ALIAS: ', column)
                    stdout.flush()
        print('\nDone mapping', year, '\n')
        stdout.flush()

In [12]:
from pandas import DataFrame

def build_overview(batch):
    table = []
    
    for column in get_union_of_columns(batch):
        row = {'Column': column}
        for year in batch.keys():
            row.update({year: column in batch[year].columns})
        table.append(row)
        
    ordered_columns = ['Column']
    ordered_columns.extend(sorted(batch.keys()))
    
    overview = DataFrame(table).reindex_axis(ordered_columns, axis=1)
    return overview

## Parse amount columns

There's a little cleaning to do on the amount columns (zeros are represented by a dash and there is some whitespace padding). I might as well take care of this here to make sure I don't run into parsing problems later).

In [13]:
from numpy import nan

amounts = ['Ejercido', 'Devengado', 'Aprobado', 'Pagado', 'Modificado', 'Adefas', 'Ejercicio']

def clean_amount_columns(batch):
    for year, df in sorted(batch.items()):
        for amount in amounts:
            try:
                # Weird behaviour from pandas: although I coerced columns to strings, 
                # there are some nan values which pandas considers as floats.
                series = batch[year][amount].apply(lambda x: '' if x is nan else x)
                series = series.apply(lambda x: x.strip())
                # '-' seem to represent a zero, judging from the value counts
                series = series.apply(lambda x: '0' if x == '-' else x)
                series = series.apply(lambda x: x.replace(',', ''))
                batch[year][amount] = series
                print(year, amount, 'cleaned numerical column')
            except KeyError:
                pass

In [23]:
from pandas import concat

def do_pipeline(output_file):
    print('\nLoading files...\n')
    
    # Load Cuenta Publica files
    cp_years = range(2010, 2016)
    cp_file_format = 'Cuenta_Publica_%s.csv'
    cp = load_csv_files(cp_years, cp_file_format, 'windows-1252')
    
    # Load PEF 2016
    pef_years = [2016]
    pef_file_format = 'PEF%s_AC01.csv'
    pef = load_csv_files(pef_years, pef_file_format, 'cp850')
    
    # Assemble the two datasets in one dictionary
    datasets = cp
    datasets.update(pef)
    
    print('\nDelete empty columns...\n')
    delete_empty_columns(datasets)

    print('\nStrip padding from all cells...\n')
    strip_blanks(datasets)
    
    print('\nMapping columns...\n')
    aliases = load_aliases('mexican_federal_budget_column_nomenclature.yaml')
    alias_columns(datasets, aliases)

    print('\nCounting missing values...\n')
    empty_values_table = count_empty_values(datasets)
    
    print('\nBuilding column overview...\n')
    overview_table = build_overview(datasets)
    
    print('\nCleaning numerical columns...\n')
    clean_amount_columns(datasets)
    
    print('\nMerging datasets...\n')
    merged_dataset = concat(list(datasets.values()))
    
    print('\nSaving merged datasets to', output_file, '\n')    
#     merged_dataset.to_csv(output_file, encoding='utf-8', index=False)
    
    empty_counts_output_file = output_file.replace('csv', 'missing.csv')
    print('\nSaving missing values overview table to', empty_counts_output_file, '\n')    
    empty_values_table.to_csv(empty_counts_output_file, encoding='utf-8', index=False)

    overview_output_file = output_file.replace('csv', 'columns.csv')
    print('\nSaving column overview to', overview_output_file, '\n')    
    overview_table.to_csv(overview_output_file, encoding='utf-8', index=False)

    print('\nPipeline done!', '\n')
    return merged_dataset, overview_table, empty_values_table, datasets

In [24]:
mexico, overview, empties, datasets = do_pipeline('mexican_federal_budget.2010_to_2016.merged.iteration2.csv')


Loading files...

Loading Cuenta_Publica_2010.csv with encoding: windows-1252
Loading Cuenta_Publica_2011.csv with encoding: windows-1252
Loading Cuenta_Publica_2012.csv with encoding: windows-1252
Loading Cuenta_Publica_2013.csv with encoding: windows-1252
Loading Cuenta_Publica_2014.csv with encoding: windows-1252
Loading Cuenta_Publica_2015.csv with encoding: windows-1252
Loading PEF2016_AC01.csv with encoding: cp850

Delete empty columns...

2011 Unnamed: 25 deleted
2011 Unnamed: 26 deleted
2011 Unnamed: 27 deleted
2011 Unnamed: 28 deleted
2011 Unnamed: 29 deleted
2011 Unnamed: 30 deleted
2011 Unnamed: 31 deleted
2011 Unnamed: 32 deleted
2011 Unnamed: 33 deleted
2011 Unnamed: 34 deleted
2011 Unnamed: 35 deleted
2011 Unnamed: 36 deleted
2011 Unnamed: 37 deleted
2011 Unnamed: 38 deleted
2011 Unnamed: 39 deleted
2011 Unnamed: 40 deleted
2011 Unnamed: 41 deleted

Strip padding from all cells...

2010 stripped padding
2011 stripped padding
2012 stripped padding
2013 stripped padding
20

## Quality control

In [16]:
# for value in cp[2012]['Descripción de la entidad federativa'].values:
#     try:
#         value.strip()
#     except:
#         print(value)
#         print(type(value))
#         print(str(value))

for year, df in datasets.items():
    print(year, df.duplicated().apply(lambda x: 1 if x is True else 0).sum())

2016 0
2010 34
2011 37
2012 560
2013 0
2014 47
2015 16


In [25]:
empties

Unnamed: 0,Column,2010,2011,2012,2013,2014,2015,2016
0,Descripción de Grupo Funcional,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Objeto del Gasto,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Tipo de Gasto,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Descripción de Unidad Responsable,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Modalidad del Programa presupuestario,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,Subfunción,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Descripción de la modalidad del programa presu...,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,Descripción de Reasignacion,,,,,,,0.0
8,Descripción de Tipo de Gasto,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,Programa Presupuestario,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
mexico['Descripción de Tipo de Gasto'].value_counts()

Gasto corriente                                                                                                                                                                     1020963
Gasto de obra pública                                                                                                                                                                199927
Gasto corriente por concepto de gastos indirectos de programas de subsidios                                                                                                           72246
Gasto de capital diferente de obra pública                                                                                                                                            30285
Participaciones                                                                                                                                                                         676
Gasto de inversión por concepto de gastos indirectos de prog

In [19]:
ls -lh | grep mexican_federal_budget.2010_to_2016

-rw-rw-r-- 1 loic loic 610M Aug 10 10:15 mexican_federal_budget.2010_to_2016.merged.backup.csv
-rw-rw-r-- 1 loic loic 2.1K Aug 10 10:19 mexican_federal_budget.2010_to_2016.merged.columns.csv
-rw-rw-r-- 1 loic loic 610M Aug 10 10:19 mexican_federal_budget.2010_to_2016.merged.csv
-rw-rw-r-- 1 loic loic 2.1K Aug 11 17:32 mexican_federal_budget.2010_to_2016.merged.iteration2.columns.csv
-rw-rw-r-- 1 loic loic 610M Aug 11 16:39 mexican_federal_budget.2010_to_2016.merged.iteration2.csv
-rw-rw-r-- 1 loic loic 2.1K Aug 11 17:32 mexican_federal_budget.2010_to_2016.merged.iteration2.missing.csv
-rw-rw-r-- 1 loic loic 379K Aug 10 07:36 mexican_federal_budget.2010_to_2016.merged.sample1000.csv


In [20]:
cat mexican_federal_budget.2010_to_2016.merged.csv | head -n 1

Actividad Institucional,Adefas,Aprobado,Ciclo,Clave de cartera,Descripción de Fuente de Financiamiento,Descripción de Función,Descripción de Grupo Funcional,Descripción de Objeto del Gasto,Descripción de Programa Presupuestario,Descripción de Ramo,Descripción de Reasignacion,Descripción de Subfunción,Descripción de Tipo de Gasto,Descripción de Unidad Responsable,Descripción de la Actividad Institucional,Descripción de la entidad federativa,Descripción de la modalidad del programa presupuestario,Devengado,Ejercicio,Ejercido,Entidad Federativa,Fuente de Financiamiento,Función,Grupo Funcional,Modalidad del Programa presupuestario,Modificado,Objeto del Gasto,Pagado,Programa Presupuestario,Ramo,Reasignacion,Subfunción,Tipo de Gasto,Unidad Responsable
cat: write error: Broken pipe


In [21]:
datasets[2014]['Descripción de Función'].value_counts()

Transporte                                                                                       52861
Coordinación de la Política de Gobierno                                                          34572
Justicia                                                                                         19123
Agropecuaria, Silvicultura, Pesca y Caza                                                         18344
Protección Ambiental                                                                             15930
Asuntos Económicos, Comerciales y Laborales en General                                           14747
Protección Social                                                                                13398
Educación                                                                                        12635
Salud                                                                                            10325
Otros Servicios Generales                                                

In [22]:
datasets[2014]['Función'].

SyntaxError: invalid syntax (<ipython-input-22-f9087391caa5>, line 1)

In [None]:
nan in mexico['Descripción de Objeto del Gasto'].unique()

In [None]:
nan in list(mexico['Descripción de la entidad federativa'].unique())

In [None]:
from numpy import nan

nan in list(mexico['Entidad Federativa'].unique())

In [None]:
'0' in mexico['Entidad Federativa'].unique()

In [None]:
from numpy import count_nonzero

count_nonzero(mexico['Entidad Federativa'].isnull()

In [None]:
datasets[2012]['Programa Presupuestario'].apply(lambda x: 1 if x is nan else 0).sum()