In [48]:
import json

# Parse the raw 2011 capacity JSON file (UTF-8) into `data`.
with open(
    '../../data/00-map/capacity/raw/2011/CE_2011_2.json',
    mode='r',
    encoding='utf-8',
) as json_file:
    data = json.load(json_file)


In [49]:
# Pull the list of tables out of the parsed document's "analyzeResult" section.
analyze_result = data["analyzeResult"]
tables = analyze_result["tables"]

In [None]:
# Search terms, stored lower-cased so they can be compared against
# lower-cased cell text.
keywords = [term.lower() for term in ("total", "subtotal", "gran total")]


def row_contains_keyword(row):
    """Tell whether any cell of ``row`` contains one of the ``keywords``.

    Every cell is converted to text and lower-cased before the substring
    check, so matching is case-insensitive with respect to the cell content.
    The terms are read from the module-level ``keywords`` list at call time.
    """
    cell_texts = row.astype(str).str.lower()
    cell_has_keyword = cell_texts.apply(
        lambda cell: any(kw in cell for kw in keywords)
    )
    return cell_has_keyword.any()

# Keep only the rows in which at least one cell mentions a keyword.
# NOTE(review): no cell visible above this one creates `df`, so it has to
# exist in the kernel already — confirm where it is supposed to come from.
keyword_row_mask = df.apply(row_contains_keyword, axis=1)
df = df[keyword_row_mask]

In [None]:
import pandas as pd
import numpy as np
# Function to check if any keyword is in the row
def column_contains_keyword(series, terms=None):
    """Return True when any non-missing cell of ``series`` contains a keyword.

    Matching is a case-insensitive substring test: both the cell text and the
    search terms are lower-cased here, so callers do not have to pre-normalise
    the keyword list.

    Parameters
    ----------
    series : pandas.Series
        One table column; cells may be strings, numbers or missing values.
    terms : iterable of str, optional
        Search terms. When omitted, the module-level ``keywords`` list is
        read at call time.

    Returns
    -------
    bool
    """
    search_terms = keywords if terms is None else terms
    lowered_terms = [str(term).lower() for term in search_terms]
    # Missing cells are dropped first; otherwise astype(str) would turn them
    # into the literal text "nan", which a short term could match by accident.
    cell_texts = series.dropna().astype(str).str.lower()
    matches = cell_texts.apply(
        lambda val: any(term in val for term in lowered_terms)
    )
    return bool(matches.any())

# Header/label fragments that identify the columns worth keeping. They are
# lower-cased once, up front, because column_contains_keyword() compares them
# against lower-cased cell text and reads this module-level list.
keywords = [
    k.lower()
    for k in ["TOTAL", "Capacidad", "Subtotal", "Entidad Federativa y Centro de Reclusión"]
]

# One cleaned frame per table; they are stacked once after the loop instead of
# re-copying the growing result on every iteration.
table_frames = []
for table in tables:
    # Rebuild the table as a rowCount x columnCount grid, leaving NaN wherever
    # a cell is absent or has only whitespace content.
    grid = [[np.nan] * table['columnCount'] for _ in range(table['rowCount'])]
    for cell in table['cells']:
        val = cell.get('content', '').strip()
        if val:
            grid[cell['rowIndex']][cell['columnIndex']] = val

    df = pd.DataFrame(grid)
    # Drop fully empty rows.
    df = df.loc[~df.isna().all(axis=1)]

    # Keep only the columns in which some cell mentions a keyword, then label
    # the survivors positionally (col1, col2, ...) so that columns from
    # different tables line up when stacked.
    columns_to_keep = [col for col in df.columns if column_contains_keyword(df[col])]
    df = df[columns_to_keep].copy()
    df.columns = [f"col{i+1}" for i in range(df.shape[1])]

    table_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# document has no tables.
df_final = pd.concat(table_frames) if table_frames else pd.DataFrame()

In [56]:
# Display the combined table built from the 2011 file.
df_final

Unnamed: 0,col1,col2,col3,col4,col5
0,Entidad Federativa y Centro de Reclusión,Capacidad,,,TOTAL
1,,,Subtotal,Subtotal,
2,,,,,
4,AGUASCALIENTES,1270,1036,189,1225
6,CERESO Varonil Aguascalientes,600,516,118,634
...,...,...,...,...,...
23,"CEFERESO No. 4 ""Noroeste"" (4)",1456,82,1303,1385
24,"CEFERESO No. 5 ""Oriente""",2538,356,2167,2523
25,"CEFERESO No. 7 ""Nor-Noroeste""",480,233,229,462
26,CEFEREPSI,460,143,158,301


In [None]:
# One-off test export of the combined table to an Excel workbook (row index
# omitted).
# NOTE(review): the original note on this cell mentioned 2012 and a later goal
# of processing every month, but the path targets the 2011 folder — confirm
# which year was intended.
df_final.to_excel('../../data/00-map/capacity/raw/2011/CE_2011_2.xlsx', index=False)

In [98]:
## 2012 and onwards

import json
import pandas as pd
import numpy as np

# Variables

# Header/label fragments used to decide which table columns to keep.
# They are written here in their original casing; the processing loops
# lower-case the list before it is used for matching.
keywords = [
    "TOTAL",
    "Capacidad",
    "Subtotal",
    "Entidad Federativa y Centro de Reclusión",
    "ENTIDAD FEDERATIVA E INSTITUCIÓN PENITENCIARIA FEDERAL",
    "ESPACIOS",
    "Entidad Federativa / Institución Penitenciaria Federal",
    "CAPACIDAD",
]

# Functions: 

def should_drop(val):
    """Decide whether a cell value marks something to discard.

    Returns True for a missing value (as judged by ``pd.isna``) and for any
    value whose stripped, lower-cased text contains
    "entidad federativa y centro de reclusión"; returns False otherwise.
    """
    if not pd.isna(val):
        normalized = str(val).strip().lower()
        return "entidad federativa y centro de reclusión" in normalized
    return True

def column_contains_keyword(series, terms=None):
    """Return True when any non-missing cell of ``series`` contains a keyword.

    Matching is a case-insensitive substring test: both the cell text and the
    search terms are lower-cased here, so the mixed-case ``keywords`` list can
    be used as written.

    Parameters
    ----------
    series : pandas.Series
        One table column; cells may be strings, numbers or missing values.
    terms : iterable of str, optional
        Search terms. When omitted, the module-level ``keywords`` list is
        read at call time.

    Returns
    -------
    bool
    """
    search_terms = keywords if terms is None else terms
    lowered_terms = [str(term).lower() for term in search_terms]
    # Missing cells are dropped first; otherwise astype(str) would turn them
    # into the literal text "nan", which a short term could match by accident.
    cell_texts = series.dropna().astype(str).str.lower()
    matches = cell_texts.apply(
        lambda val: any(term in val for term in lowered_terms)
    )
    return bool(matches.any())

# Load JSON data

# Labels applied to the third to fifth kept columns after positional renaming.
# NOTE(review): the target names (col9/col14/col16) presumably mirror column
# positions in another layout — confirm.
_KEPT_COLUMN_RELABEL = {"col3": "col9", "col4": "col14", "col5": "col16"}

# Each entry pairs a list of years with the text that, when found in col1
# (compared in upper case), starts a new month block for those years.
_YEAR_GROUPS = [
    ([2012, 2015], 'TOTAL'),
    ([2016], 'FEDERALES'),
]


def _load_tables(year):
    """Read the year's CE_ocr_all.json and return its analyzeResult tables list."""
    with open(f'../../data/00-map/capacity/raw/{year}/CE_ocr_all.json', 'r', encoding='utf-8') as f:
        return json.load(f)["analyzeResult"]["tables"]


def _table_to_frame(table):
    """Rebuild one table dict as a DataFrame labelled col1..colN, without fully empty rows."""
    grid = [[np.nan] * table['columnCount'] for _ in range(table['rowCount'])]
    for cell in table['cells']:
        val = cell.get('content', '').strip()
        # Cells with no (or whitespace-only) content stay NaN.
        if val:
            grid[cell['rowIndex']][cell['columnIndex']] = val

    frame = pd.DataFrame(grid)
    frame = frame.loc[~frame.isna().all(axis=1)].copy()
    frame.columns = [f"col{i+1}" for i in range(frame.shape[1])]
    return frame


def _select_keyword_columns(frame):
    """Keep col1 plus every column that mentions a keyword, then relabel the result."""
    columns_to_keep = [col for col in frame.columns if column_contains_keyword(frame[col])]

    # The first column is always kept, even when no keyword appears in it.
    if 'col1' in frame.columns and 'col1' not in columns_to_keep:
        columns_to_keep.insert(0, 'col1')

    kept = frame[columns_to_keep].copy()
    if kept.empty:
        print("⚠️ Warning: DataFrame is empty.")

    # Number the surviving columns positionally, then apply the fixed relabel
    # so that every table contributes the same column names.
    positional = [f"col{i+1}" for i in range(kept.shape[1])]
    kept.columns = [_KEPT_COLUMN_RELABEL.get(name, name) for name in positional]
    return kept


def _build_year_frame(tables, month_marker):
    """Stack all cleaned tables of one year and add a running ``month`` index.

    ``month`` is 1 plus the number of rows, up to and including the current
    one, whose col1 text contains ``month_marker`` (compared in upper case).
    A marker row therefore opens the next block.
    NOTE(review): if a marker row is missing or mis-read, two blocks end up
    under one index — check the per-month row counts after running.
    """
    frames = [_select_keyword_columns(_table_to_frame(table)) for table in tables]
    # pd.concat raises on an empty list; an empty frame keeps the failure on
    # the missing 'col1' column below.
    stacked = pd.concat(frames) if frames else pd.DataFrame()

    is_marker = stacked['col1'].astype(str).str.upper().str.contains(month_marker, regex=False)
    # cumsum yields a value for every row, so nothing needs to be filled afterwards.
    stacked['month'] = is_marker.cumsum() + 1
    return stacked


# column_contains_keyword() reads the module-level list and compares it with
# lower-cased cell text, so normalise it once before any table is processed.
keywords = [k.lower() for k in keywords]

for years, month_marker in _YEAR_GROUPS:
    for year in years:
        tables = _load_tables(year)
        df_final = _build_year_frame(tables, month_marker)
        print(year)
        df_final.to_excel(f'../../data/00-map/capacity/raw/{year}/CE_{year}_all.xlsx',
                          index=False)

2012
2015
2016


In [92]:
# Count the non-null cells of every column within each month index.
df_final.groupby('month').count()

Unnamed: 0_level_0,col1,col2,col9,col14,col16
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,437,437,429,304,429
2,439,439,430,309,430
3,873,873,856,619,856
4,437,437,421,307,421
5,428,428,415,309,415
6,851,851,827,606,827
7,428,428,417,305,417
8,429,426,387,330,317
9,427,421,367,353,272
10,426,426,415,301,414
