In [4]:
import pandas as pd
import pathlib
import os


# Note: This code uses data that is not in the repository since it is too big. It is the raw data from CONEVAL

In [5]:
# Dictionary to map state codes to state names
state_names = {
    "1": "AGUASCALIENTES",
    "2": "BAJA CALIFORNIA",
    "3": "BAJA CALIFORNIA SUR",
    "4": "CAMPECHE",
    "5": "COAHUILA",
    "6": "COLIMA",
    "7": "CHIAPAS",
    "8": "CHIHUAHUA",
    "9": "DISTRITO FEDERAL",
    "10": "DURANGO",
    "11": "GUANAJUATO",
    "12": "GUERRERO",
    "13": "HIDALGO",
    "14": "JALISCO",
    "15": "MEXICO",
    "16": "MICHOACAN",
    "17": "MORELOS",
    "18": "NAYARIT",
    "19": "NUEVO LEON",
    "20": "OAXACA",
    "21": "PUEBLA",
    "22": "QUERETARO",
    "23": "QUINTANA ROO",
    "24": "SAN LUIS POTOSI",
    "25": "SINALOA",
    "26": "SONORA",
    "27": "TABASCO",
    "28": "TAMAULIPAS",
    "29": "TLAXCALA",
    "30": "VERACRUZ",
    "31": "YUCATAN",
    "32": "ZACATECAS"
}

In [6]:
current_dir = pathlib.Path(os.getcwd())
data_directory = current_dir.parents[1] / "data"

initial_data_2016_filename = data_directory / "pobreza16.csv"
initial_data_2018_filename = data_directory / "pobreza18.csv"
initial_data_2020_filename = data_directory / "pobreza20.csv"
initial_data_2022_filename = data_directory / "pobreza22.csv"

initial_data_2016 = pd.read_csv(initial_data_2016_filename)
initial_data_2018 = pd.read_csv(initial_data_2018_filename)
initial_data_2020 = pd.read_csv(initial_data_2020_filename)
initial_data_2022 = pd.read_csv(initial_data_2022_filename)

def clean_dataframe(initial_df, state_code_to_name):
    """
    Function that clean CONEVAL's dfs. Keeps only the variables we are going to
    use, uses the expansion factor to replicate accurate poverty rates, and
    groups by state to get the poverty rate by state (observations are households)

    Inputs:
        initial_df (pd.DataFrame): dataframe obtained from CONEVAL's webpage
        state_code_to_name (dict): dictionary that maps state code to name in order
            to have the name as a column
    
    Return:
        grouped_by_state_df (pd.DataFrame): dataframe with poverty rates by state
    """
    # Only keep variables of interest
    initial_df = initial_df[["ent", "factor", "ic_ali", "plp_e"]]
    # Use the expansion factor
    expanded_df = initial_df.loc[initial_df.index.repeat(initial_df["factor"])]
    # Group data by state
    grouped_by_state_df = expanded_df.groupby("ent").agg({
        "ic_ali": "mean",
        "plp_e": "mean"
    }).reset_index()
    grouped_by_state_df["state_name"] = grouped_by_state_df["ent"].astype(str).map(state_code_to_name)
    return grouped_by_state_df


FileNotFoundError: [Errno 2] No such file or directory: '/Users/danielm/Documents/UChicago/Harris/data_visualization/Poverty_Alimentary_Poverty_in_Mexico/data/pobreza16.csv'

In [42]:
# Create one dataframe that has all the years

collapsed_data_2016 = clean_dataframe(initial_data_2016, state_names)
collapsed_data_2016["year"] = 2016
collapsed_data_2018 = clean_dataframe(initial_data_2018, state_names)
collapsed_data_2018["year"] = 2018
collapsed_data_2020 = clean_dataframe(initial_data_2020, state_names)
collapsed_data_2020["year"] = 2020
collapsed_data_2022 = clean_dataframe(initial_data_2022, state_names)
collapsed_data_2022["year"] = 2022

final_df = pd.concat([collapsed_data_2016, collapsed_data_2018, collapsed_data_2020, collapsed_data_2022]).reset_index()
# Rename columns to have better readability
final_df = final_df.rename(columns={"ent": "state_code", "ic_ali": "alimentary_poverty", "plp_e": "extreme_poverty"})


In [43]:
final_df

Unnamed: 0,index,state_code,alimentary_poverty,extreme_poverty,state_name,year
0,0,1,0.171967,0.084531,AGUASCALIENTES,2016
1,1,2,0.149487,0.047609,BAJA CALIFORNIA,2016
2,2,3,0.191848,0.052915,BAJA CALIFORNIA SUR,2016
3,3,4,0.255969,0.144365,CAMPECHE,2016
4,4,5,0.161234,0.096119,COAHUILA,2016
...,...,...,...,...,...,...
123,27,28,0.121553,0.080343,TAMAULIPAS,2022
124,28,29,0.187368,0.169315,TLAXCALA,2022
125,29,30,0.198500,0.187598,VERACRUZ,2022
126,30,31,0.135700,0.101918,YUCATAN,2022


In [44]:
# Create new file in data directory using pathlib

filename = data_directory / "clean_dataset.csv"
final_df.to_csv(filename)