In [11]:
# Cell: Inspect first person’s *valid* responses (exclude "-9" and blanks)
import os
import pandas as pd
import chardet
import json

# ── 1) Paths (relative to this notebook) ────────────────────────────────────
RAW_DATA = os.path.join("..","data","raw","cep_base_consolidada_2010_ahora.csv")
RAW_DICT = os.path.join("..","data","raw","cep_diccionario_de_variables_base_consolidada.csv")

# ── 2) Detect encoding of the dictionary ────────────────────────────────────
with open(RAW_DICT, "rb") as f:
    sample = f.read(100_000)
enc = chardet.detect(sample)["encoding"]

# ── 3) Load & normalize dictionary columns ─────────────────────────────────
dict_df = pd.read_csv(
    RAW_DICT,
    dtype=str,
    keep_default_na=False,
    encoding=enc,
    encoding_errors="replace"
)

# Fuzzy‐find the key columns
cols_lower = {c: c.lower() for c in dict_df.columns}
def pick(*keywords):
    for col, low in cols_lower.items():
        if any(k in low for k in keywords):
            return col
    raise KeyError(f"No column matches {keywords}")

dict_df = dict_df.rename(columns={
    pick("variab"):     "variable",
    pick("pregunt"):    "pregunta",
    pick("alternat"):   "alternativa",
    pick("etiquet"):    "etiqueta"
})[["variable","pregunta","alternativa","etiqueta"]]

# ── 4) Load only the first person from the data ─────────────────────────────
data_df = pd.read_csv(
    RAW_DATA,
    dtype=str,
    keep_default_na=False,
    nrows=1,
    encoding_errors="replace"
).reset_index().rename(columns={"index":"row_id"})

# ── 5) Melt to long form & join metadata ──────────────────────────────────
long = data_df.melt(
    id_vars="row_id",
    var_name="variable",
    value_name="valor"
)
joined = long.merge(dict_df, on="variable", how="left")

# ── 6) Exclude rows where alternativa is "-9" or "" ───────────────────────
valid = joined[~joined["alternativa"].isin([""])]

# ── 7) Build a JSON-like dict for that first person ──────────────────────
person0 = {"row_id": 0}
for _, row in valid.iterrows():
    person0[row["variable"]] = {
        "valor":       row["valor"],
        "pregunta":    row["pregunta"],
        "alternativa": row["alternativa"],
        "etiqueta":    row["etiqueta"]
    }

# ── 8) Pretty-print the result ────────────────────────────────────────────
print(json.dumps(person0, ensure_ascii=False, indent=2))


{
  "row_id": 0,
  "aleatorio_deck": {
    "valor": "",
    "pregunta": "",
    "alternativa": "30",
    "etiqueta": ""
  },
  "aleatorio_deck_1": {
    "valor": "",
    "pregunta": "Aleatorio Deck 1",
    "alternativa": "119",
    "etiqueta": ""
  },
  "aleatorio_deck_2": {
    "valor": "",
    "pregunta": "Aleatorio Deck 2",
    "alternativa": "120",
    "etiqueta": ""
  },
  "aleatorio_deck_3": {
    "valor": "",
    "pregunta": "Aleatorio Deck 3",
    "alternativa": "118",
    "etiqueta": ""
  },
  "aleatorio_deck_4": {
    "valor": "",
    "pregunta": "Aleatorio Deck 4",
    "alternativa": "114",
    "etiqueta": ""
  },
  "anomia_1_a": {
    "valor": "",
    "pregunta": "Qué tan de acuerdo está con las siguientes afirmaciones? Actualmente, las personas deberían vivir el día a día y no preocuparse por el mañana.",
    "alternativa": "-9",
    "etiqueta": "No contesta"
  },
  "anomia_1_b": {
    "valor": "",
    "pregunta": "Qué tan de acuerdo está con las siguientes afirmaciones? C