In [None]:

import os, re, json
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Delivery folder layout: every path below hangs off BASE_DIR.
BASE_DIR = "/mnt/data/entrega_proddar"
NOTEBOOKS_DIR, DATA_DIR, OUTPUTS_DIR = (
    os.path.join(BASE_DIR, sub) for sub in ("notebooks", "data", "outputs")
)
RAW_DIR, PROCESSED_DIR = (
    os.path.join(DATA_DIR, sub) for sub in ("raw", "processed")
)

# Create the whole tree up front so later cells can write without checking.
for _folder in (BASE_DIR, NOTEBOOKS_DIR, DATA_DIR, RAW_DIR, PROCESSED_DIR, OUTPUTS_DIR):
    os.makedirs(_folder, exist_ok=True)

# The raw CSV must already be in place; stop here if it is not.
INPUT_CSV = os.path.join(RAW_DIR, "beneficiarios_beca_proddar.csv")
assert os.path.exists(INPUT_CSV), f"No se encontró el CSV en {INPUT_CSV}"


In [None]:
def smart_read_csv(path, encodings=None):
    """Read a CSV file, trying several text encodings until one decodes it.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    encodings : iterable of str, optional
        Encodings to try, in order. Defaults to
        ``("utf-8-sig", "utf-8", "cp1252", "latin-1")``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with the first encoding that decodes cleanly.

    Raises
    ------
    UnicodeError
        The last decoding error, if every encoding fails.
    ValueError
        If ``encodings`` is empty.

    Any non-encoding error (missing file, malformed CSV, ...) propagates
    immediately: retrying with another encoding cannot fix it.
    """
    if encodings is None:
        # latin-1 maps every possible byte, so it never fails and must come
        # last; cp1252 goes before it so bytes such as 0x80 (the euro sign) or
        # curly quotes are decoded as printable characters.
        encodings = ("utf-8-sig", "utf-8", "cp1252", "latin-1")

    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeError as e:
            last_err = e

    if last_err is None:
        raise ValueError("encodings must contain at least one encoding name")
    raise last_err

# Load the raw CSV, report its dimensions and preview the first three rows.
df = smart_read_csv(path=INPUT_CSV)
print("Filas x Columnas:", df.shape)
df.head(n=3)

In [None]:
import pandas as pd
import numpy as np
import re

def standardize_columns(df):
    """Return a copy of ``df`` with normalized column names.

    Each label is converted to text, stripped of surrounding whitespace,
    lower-cased, and has every internal run of whitespace replaced by a single
    underscore. Non-string labels (e.g. the integer ``2020``) are converted
    with ``str`` instead of raising.

    Note: distinct original labels that normalize to the same text will
    produce duplicate column names in the result.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A renamed frame; the input is left untouched.
    """
    colmap = {c: re.sub(r"\s+", "_", str(c).strip().lower()) for c in df.columns}
    return df.rename(columns=colmap)

def trim_strings(df):
    """Strip whitespace from every object column, keeping missing values missing.

    Values in object-dtype columns are converted to text and stripped.
    Entries that were missing in the input (``None`` / ``NaN``) stay missing
    instead of becoming the literal strings ``"None"`` / ``"nan"``, and
    entries that are empty after stripping are turned into ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for c in df.select_dtypes(include=["object"]).columns:
        col = df[c]
        stripped = col.astype(str).str.strip()
        # Decide what to keep from the ORIGINAL column: astype(str) above has
        # already turned missing values into ordinary text.
        keep = col.notna() & (stripped != "")
        df[c] = stripped.where(keep, np.nan)
    return df

def parse_dates(df, min_success=0.8):
    """Convert date-like text columns to datetime without destroying data.

    A column is a candidate when its name contains one of the keywords
    ``fecha``, ``year``, ``anio``, ``año`` or ``mes``. A candidate is only
    converted when all of the following hold:

    * it is not already numeric or datetime (integers such as ``2020`` would
      otherwise be read as nanoseconds since 1970);
    * its values are not all bare 1-4 digit numbers (year / month numbers are
      left for numeric handling);
    * at least ``min_success`` of its non-null values parse as dates.

    Columns that fail any check are left exactly as they were, so text such as
    month names is not wiped out to ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    min_success : float, default 0.8
        Minimum fraction of non-null values that must parse for the column to
        be replaced by its datetime version.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    import warnings

    keywords = ("fecha", "year", "anio", "año", "mes")
    for c in df.columns:
        if not any(k in str(c) for k in keywords):
            continue

        col = df[c]
        if pd.api.types.is_numeric_dtype(col) or pd.api.types.is_datetime64_any_dtype(col):
            continue

        present = col.dropna()
        if present.empty:
            continue
        if present.astype(str).str.match(r"^\d{1,4}$").all():
            continue

        try:
            parsed = pd.to_datetime(col, errors="coerce", dayfirst=True)
        except (ValueError, TypeError, OverflowError) as exc:
            # Best effort: keep the column as-is, but say so instead of hiding it.
            warnings.warn(f"parse_dates: could not parse column {c!r}: {exc}")
            continue

        # Only accept the conversion when it keeps (almost) all the information.
        if parsed.notna().sum() / len(present) >= min_success:
            df[c] = parsed
    return df

# Remember the header exactly as read, then run the cleaning steps in order:
# normalize names -> trim text -> parse dates.
original_columns = df.columns.tolist()
df = parse_dates(trim_strings(standardize_columns(df)))

# Promote text columns to numeric when more than 80 % of their non-null values
# look like plain numbers (a comma is accepted as the decimal separator).
for col_name in df.columns:
    if df[col_name].dtype != "object":
        continue
    present = df[col_name].dropna()
    if len(present) == 0:
        continue
    normalized = present.astype(str).str.replace(",", ".", regex=False)
    numeric_share = normalized.str.match(r"^-?\d+(\.\d+)?$").sum() / len(normalized)
    if numeric_share > 0.8:
        # Values that still do not parse become NaN.
        df[col_name] = pd.to_numeric(normalized, errors="coerce")

print("Columnas originales:", original_columns)
print("Columnas procesadas:", list(df.columns))
df.head(5)

In [None]:
# Missing values per column, worst offenders first.
nulls = df.isna().sum().sort_values(ascending=False)
print("Nulos por columna (top 10):")
print(nulls.head(10))

print("\nTipos de datos:")
print(df.dtypes)

# describe(include=...) raises ValueError when no column of the requested kind
# exists, so check first and keep the cell runnable on any dataset.
print("\nEstadísticas numéricas:")
if df.select_dtypes(include=[np.number]).shape[1] > 0:
    print(df.describe(include=[np.number]).T)
else:
    print("(sin columnas numéricas)")

print("\nEstadísticas categóricas:")
if df.select_dtypes(include=["object"]).shape[1] > 0:
    print(df.describe(include=['object']).T)
else:
    print("(sin columnas categóricas)")

In [None]:
def choose_categorical_for_plot(df):
    """Pick the column best suited for a category bar chart.

    Preference order:

    1. Object or categorical columns with 2-30 distinct non-null values,
       ranked by ``100 - n_unique`` plus a 50-point bonus when the column
       name contains a familiar keyword (region, sexo, programa, ...). Ties
       keep the original column order.
    2. Otherwise, the object or categorical column with the fewest distinct
       values among those that have more than one.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    column label or None
        ``None`` when no object/categorical column has at least two distinct
        non-null values.
    """
    boosts = ["region","comuna","sexo","genero","género","disciplina","programa","estado","tipo","modalidad","convocatoria","año","anio","year"]

    def is_categorical_like(series):
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        return series.dtype == "object" or isinstance(series.dtype, pd.CategoricalDtype)

    def score(name, nunique):
        base = 100 - nunique
        if any(b in str(name).lower() for b in boosts):
            base += 50
        return base

    cardinalities = [
        (c, df[c].nunique(dropna=True))
        for c in df.columns
        if is_categorical_like(df[c])
    ]

    candidates = [(c, n) for c, n in cardinalities if 2 <= n <= 30]
    if candidates:
        # list.sort is stable, so equal scores keep column order.
        candidates.sort(key=lambda x: score(x[0], x[1]), reverse=True)
        return candidates[0][0]

    # Fallback: skip constant/empty columns first, so one constant column
    # cannot hide a usable (if high-cardinality) one.
    usable = [(c, n) for c, n in cardinalities if n > 1]
    if usable:
        return min(usable, key=lambda x: (x[1], str(x[0])))[0]
    return None

# Choose the column to chart and report the choice.
cat_col = choose_categorical_for_plot(df)
print("Columna categórica elegida:", cat_col)

if cat_col is None:
    print("No se encontró una columna categórica adecuada para graficar.")
else:
    # Bar chart of the 15 most frequent categories (missing values excluded).
    counts = df[cat_col].value_counts(dropna=True).head(15)
    fig, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax)
    ax.set_title(f"Top 15 categorías por '{cat_col}'")
    ax.set_xlabel(cat_col)
    ax.set_ylabel("Recuento")
    fig.tight_layout()

In [None]:

# --- Export the cleaned table ------------------------------------------------
processed_csv = os.path.join(PROCESSED_DIR, "beneficiarios_beca_proddar_procesado.csv")
df.to_csv(processed_csv, index=False)

# --- Export the chart --------------------------------------------------------
# `cat_col` is set by the plotting cell; tolerate that cell not having been run.
chosen_col = globals().get("cat_col")

if chosen_col is not None:
    # Keep the file name filesystem-safe even if the column label contains
    # characters such as "/" or ":".
    safe_name = re.sub(r"[^\w\-]+", "_", str(chosen_col))
    plot_path = os.path.join(OUTPUTS_DIR, f"visualizacion_final_por_{safe_name}.png")

    # The notebook's inline backend closes every figure at the end of the cell
    # that drew it, so plt.savefig() here would write a blank canvas. Rebuild
    # the chart on a figure owned by this cell and save that figure explicitly.
    export_counts = df[chosen_col].value_counts(dropna=True).head(15)
    export_fig, export_ax = plt.subplots()
    export_counts.plot(kind="bar", ax=export_ax)
    export_ax.set_title(f"Top 15 categorías por '{chosen_col}'")
    export_ax.set_xlabel(chosen_col)
    export_ax.set_ylabel("Recuento")
    export_fig.tight_layout()
    export_fig.savefig(plot_path, dpi=180, bbox_inches="tight")
    plt.close(export_fig)  # already shown by the plotting cell; avoid a duplicate
    print("Gráfico exportado a:", plot_path)

# --- Export run metadata -----------------------------------------------------
metadata_path = os.path.join(BASE_DIR, "metadata.json")
metadata = {
    "created_at": datetime.now().isoformat(),
    "input_csv": INPUT_CSV,
    "original_columns": original_columns,
    "processed_columns": list(df.columns),
    "chosen_categorical": chosen_col,
    "processed_csv": processed_csv,
}
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

print("CSV procesado en:", processed_csv)
print("Metadata en:", metadata_path)