In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder

# === 1. Cargar datos ===
ruta = "./../Registros_sin_nulos.csv"
df = pd.read_csv(ruta)

# === 2. Limpiar y preparar ===
df = df.drop(columns=["nosocio", "nocredito", "sucursal"], errors="ignore")

# Asegurar que el target estÃ© definido
df["target"] = df["diasmora"].apply(lambda x: 1 if x > 7 else 0)
df = df.drop(columns=["diasmora"], errors="ignore")

# === 3. Separar variables numÃ©ricas y categÃ³ricas ===
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# === 4. CorrelaciÃ³n entre variables numÃ©ricas ===
corr_num = df[num_cols].corr(method='spearman').round(2)

# === 5. Dependencia entre variables categÃ³ricas (Cramer's V) ===
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

cramer_results = []
for i, col1 in enumerate(cat_cols):
    for col2 in cat_cols[i+1:]:
        v = cramers_v(df[col1], df[col2])
        if v > 0.2:  # umbral para dependencia relevante
            cramer_results.append((col1, col2, round(v, 3)))

cramer_df = pd.DataFrame(cramer_results, columns=["Var1", "Var2", "CramerV"]).sort_values("CramerV", ascending=False)

# === 6. Dependencia entre numÃ©ricas y categÃ³ricas (correlation ratio Î·) ===
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_means = [measurements[fcat == i].mean() for i in range(len(np.unique(fcat)))]
    overall_mean = measurements.mean()
    n_cat = np.bincount(fcat)
    numerator = np.sum(n_cat * (cat_means - overall_mean)**2)
    denominator = np.sum((measurements - overall_mean)**2)
    return np.sqrt(numerator / denominator) if denominator != 0 else 0

num_cat_results = []
for cat in cat_cols:
    for num in num_cols:
        eta = correlation_ratio(df[cat], df[num])
        if eta > 0.2:
            num_cat_results.append((cat, num, round(eta, 3)))

num_cat_df = pd.DataFrame(num_cat_results, columns=["CategÃ³rica", "NumÃ©rica", "Eta"]).sort_values("Eta", ascending=False)

# === 7. Guardar resultados ===
corr_num.to_csv("./correlaciones_numericas.csv")
cramer_df.to_csv("./dependencia_categoricas.csv", index=False)
num_cat_df.to_csv("./dependencia_mixta.csv", index=False)

print("âœ… AnÃ¡lisis completado:")
print(f"ðŸ“ˆ correlaciones_numericas.csv â†’ {corr_num.shape}")
print(f"ðŸ”¤ dependencia_categoricas.csv â†’ {len(cramer_df)} pares relevantes")
print(f"ðŸ”€ dependencia_mixta.csv â†’ {len(num_cat_df)} pares relevantes")


  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
  return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))


âœ… AnÃ¡lisis completado:
ðŸ“ˆ correlaciones_numericas.csv â†’ (12, 12)
ðŸ”¤ dependencia_categoricas.csv â†’ 10 pares relevantes
ðŸ”€ dependencia_mixta.csv â†’ 21 pares relevantes
