In [None]:
# Imports
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from scipy import stats
sns.set(style="whitegrid")
%matplotlib inline

# Carga
PATH = "data/pet_adoption_data.csv"
df = pd.read_csv(PATH)
df_original = df.copy()  # guarda copia


In [None]:
# Dataset overview: dimensions, per-column dtype, and number of distinct
# values per column (at most the 50 highest-cardinality columns are shown).
print("Shape:", df.shape)
display(df.dtypes)
n_unique_per_col = df.nunique().sort_values(ascending=False)
display(n_unique_per_col.head(50))

# Target balance: absolute counts (missing values included), followed by the
# percentage of each class among the non-missing values.
target_col = df['AdoptionLikelihood']
display(target_col.value_counts(dropna=False))
target_share_pct = target_col.value_counts(normalize=True) * 100
display(target_share_pct.round(2))


In [None]:
# Placeholder spellings that stand for "missing". The pattern is anchored to the
# whole cell, case-insensitive, and tolerates surrounding whitespace, so it
# matches '', '   ', 'NA', ' n/a ', 'NaN', 'None', 'NULL', 'Unknown', ...
# but never a substring of a real value (e.g. 'Banana' is left alone).
_NULL_TOKEN_REGEX = r'(?i)^\s*(?:na|n/a|nan|none|null|unknown)?\s*$'

def unify_nulls(df):
    """Return a copy of ``df`` in which placeholder "missing" markers are NaN.

    Two kinds of markers are converted:

    * string cells that are empty, whitespace-only, or one of the tokens
      NA, N/A, nan, None, NULL, unknown -- compared case-insensitively and
      ignoring leading/trailing whitespace;
    * the numeric sentinel ``-999`` in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the markers replaced by ``np.nan``.
    """
    # With regex=True only string cells are tested against the pattern;
    # numeric columns pass through this step untouched.
    without_tokens = df.replace(to_replace=_NULL_TOKEN_REGEX, value=np.nan, regex=True)
    # The numeric sentinel is handled by a plain (non-regex) value replacement.
    return without_tokens.replace(-999, np.nan)

# Normalise placeholder tokens to NaN, then show the percentage of missing
# values per column, columns with the most missing data first.
df = unify_nulls(df)
missing_pct = df.isna().sum().div(len(df)).mul(100)
display(missing_pct.sort_values(ascending=False))


In [None]:
# Example of columns to remove from the working frame.
to_drop = ['PetID']

# Only columns that are actually present are dropped, which keeps the cell safe
# to re-run, and the message reports what was really removed rather than the
# requested list (the two differ when a column is already gone).
dropped = [col for col in to_drop if col in df.columns]
df = df.drop(columns=dropped)
print("Dropped:", dropped)


In [None]:
# Candidate categorical features, and the binary flags (the target is already
# coded as 0/1).
cat_cols = ['PetType','Breed','Color','Size']
bin_cols = ['Vaccinated','HealthCondition','PreviousOwner','AdoptionLikelihood']

# Cast each categorical candidate that exists in the frame to the pandas
# `category` dtype; absent columns are skipped silently.
present_cat_cols = [name for name in cat_cols if name in df.columns]
for name in present_cat_cols:
    df[name] = df[name].astype('category')

# Make sure the target is an integer column.
# NOTE(review): astype(int) raises if the target still contains NaN.
target_as_int = df['AdoptionLikelihood'].astype(int)
df['AdoptionLikelihood'] = target_as_int

df.info()


In [None]:
# Summary statistics of every numeric column, one row per column.
numeric_summary = df.select_dtypes(include=[np.number]).describe().T
display(numeric_summary)

# Numeric features to plot, restricted to the ones present in the frame.
num_cols = [c for c in ['AgeMonths','WeightKg','TimeInShelterDays','AdoptionFee'] if c in df.columns]

# Histograms (40 bins, KDE overlay), one panel per feature on a 2x2 grid.
hist_fig = plt.figure(figsize=(14,8))
for position, col in enumerate(num_cols, start=1):
    ax = hist_fig.add_subplot(2, 2, position)
    sns.histplot(df[col].dropna(), kde=True, bins=40, ax=ax)
    ax.set_title(col)
hist_fig.tight_layout()
plt.show()

# Boxplots of the same features, same 2x2 layout.
box_fig = plt.figure(figsize=(14,8))
for position, col in enumerate(num_cols, start=1):
    ax = box_fig.add_subplot(2, 2, position)
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)
box_fig.tight_layout()
plt.show()

# Categorical features: the 10 most frequent values of each one.
for c in cat_cols:
    if c not in df.columns:
        continue
    print("----", c, "----")
    top_values = df[c].value_counts().head(10)
    display(top_values)


In [None]:
# Skewness and kurtosis of each numeric feature, computed on the non-missing
# values and rounded to 3 decimals.
for col in num_cols:
    values = df[col].dropna()
    skewness = round(values.skew(), 3)
    kurt = round(values.kurtosis(), 3)
    print(col, " skew:", skewness, " kurtosis:", kurt)


In [None]:
# Adoption rate (%) per pet type, highest first.
if 'PetType' in df.columns:
    # PetType has been cast to `category` earlier in the notebook. Passing
    # observed=True groups only by categories that actually occur and avoids
    # the FutureWarning pandas emits when `observed` is left unspecified for
    # categorical keys (its default changes between pandas versions).
    prop = (
        df.groupby('PetType', observed=True)['AdoptionLikelihood']
        .mean()
        .sort_values(ascending=False)
    )
    display((prop*100).round(2))

# Boxplot of TimeInShelterDays split by the target. Guarded like the other
# column accesses in this cell so a missing column does not abort the cell.
if 'TimeInShelterDays' in df.columns:
    plt.figure(figsize=(8,4))
    sns.boxplot(x='AdoptionLikelihood', y='TimeInShelterDays', data=df)
    plt.title('TimeInShelterDays vs AdoptionLikelihood')
    plt.show()

# Density of each numeric feature per target class. common_norm=False
# normalises each class separately, so the curves are comparable even when the
# classes have different sizes.
for col in num_cols:
    if col in df.columns:
        plt.figure(figsize=(6,3))
        sns.kdeplot(data=df, x=col, hue='AdoptionLikelihood', common_norm=False)
        plt.title(f'{col} by AdoptionLikelihood')
        plt.show()


In [None]:
# Pearson correlation between the numeric columns; the target is left out
# (errors='ignore' makes that a no-op if the target is not numeric/present).
num_df = df.select_dtypes(include=[np.number])
num_df = num_df.drop(columns=['AdoptionLikelihood'], errors='ignore')
corr = num_df.corr()

# Annotated heatmap with a diverging palette centred on zero.
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='vlag', center=0, ax=ax)
ax.set_title('Correlation matrix (numerical features)')
plt.show()


In [None]:
# Pairwise scatter matrix of the numeric features, coloured by the target.
# Drawn on a fixed-seed sample of at most 2000 rows.
n_sampled = min(2000, len(df))
sample = df.sample(n_sampled, random_state=42)
pair_cols = num_cols + ['AdoptionLikelihood']
sns.pairplot(sample[pair_cols], hue='AdoptionLikelihood', corner=True)
plt.show()


In [None]:
# Example crosstab: share of each target class within every Size level
# (rows are normalised, values shown as percentages).
if 'Size' in df.columns:
    size_vs_target = pd.crosstab(df['Size'], df['AdoptionLikelihood'], normalize='index')
    display(size_vs_target.round(3) * 100)


Reglas de validación de datos (sugerencias)

Incluye en tu notebook un apartado con reglas detectadas, por ejemplo:

AgeMonths debe ser >= 0 y < 240 (si aparece >1000 es error).

WeightKg > 0 y < 200 (filtrar outliers).

AdoptionFee >= 0; si hay ceros, verificar que correspondan a adopciones realmente gratuitas y no a valores faltantes codificados como 0.

TimeInShelterDays no negativo.

Las categorías con muy pocos registros se agrupan en 'Other' (p. ej., razas raras).

In [None]:
# Example of rule-based cleaning. Rows whose value is missing are kept on
# purpose; only rows with an out-of-range value are removed.
age = df['AgeMonths']
weight = df['WeightKg']

# Bounds follow the validation rules listed in this notebook:
#   0 <= AgeMonths < 240   and   0 < WeightKg < 200
# (the upper limits are exclusive, and any strictly positive weight is valid).
age_ok = age.isna() | (age.ge(0) & age.lt(240))
weight_ok = weight.isna() | (weight.gt(0) & weight.lt(200))

# .copy() turns the filtered result into an independent frame, so adding
# columns to `df` afterwards does not raise pandas' SettingWithCopyWarning.
df = df[age_ok & weight_ok].copy()


Features derivados sugeridos

Incluye una sección con posibles features a crear:

AgeYears = AgeMonths / 12 (más interpretable).

IsPuppy = AgeMonths < 12 (boolean).

FeePerKg = AdoptionFee / WeightKg (solo cuando WeightKg está disponible y es > 0, para evitar divisiones por cero o resultados NaN).

ShelterTimeBucket = TimeInShelterDays agrupado en intervalos: 0-30, 31-90 y más de 90 días (etiqueta '90+').

YoungAndLight = indicador que combina edad y peso (p. ej., AgeMonths < 12 y WeightKg por debajo de un umbral a definir).

In [None]:
# Derived features. They are built as separate Series and attached with
# `assign`, which returns a new frame instead of writing into a frame that may
# be the result of an earlier row filter (avoids SettingWithCopyWarning).
age_months = df['AgeMonths']

# Age in years, rounded to 2 decimals; missing ages stay missing.
age_years = (age_months / 12).round(2)

# 1 if younger than 12 months, 0 otherwise, and <NA> when the age is unknown.
# A plain `(age < 12).astype(int)` would label unknown ages as 0 ("not a
# puppy"). Going through float lets the missing ages survive as NaN before the
# cast to the nullable Int64 dtype.
is_puppy = (
    age_months.lt(12)
    .astype(float)
    .where(age_months.notna())
    .astype('Int64')
)

# Shelter-time buckets: (-1, 30], (30, 90], (90, inf). The open upper edge
# keeps very long stays in the last bucket instead of turning them into NaN.
shelter_bucket = pd.cut(
    df['TimeInShelterDays'],
    bins=[-1, 30, 90, np.inf],
    labels=['0-30','31-90','90+'],
)

df = df.assign(
    AgeYears=age_years,
    IsPuppy=is_puppy,
    ShelterTimeBucket=shelter_bucket,
)
