# Analisis del DataSet Adult

## Exportación y Carga del Dataset Adult

In [7]:
import pandas as pd

# URL del dataset en el repositorio de UCI
url = "adult/adult.data"
columnas = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
            "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
            "hours-per-week", "native-country", "income"]
# Leer CSV y limpiar espacios / valores faltantes
df = pd.read_csv(url, names=columnas, na_values="?", skipinitialspace=True)

## Exploración visual de los datos

In [10]:
# Cantidad de filas y columnas del DataFrame
print("Forma del DataFrame:", df.shape)

# Identificar columnas numéricas y categóricas
disc_cols = df.select_dtypes(include=["int64"]).columns.tolist()
cont_cols = df.select_dtypes(include=['float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

print("Columnas cuantitativas discreta:", disc_cols)
print("Columnas cuantitativas continuas:", cont_cols)
print("Columnas categóricas:", cat_cols)

# Ver las primeras filas
df.head()

Forma del DataFrame: (32561, 15)
Columnas cuantitativas discreta: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Columnas cuantitativas continuas: []
Columnas categóricas: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Tratamiento de valores nulos

In [11]:
print("\nConteo de valores faltantes (NaN) por columna:")
df.isnull().sum()


Conteo de valores faltantes (NaN) por columna:


age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [13]:
# WorkClass es de tipo Categórico
totalWorkclass = 1836
# Ocuppation es de tipo Categórico
totalOccupation = 1843
# Native Country es de tipo Categórico
totalNativecountry = 583

# En el pero caso se obtienen
print(f"Valores faltantes: {totalWorkclass + totalOccupation + totalNativecountry}")

Valores faltantes: 4262


In [None]:
# Es más del 10% de los datos de entrenamiento, por lo que no se eliminarán filas
# Se usará en cambio la imputación de valores por moda, ya que son variables categóricas

# Columnas categóricas con valores faltantes
cols_con_nan = ['workclass', 'occupation', 'native-country']

print("--- Distribución de Valores Únicos (incluyendo NaN) ---")

for col in cols_con_nan:
    print(f"\nColumna: {col}")
    # Mostrar la frecuencia de cada valor, incluyendo NaN
    print(df[col].value_counts(dropna=False))

--- Distribución de Valores Únicos (incluyendo NaN) ---

Columna: workclass
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
NaN                  1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

Columna: occupation
occupation
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
NaN                  1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64

Columna: native-country
native-country
United-States                 29170
Mexico                          643
NaN                             583
Philippines                     198
Germany               

In [14]:
print("\n--- Moda Calculada para Imputación ---")
modas = {}

for col in cols_con_nan:
    # La moda es el valor más frecuente. [0] toma el primer valor si hay múltiples modas.
    moda_calculada = df[col].mode()[0]
    modas[col] = moda_calculada
    print(f"Moda de {col}: '{moda_calculada}'")


--- Moda Calculada para Imputación ---
Moda de workclass: 'Private'
Moda de occupation: 'Prof-specialty'
Moda de native-country: 'United-States'


In [15]:
print("\n--- Imputación y Verificación ---")

# Realizar la imputación usando el diccionario 'modas'
for col, moda_val in modas.items():
    # Remplazar los NaN de la columna con su moda
    df[col].fillna(moda_val, inplace=True)
    
# Verificar que ya no queden valores faltantes
print("\nConteo de valores faltantes (NaN) después de la imputación:")
print(df.isnull().sum())


--- Imputación y Verificación ---

Conteo de valores faltantes (NaN) después de la imputación:
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(moda_val, inplace=True)
