# Preprocesamiento de datos
Usaremos sci-kit learn para hacer preprocesamiento de datos, lo que requiere el modelo es un tabla numérica y scikit nos permite hacerla.

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
#1 Cargar los datos

Data = pd.read_csv("../Datos/data_adults.csv")

In [26]:
Data.shape

(48842, 15)

In [27]:
# Iniciamos usando SimpleImputer para rellenar valores nulos

imputer_nulls_cat = SimpleImputer(strategy="constant", fill_value = "?")
imputer_nulls_num = SimpleImputer(strategy="mean")



In [28]:
#pd.DataFrame(imputer_nulls_cat.fit_transform(Data[['workclass']])).value_counts()

##Ahora definimos tres tipos de variable: categóricas, categóricas ordinales,  "normales",  "Rango"
Data = Data.drop("fnlwgt", axis=1)
Data = Data.drop("education-num", axis=1)

X = Data.drop("income", axis = 1)
y = Data['income']
X.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [29]:
X['education'].value_counts()

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [48]:

## Definimos las transformaciones

standar_scaler=StandardScaler()
rango=MinMaxScaler((-1,1))

one_hot=OneHotEncoder(sparse_output = False)
# Define the desired order of categories
categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Prof-school',
    'Bachelors', 'Masters', 'Doctorate'  ]

# Create an OrdinalEncoder instance
ordinalencoder = OrdinalEncoder(categories=[categories])

In [49]:

### Primero creamos cuatro pipeline [IMPUTER NULL] -> [ENCODER o SCALER]

trans_num_standard =Pipeline([('imputer_null',imputer_nulls_num),
                               ('standard_scaler',standar_scaler)])
trans_num_rango=Pipeline([('imputer_null',imputer_nulls_num),
                               ('rango',rango)])

trans_cat_OHE=Pipeline([('imputer_null',imputer_nulls_cat),
                               ('OHE',one_hot)])
trans_cat_ordinal=Pipeline([('imputer_null',imputer_nulls_cat),
                               ('Ordinal',ordinalencoder)])


### Definimos las columnas a tratar

col_num_standard =['capital-gain','capital-loss']
col_num_rango = ['age','hours-per-week']

col_cat_OHE=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
col_cat_ordinal=['education']


## Finalmente construimos columns transformers

Preprocesamiento=ColumnTransformer(
transformers=[
    ('Standar',trans_num_standard,col_num_standard),
    ('Rango',trans_num_rango,col_num_rango),
    ('OHE',trans_cat_OHE,col_cat_OHE),
    ('Ordinal',trans_cat_ordinal,col_cat_ordinal),
]
)

In [50]:
Preprocesamiento

In [51]:
result=Preprocesamiento.fit_transform(X)

In [52]:
result=Preprocesamiento.fit_transform(X)

In [53]:
result

array([[ 0.14693247, -0.2171271 , -0.39726027, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.09589041, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        ,  8.        ],
       ...,
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        , 13.        ],
       [ 0.58722034, -0.2171271 , -0.26027397, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.50684932, ...,  0.        ,
         0.        , 13.        ]])