# Imputers


In [None]:
import pandas as pd
df = pd.DataFrame({
    'idade': [14,25,23,31,31,None,18,20,15,17,60]
})
df

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
from sklearn.impute import SimpleImputer
help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(_BaseImputer)
 |  SimpleImputer(*, missing_values=nan, strategy='mean', fill_value=None, verbose='deprecated', copy=True, add_indicator=False, keep_empty_features=False)
 |  
 |  Univariate imputer for completing missing values with simple strategies.
 |  
 |  Replace missing values using a descriptive statistic (e.g. mean, median, or
 |  most frequent) along each column, or using a constant value.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  .. versionadded:: 0.20
 |     `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`
 |     estimator which is now removed.
 |  
 |  Parameters
 |  ----------
 |  missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed. For pandas' dataframes with
 |      nullable integer dtypes with missing values, `missing_v

In [None]:
constant_imputer = SimpleImputer(strategy='constant', fill_value=-1)
constant_imputer

In [None]:
df

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
array = df['idade'].to_numpy().reshape(-1,1)
array

array([[14.],
       [25.],
       [23.],
       [31.],
       [31.],
       [nan],
       [18.],
       [20.],
       [15.],
       [17.],
       [60.]])

In [None]:
constant_imputer.fit(array) #treinamento

In [None]:
df['constant'] = constant_imputer.transform(array)
df

Unnamed: 0,idade,constant
0,14.0,14.0
1,25.0,25.0
2,23.0,23.0
3,31.0,31.0
4,31.0,31.0
5,,-1.0
6,18.0,18.0
7,20.0,20.0
8,15.0,15.0
9,17.0,17.0


In [None]:
df_pandas = pd.DataFrame({
    'idade': df['idade'].copy()
})
constant = -1
df_pandas[df_pandas.isnull()] = constant
df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,-1.0
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer

In [None]:
mean_imputer.fit(array) #treinamento

In [None]:
mean_imputer.transform(array)

array([[14. ],
       [25. ],
       [23. ],
       [31. ],
       [31. ],
       [25.4],
       [18. ],
       [20. ],
       [15. ],
       [17. ],
       [60. ]])

In [None]:
df['mean'] = mean_imputer.transform(array)
df

Unnamed: 0,idade,constant,mean
0,14.0,14.0,14.0
1,25.0,25.0,25.0
2,23.0,23.0,23.0
3,31.0,31.0,31.0
4,31.0,31.0,31.0
5,,-1.0,25.4
6,18.0,18.0,18.0
7,20.0,20.0,20.0
8,15.0,15.0,15.0
9,17.0,17.0,17.0


In [None]:
df_pandas['idade'].mean()

23.0

In [None]:
df_pandas = pd.DataFrame({
    'idade': df['idade'].copy()
})
media = df_pandas['idade'].mean()
df_pandas[df_pandas.isnull()] = media
df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,25.4
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
median_imputer = SimpleImputer(strategy='median')
median_imputer

In [None]:
median_imputer.fit(array) #treinamento

In [None]:
median_imputer.transform(array)

array([[14. ],
       [25. ],
       [23. ],
       [31. ],
       [31. ],
       [21.5],
       [18. ],
       [20. ],
       [15. ],
       [17. ],
       [60. ]])

In [None]:
df['median'] = median_imputer.transform(array)
df

Unnamed: 0,idade,constant,mean,median
0,14.0,14.0,14.0,14.0
1,25.0,25.0,25.0,25.0
2,23.0,23.0,23.0,23.0
3,31.0,31.0,31.0,31.0
4,31.0,31.0,31.0,31.0
5,,-1.0,25.4,21.5
6,18.0,18.0,18.0,18.0
7,20.0,20.0,20.0,20.0
8,15.0,15.0,15.0,15.0
9,17.0,17.0,17.0,17.0


In [None]:
df_pandas = pd.DataFrame({
    'idade': df['idade'].copy()
})
mediana = df_pandas['idade'].median()
df_pandas[df_pandas.isnull()] = mediana
df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,21.5
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
mode_imputer = SimpleImputer(strategy='most_frequent')
mode_imputer

In [None]:
mode_imputer.fit(array) #treinamento

In [None]:
mode_imputer.transform(array)

array([[14.],
       [25.],
       [23.],
       [31.],
       [31.],
       [31.],
       [18.],
       [20.],
       [15.],
       [17.],
       [60.]])

In [None]:
array

array([[14.],
       [25.],
       [23.],
       [31.],
       [31.],
       [nan],
       [18.],
       [20.],
       [15.],
       [17.],
       [60.]])

In [None]:
df['mode'] = mode_imputer.transform(array)
df

Unnamed: 0,idade,constant,mean,median,mode
0,14.0,14.0,14.0,14.0,14.0
1,25.0,25.0,25.0,25.0,25.0
2,23.0,23.0,23.0,23.0,23.0
3,31.0,31.0,31.0,31.0,31.0
4,31.0,31.0,31.0,31.0,31.0
5,,-1.0,25.4,21.5,31.0
6,18.0,18.0,18.0,18.0,18.0
7,20.0,20.0,20.0,20.0,20.0
8,15.0,15.0,15.0,15.0,15.0
9,17.0,17.0,17.0,17.0,17.0


In [None]:
df_pandas = pd.DataFrame({
    'idade': df['idade'].copy()
})
mode = df_pandas['idade'].mode(0)
df_pandas[df_pandas.isnull()] = constant
df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,31.0
5,-1.0
6,18.0
7,20.0
8,15.0
9,17.0


In [None]:
mode[0]

31.0

# Encoder



In [None]:
import pandas as pd
dados = [
    'professor',
    'medico',
    'advogado',
    'medico',
    'professor',
    'policial',
    'engenheiro',
    'engenheiro',
    'policial',
    'advogado',
]

df = pd.DataFrame({
    'profissao': dados
})
df

Unnamed: 0,profissao
0,professor
1,medico
2,advogado
3,medico
4,professor
5,policial
6,engenheiro
7,engenheiro
8,policial
9,advogado


In [None]:
#!pip install category_encoders

In [None]:
from category_encoders import OrdinalEncoder
help(OrdinalEncoder)

Help on class OrdinalEncoder in module category_encoders.ordinal:

class OrdinalEncoder(category_encoders.utils.BaseEncoder, category_encoders.utils.UnsupervisedTransformerMixin)
 |  OrdinalEncoder(verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')
 |  
 |  Encodes categorical features as ordinal, in one ordered feature.
 |  
 |  Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed
 |  in; in this case, we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes
 |  are assumed to have no true order and integers are selected at random.
 |  
 |  Parameters
 |  ----------
 |  
 |  verbose: int
 |      integer indicating verbosity of the output. 0 for none.
 |  cols: list
 |      a list of columns to encode, if None, all string columns will be encoded.
 |  drop_invariant: bool
 |      boolean for whether or not to dro

In [None]:
ordinal = OrdinalEncoder(cols='profissao')
ordinal

In [None]:
ordinal.fit(df)

In [None]:
import numpy as np
df['nulos'] = np.nan
df

Unnamed: 0,profissao,nulos
0,professor,
1,medico,
2,advogado,
3,medico,
4,professor,
5,policial,
6,engenheiro,
7,engenheiro,
8,policial,
9,advogado,


In [None]:
df = ordinal.transform(df)
df

ValueError: ignored

In [None]:
df_pandas = pd.DataFrame({
    'profissao': dados.copy(),
    'nulos': np.nan
})
mapa = {
    'professor':1,
    'medico':2,
    'advogado':3,
    'policial':4,
    'engenheiro':5
}
df_pandas['profissao'] = df_pandas['profissao'].map(mapa)
df_pandas

Unnamed: 0,profissao,nulos
0,1,
1,2,
2,3,
3,2,
4,1,
5,4,
6,5,
7,5,
8,4,
9,3,


In [None]:
from category_encoders import OneHotEncoder
help(OneHotEncoder)

Help on class OneHotEncoder in module category_encoders.one_hot:

class OneHotEncoder(category_encoders.utils.BaseEncoder, category_encoders.utils.UnsupervisedTransformerMixin)
 |  OneHotEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', use_cat_names=False)
 |  
 |  Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.
 |  
 |  Parameters
 |  ----------
 |  
 |  verbose: int
 |      integer indicating verbosity of the output. 0 for none.
 |  cols: list
 |      a list of columns to encode, if None, all string columns will be encoded.
 |  drop_invariant: bool
 |      boolean for whether or not to drop columns with 0 variance.
 |  return_df: bool
 |      boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
 |  use_cat_names: bool
 |      if True, category values will be included in the encoded column names. Since this can result in

In [None]:
df = pd.DataFrame({
    'profissao': dados,
    'nulos': np.nan
})
df

Unnamed: 0,profissao,nulos
0,professor,
1,medico,
2,advogado,
3,medico,
4,professor,
5,policial,
6,engenheiro,
7,engenheiro,
8,policial,
9,advogado,


In [None]:
#one_hot = OneHotEncoder(cols=['profissao'])
one_hot = OneHotEncoder(cols=['profissao'], use_cat_names=True)
one_hot

In [None]:
one_hot.fit(df) #treinamento

In [None]:
one_hot.transform(df)

Unnamed: 0,profissao_professor,profissao_medico,profissao_advogado,profissao_policial,profissao_engenheiro,nulos
0,1,0,0,0,0,
1,0,1,0,0,0,
2,0,0,1,0,0,
3,0,1,0,0,0,
4,1,0,0,0,0,
5,0,0,0,1,0,
6,0,0,0,0,1,
7,0,0,0,0,1,
8,0,0,0,1,0,
9,0,0,1,0,0,


In [None]:
df['profissao'].to_frame()

Unnamed: 0,profissao
0,professor
1,medico
2,advogado
3,medico
4,professor
5,policial
6,engenheiro
7,engenheiro
8,policial
9,advogado


In [None]:
one_hot.transform(df).drop('nulos', axis=1)

Unnamed: 0,profissao_professor,profissao_medico,profissao_advogado,profissao_policial,profissao_engenheiro
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,1,0,0,0
4,1,0,0,0,0
5,0,0,0,1,0
6,0,0,0,0,1
7,0,0,0,0,1
8,0,0,0,1,0
9,0,0,1,0,0


In [None]:
pd.concat([
    df['profissao'].to_frame(),
    one_hot.transform(df).drop('nulos', axis=1)
], axis=1)

Unnamed: 0,profissao,profissao_professor,profissao_medico,profissao_advogado,profissao_policial,profissao_engenheiro
0,professor,1,0,0,0,0
1,medico,0,1,0,0,0
2,advogado,0,0,1,0,0
3,medico,0,1,0,0,0
4,professor,1,0,0,0,0
5,policial,0,0,0,1,0
6,engenheiro,0,0,0,0,1
7,engenheiro,0,0,0,0,1
8,policial,0,0,0,1,0
9,advogado,0,0,1,0,0


In [None]:
help(pd.get_dummies)

Help on function get_dummies in module pandas.core.reshape.encoding:

get_dummies(data, prefix=None, prefix_sep='_', dummy_na: 'bool' = False, columns=None, sparse: 'bool' = False, drop_first: 'bool' = False, dtype: 'Dtype | None' = None) -> 'DataFrame'
    Convert categorical variable into dummy/indicator variables.
    
    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
 

In [None]:
df_pandas = pd.DataFrame({
    'profissao': dados
})
df_pandas

Unnamed: 0,profissao
0,professor
1,medico
2,advogado
3,medico
4,professor
5,policial
6,engenheiro
7,engenheiro
8,policial
9,advogado


In [None]:
pd.concat([
    df_pandas,
    pd.get_dummies(df_pandas, columns=['profissao'])
], axis=1)

Unnamed: 0,profissao,profissao_advogado,profissao_engenheiro,profissao_medico,profissao_policial,profissao_professor
0,professor,0,0,0,0,1
1,medico,0,0,1,0,0
2,advogado,1,0,0,0,0
3,medico,0,0,1,0,0
4,professor,0,0,0,0,1
5,policial,0,0,0,1,0
6,engenheiro,0,1,0,0,0
7,engenheiro,0,1,0,0,0
8,policial,0,0,0,1,0
9,advogado,1,0,0,0,0


In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from category_encoders import BinaryEncoder
help(BinaryEncoder)

Help on class BinaryEncoder in module category_encoders.binary:

class BinaryEncoder(category_encoders.basen.BaseNEncoder)
 |  BinaryEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, *, base=2, handle_unknown='value', handle_missing='value')
 |  
 |  Binary encoding for categorical variables, similar to onehot, but stores categories as binary bitstrings.
 |  
 |  Parameters
 |  ----------
 |  
 |  verbose: int
 |      integer indicating verbosity of the output. 0 for none.
 |  cols: list
 |      a list of columns to encode, if None, all string columns will be encoded.
 |  drop_invariant: bool
 |      boolean for whether or not to drop columns with 0 variance.
 |  return_df: bool
 |      boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
 |  handle_unknown: str
 |      an extra column will be added in if the transform matrix has unknown categories.  This can cause
 |      unexpected changes in dimension i

In [None]:
df = pd.DataFrame({
    'profissao': dados
})
df

Unnamed: 0,profissao
0,professor
1,medico
2,advogado
3,medico
4,professor
5,policial
6,engenheiro
7,engenheiro
8,policial
9,advogado


In [None]:
binary = BinaryEncoder(cols=['profissao'])
binary

In [None]:
binary.fit(df) #treinamento

In [None]:
binary.transform(df)

Unnamed: 0,profissao_0,profissao_1,profissao_2
0,0,0,1
1,0,1,0
2,0,1,1
3,0,1,0
4,0,0,1
5,1,0,0
6,1,0,1
7,1,0,1
8,1,0,0
9,0,1,1


In [None]:
pd.concat([
    df,
    binary.transform(df)
], axis=1)

Unnamed: 0,profissao,profissao_0,profissao_1,profissao_2
0,professor,0,0,1
1,medico,0,1,0
2,advogado,0,1,1
3,medico,0,1,0
4,professor,0,0,1
5,policial,1,0,0
6,engenheiro,1,0,1
7,engenheiro,1,0,1
8,policial,1,0,0
9,advogado,0,1,1


In [None]:
df_binary = binary.transform(df)
df_binary

Unnamed: 0,profissao_0,profissao_1,profissao_2
0,0,0,1
1,0,1,0
2,0,1,1
3,0,1,0
4,0,0,1
5,1,0,0
6,1,0,1
7,1,0,1
8,1,0,0
9,0,1,1


In [None]:
binary.inverse_transform(df_binary)

Unnamed: 0,profissao
0,professor
1,medico
2,advogado
3,medico
4,professor
5,policial
6,engenheiro
7,engenheiro
8,policial
9,advogado


In [None]:
pd.concat([
    df,
    binary.transform(df),
    binary.inverse_transform(df_binary)
], axis=1)

Unnamed: 0,profissao,profissao_0,profissao_1,profissao_2,profissao.1
0,professor,0,0,1,professor
1,medico,0,1,0,medico
2,advogado,0,1,1,advogado
3,medico,0,1,0,medico
4,professor,0,0,1,professor
5,policial,1,0,0,policial
6,engenheiro,1,0,1,engenheiro
7,engenheiro,1,0,1,engenheiro
8,policial,1,0,0,policial
9,advogado,0,1,1,advogado
