In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Resumo geral dos *discretizadores* existentes no Scikit-learn

<table align="center">
	<thead>
		<tr>
			<th>Função</th>
			<th>Observação</th>
		</tr>
    </thead>
    <tbody>
		<tr>
            <td><a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html">pd.cut</a></td>
            <td>Cria intervalos de tamanhos iguais mas com frequencias de amostras desiguais em cada intervalo.</td>
        </tr>
		<tr>
            <td><a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html">pq.qcut</a></td>
            <td>Cria intervalos de tamanhos diferentes mas com frequencia de amostras iguais em cada intervalo.</td>
		</tr>
        <tr>
            <td><a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html">KBinsDiscretizer</a></td>
            <td>Cria intervalos utilizando a estratégia especificada por parâmetro e permite fazer a transformação inversa.</td>
		</tr>
	</tbody>
</table>

# Exemplo de discretizações

In [2]:
amostra = pd.DataFrame(np.random.normal(loc=50, scale=20, size=100), columns=['value'])

## pd.cut

In [5]:
pd.cut(amostra['value'], bins=3).value_counts().sort_index()

(0.803, 38.276]      19
(38.276, 75.637]     67
(75.637, 112.998]    14
Name: value, dtype: int64

In [6]:
_min = amostra['value'].min()
_max = amostra['value'].max()
pd.cut(amostra['value'], bins=[_min, 25, 75, _max]).value_counts().sort_index()

(0.915, 25.0]       9
(25.0, 75.0]       75
(75.0, 112.998]    15
Name: value, dtype: int64

## pd.qcut

In [7]:
pd.qcut(amostra['value'], q=3).value_counts().sort_index()

(0.914, 43.276]      34
(43.276, 63.609]     33
(63.609, 112.998]    33
Name: value, dtype: int64

## KBinsDiscretizer

In [8]:
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
amostra['value_bins'] = kbd.fit_transform(amostra[['value']])
amostra['value_bins'].value_counts()
print(kbd.bin_edges_)

[array([  0.91504173,  38.27590137,  75.63676102, 112.99762067])]


In [7]:
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
amostra['value_bins'] = kbd.fit_transform(amostra[['value']])
amostra['value_bins'].value_counts()

2.0    34
0.0    33
1.0    33
Name: value_bins, dtype: int64

In [8]:
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
amostra['value_bins'] = kbd.fit_transform(amostra[['value']])
amostra['value_bins'].value_counts()

1.0    49
0.0    34
2.0    17
Name: value_bins, dtype: int64

# Leitura complementar

1. [Binning Data with Pandas qcut and cut](https://pbpython.com/pandas-qcut-cut.html)
2. [Feature discretization](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization_classification.html)