In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Fetch the breast-cancer dataset object via scikit-learn's loader.
data = load_breast_cancer()

# Sliced handles on the feature matrix and on the target vector.
feature, target = data.data[:], data.target[:]

# Names of the target classes, as provided by the dataset object.
target_labels = data.target_names

In [None]:
# One small histogram per feature column, laid out on a 5 x 6 grid.
fig = plt.figure(figsize=(12, 7))

for idx in range(30):
    # Subplot positions are 1-based, feature columns are 0-based.
    ax = fig.add_subplot(5, 6, idx + 1)
    ax.hist(feature[:, idx])
    # Hide the tick marks on both axes of this panel.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('col =' + str(idx))

#### Adaptive binning:
Bin the continuous numerical data in each column into quantiles and assign each row a categorical variable.

In [None]:
# Wrap the raw feature matrix in a pandas DataFrame (no column names are
# supplied, so columns are addressed by integer position, e.g. df[0]).
df = pd.DataFrame(feature)

First, look at a single feature column.

In [None]:
# Probabilities at which the column is cut: minimum, quartiles and maximum.
quantile_list = [0, 0.25, 0.5, 0.75, 1.0]

# Values of feature column 0 at each of those probabilities.
quantiles = df[0].quantile(q=quantile_list)
quantiles

In [None]:
# Histogram of feature column 0 with its quantile values overlaid.
fig, ax = plt.subplots()

# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
df[0].hist(bins=20, alpha=0.5, grid=False, ax=ax)

# One red vertical line per quantile value.
qvl = None
for quantile in quantiles:
    qvl = ax.axvline(quantile, color='r')

# Every line is drawn identically, so one legend entry covers them all.
# Build the legend once, after the loop, rather than re-creating it on
# every iteration; skip it entirely when no line was drawn.
if qvl is not None:
    ax.legend([qvl], ['Quantiles'], fontsize=10)

In [None]:
# Names for the four bins delimited by quantile_list.
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']

# Names of the two derived columns for feature column 0.
range_col = '{}_quantile_range'.format(0)
label_col = '{}_quantile_label'.format(0)

# Bin column 0 twice: once keeping qcut's default bin values, once using
# the names defined above.
df[range_col] = pd.qcut(df[0], q=quantile_list)
df[label_col] = pd.qcut(df[0], q=quantile_list, labels=quantile_labels)

# Show five random rows: raw value next to both binned representations.
df[[0, range_col, label_col]].sample(5)

Generate {column #}_quantile_range and {column #}_quantile_label for each feature column.
Write only the quantile labels to a new dataframe for downstream use.

In [None]:
# Names for the four bins delimited by quantile_list.
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']

# df1 collects only the "<i>_quantile_label" columns; df receives both the
# range and the label column for every feature.
df1 = pd.DataFrame()
for i in range(30):
    range_col = '{}_quantile_range'.format(i)
    label_col = '{}_quantile_label'.format(i)

    # Same binning twice: default bin values, then the names defined above.
    df[range_col] = pd.qcut(df[i], q=quantile_list)
    df[label_col] = pd.qcut(df[i], q=quantile_list, labels=quantile_labels)

    df1[label_col] = df[label_col]

df1.head()

In [None]:
# Remove every column of df whose name contains 'range', i.e. the
# "<i>_quantile_range" columns.
cols = [name for name in df.columns if 'range' in str(name)]
df = df.drop(columns=cols)

Create a new dataframe that contains dummy variables for each column (one-hot encoding of the categorical quantile values).

In [None]:
# One-hot encode every "<i>_quantile_label" column of df1 and gather the
# resulting dummy columns in df_ohe.
encoded_frames = []

for i in np.arange(0, 30):
    source_col = '{}_quantile_label'.format(i)
    codes_col = '{}_labels'.format(i)

    # Integer-encode the quantile names; the codes are stored back on df1.
    le = LabelEncoder()
    labels = le.fit_transform(df1[source_col])
    df1[codes_col] = labels

    # One-hot encode the integer codes just stored in df1 (not df).
    ohe = OneHotEncoder()
    feature_arr = ohe.fit_transform(df1[[codes_col]]).toarray()

    # Name each dummy column "<quantile name>_<feature index>". Building the
    # strings explicitly avoids relying on numpy array + str broadcasting.
    feature_labels = ['{}_{}'.format(name, i) for name in le.classes_]
    features = pd.DataFrame(feature_arr, columns=feature_labels)
    encoded_frames.append(features)

# Concatenate once, side by side, instead of growing df_ohe inside the loop
# (repeated concat re-copies all previously collected columns every time).
df_ohe = pd.concat(encoded_frames, axis=1)


In [None]:
# Preview ten randomly chosen rows of the one-hot encoded frame.
df_ohe.sample(n=10)