In [1]:
# objective : Encoding Numerical Features . Approaches: Binning and Binarization
# Binning (or discretization ) --> involves converting continuous data into discrete intervals or bins.
# Types: equal width or uniform binning, equal freq.  or quantile binning, kmeans binning, custom binning

In [23]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [57]:
df = pd.read_csv('csv files/titanic.csv', usecols=['Age', 'Fare', 'Survived'])
print(df)

     Survived   Age     Fare
0           0  22.0   7.2500
1           1  38.0  71.2833
2           1  26.0   7.9250
3           1  35.0  53.1000
4           0  35.0   8.0500
..        ...   ...      ...
886         0  27.0  13.0000
887         1  19.0  30.0000
888         0   NaN  23.4500
889         1  26.0  30.0000
890         0  32.0   7.7500

[891 rows x 3 columns]


In [58]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [59]:
df.dropna(inplace=True)

In [60]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [61]:
df.shape

(714, 3)

In [64]:
x= df.iloc[:, 1:3]
y = df.iloc[:,0]

In [65]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=11)

In [78]:
kbin_age = KBinsDiscretizer (n_bins=10, encode='ordinal', strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='quantile')

In [79]:
trf = ColumnTransformer ( [
    ('first', kbin_age, [0]),
    ('second', kbin_fare, [1])
])

In [80]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [81]:
X_train_trf

array([[2., 0.],
       [0., 0.],
       [2., 0.],
       ...,
       [8., 0.],
       [4., 0.],
       [6., 2.]])

In [82]:
trf.named_transformers_['first'].n_bins_

array([10])

In [83]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42      ,  8.72898352, 17.56616999, 24.79715449, 32.30919997,
              40.25908417, 48.29846226, 56.29171563, 64.0923913 , 72.375     ,
              80.        ])                                                   ],
      dtype=object)

In [84]:
trf.named_transformers_['second'].n_bins_

array([7])

In [85]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.        ,  19.54481611,  47.24241275,  83.66529848,
              125.69006166, 193.98737235, 375.40765417, 512.3292    ])],
      dtype=object)

In [86]:
trf.named_transformers_['second'].n_bins_

array([7])

In [87]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_transformed':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_transformed':X_train_trf[:,0]
})

In [88]:
output

Unnamed: 0,age,age_transformed,fare,fare_transformed
408,21.0,2.0,7.7750,2.0
479,2.0,0.0,12.2875,0.0
296,23.5,2.0,7.2292,2.0
473,23.0,2.0,13.7917,2.0
448,5.0,0.0,19.2583,0.0
...,...,...,...,...
336,29.0,3.0,66.6000,3.0
421,21.0,2.0,7.7333,2.0
116,70.5,8.0,7.7500,8.0
103,33.0,4.0,8.6542,4.0
