In [1]:
# objective : Encoding Numerical Features . Approaches: Binning and Binarization
# Binning (or discretization ) --> involves converting continuous data into discrete intervals or bins.
# Types: equal width or uniform binning, equal freq.  or quantile binning, kmeans binning, custom binning

In [8]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [9]:
df = pd.read_csv('csv files/titanic.csv', usecols=['Age', 'Fare', 'Survived'])
print(df)

     Survived   Age     Fare
0           0  22.0   7.2500
1           1  38.0  71.2833
2           1  26.0   7.9250
3           1  35.0  53.1000
4           0  35.0   8.0500
..        ...   ...      ...
886         0  27.0  13.0000
887         1  19.0  30.0000
888         0   NaN  23.4500
889         1  26.0  30.0000
890         0  32.0   7.7500

[891 rows x 3 columns]


In [10]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [11]:
df.dropna(inplace=True)

In [12]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [13]:
df.shape

(714, 3)

In [14]:
x= df.iloc[:, 1:3]
y = df.iloc[:,0]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=11)

In [16]:
kbin_age = KBinsDiscretizer (n_bins=10, encode='ordinal', strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='quantile')

In [17]:
trf = ColumnTransformer ( [
    ('first', kbin_age, [0]),
    ('second', kbin_fare, [1])
])

In [18]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [19]:
X_train_trf

array([[2., 0.],
       [0., 2.],
       [3., 0.],
       ...,
       [9., 0.],
       [6., 1.],
       [9., 6.]])

In [20]:
trf.named_transformers_['first'].n_bins_

array([10])

In [21]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 14.5 , 19.  , 22.  , 25.  , 28.5 , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [22]:
trf.named_transformers_['second'].n_bins_

array([7])

In [23]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.        ,   7.82322857,   8.6625    ,  13.        ,
               21.75357143,  30.13211429,  65.        , 512.3292    ])],
      dtype=object)

In [24]:
trf.named_transformers_['second'].n_bins_

array([7])

In [25]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_transformed':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_transformed':X_train_trf[:,0]
})

In [26]:
output

Unnamed: 0,age,age_transformed,fare,fare_transformed
408,21.0,2.0,7.7750,2.0
479,2.0,0.0,12.2875,0.0
296,23.5,3.0,7.2292,3.0
473,23.0,3.0,13.7917,3.0
448,5.0,0.0,19.2583,0.0
...,...,...,...,...
336,29.0,5.0,66.6000,5.0
421,21.0,2.0,7.7333,2.0
116,70.5,9.0,7.7500,9.0
103,33.0,6.0,8.6542,6.0


In [27]:
# Binarization: to encode continous value into binary values (either 0 or 1 ) 
# one of the usecase : Image processing in deep learning

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer

In [39]:
df = pd.read_csv('csv files/titanic.csv')[['Age','Fare','SibSp','Parch','Survived']]
df.dropna(inplace=True)
df

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.2500,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.9250,0,0,1
3,35.0,53.1000,1,0,1
4,35.0,8.0500,0,0,0
...,...,...,...,...,...
885,39.0,29.1250,0,5,0
886,27.0,13.0000,0,0,0
887,19.0,30.0000,0,0,1
889,26.0,30.0000,0,0,1


In [41]:
df['family'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


In [42]:
df.drop(columns=['SibSp','Parch'],inplace=True)
df.head()

Unnamed: 0,Age,Fare,Survived,family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [43]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [44]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [45]:
X_train.head()

Unnamed: 0,Age,Fare,family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


In [46]:
# Now lets apply binarization on column Family

In [50]:
trf = ColumnTransformer([
    ('bin',Binarizer(copy=False),['family'])
],remainder='passthrough')

In [51]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [55]:
print(X_train)
print(pd.DataFrame(X_train_trf,columns=['family','Age','Fare']))

      Age      Fare  family
328  31.0   20.5250       2
73   26.0   14.4542       1
253  30.0   16.1000       1
719  33.0    7.7750       0
666  25.0   13.0000       0
..    ...       ...     ...
92   46.0   61.1750       1
134  25.0   13.0000       0
337  41.0  134.5000       0
548  33.0   20.5250       2
130  33.0    7.8958       0

[571 rows x 3 columns]
     family   Age      Fare
0       1.0  31.0   20.5250
1       1.0  26.0   14.4542
2       1.0  30.0   16.1000
3       0.0  33.0    7.7750
4       0.0  25.0   13.0000
..      ...   ...       ...
566     1.0  46.0   61.1750
567     0.0  25.0   13.0000
568     0.0  41.0  134.5000
569     1.0  33.0   20.5250
570     0.0  33.0    7.8958

[571 rows x 3 columns]


In [56]:
# this is binarization . Various models can be trained with this data 