 ## Data Mining and Machine Learning
 ## Feature Engineering: Feature Discretization(Binning)
 ### Edgar Acuna
 ####  Febrero 2020

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing 
from sklearn.preprocessing import KBinsDiscretizer
import Orange

In [2]:
pd.__version__

'0.24.2'

In [3]:
#Leyendo el dataset wine de la UCI
#wine_data=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
wine_data=pd.read_csv("c://PW-PR/winequality-red.csv", sep=";")
wine_data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
wine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


#### Eliminating the last column(classes)

In [5]:
df=wine_data.drop('quality',axis=1)

###  Equal-width Discretization using KBinsDiscretizer from Scikit-learn

#### Binning each column of the data in 10 values using equal width bins

In [6]:
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
df1=np.array(df['pH']).reshape(-1,1)
est.fit(df1)

Xt = est.transform(df1)
print(Xt)  

[[6.]
 [3.]
 [4.]
 ...
 [5.]
 [6.]
 [5.]]


In [7]:
help(KBinsDiscretizer)

Help on class KBinsDiscretizer in module sklearn.preprocessing._discretization:

class KBinsDiscretizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
 |  
 |  Bin continuous data into intervals.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_discretization>`.
 |  
 |  Parameters
 |  ----------
 |  n_bins : int or array-like, shape (n_features,) (default=5)
 |      The number of bins to produce. Raises ValueError if ``n_bins < 2``.
 |  
 |  encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
 |      Method used to encode the transformed result.
 |  
 |      onehot
 |          Encode the transformed result with one-hot encoding
 |          and return a sparse matrix. Ignored features are always
 |          stacked to the right.
 |      onehot-dense
 |          Encode the transformed result with one-hot encoding
 |          and return a dense array. Ignored features are always
 |   

In [8]:
#Converting the array into dataframe
tempodf=pd.DataFrame(Xt)
tempodf.columns=df.columns
tempodf.head()

ValueError: Length mismatch: Expected axis has 1 elements, new values have 11 elements

### Equal  width Discretization using my own program

In [None]:
wine_data.columns.tolist()

In [None]:
#Discretization by equal width\
#The output can be an either integer or an interval 
def disc_col_ew(df,str,k,out):
    df1=df[str]
    bins=np.linspace(df1.min(), df1.max(),k)
    if out=="num":
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True,labels=False)
    else:
        bins[0]=float('-inf')
        bins[k-1]=float('inf')
        df1=pd.cut(df1,bins=bins,include_lowest=True, right=True)  
    return df1

In [None]:
disc_col_ew(wine_data,'citric acid',10,out="num")

In [None]:
def disc_ew(df,k,out):
    name=df.columns.tolist()
    disc=pd.DataFrame()
    for name in df.columns.tolist():
        disc[name]=disc_col_ew(df,name,k,out)
    return disc  

In [None]:
disc_ew(wine_data,10,out="symb")

### Equal width discretization using Orange

In [None]:
veh = Orange.data.Table("c://PW-PR//vehicle.tab")
disc = Orange.preprocess.Discretize()
disc.method = Orange.preprocess.discretize.EqualWidth(n=10)
veh_disc = disc(veh)
print("Original dataset:")
for e in veh[:3]:
    print(e)

print("Discretized dataset:")
for e in veh_disc[:3]:
    print(e)

###  Equal Frequency Discretization using scikit-learn

In [None]:
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
est.fit(df)  
Xt = est.transform(df)
Xt  

In [None]:
#Converting the array into dataframe
tempodf=pd.DataFrame(Xt)
tempodf.columns=df.columns
tempodf.describe()

### Equal frequency Discretization using Orange 

In [None]:
disc = Orange.preprocess.Discretize()
disc.method = Orange.preprocess.discretize.EqualFreq(n=10)
veh1_disc = disc(veh)
print("Original dataset:")
for e in veh[:3]:
    print(e)

print("Discretized dataset:")
for e in veh1_disc[:3]:
    print(e)


### MDLP-Entropy Discretization using Orange

In [None]:
disc = Orange.preprocess.Discretize()
disc.method = Orange.preprocess.EntropyMDL(force=False)
veh1_disc = disc(veh)
print("Original dataset:")
for e in veh[:3]:
    print(e)

print("Discretized dataset:")
for e in veh1_disc[:3]:
    print(e)
    

In [None]:
x