## Get dummies vs. label encoding - dessert prediction case study

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
try:
    # Current location of the plotting helpers.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Legacy location, only present in old pandas releases; kept as a
    # fallback so the notebook still imports in the environment it was
    # originally written for.
    from pandas.tools.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
import warnings
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the raw dessert data; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('dessert.csv')

In [3]:
# Preview the first three rows of the raw data.
df.head(3)

Unnamed: 0,id,day.of.week,num.of.guests,hour,table,dessert
0,1,Monday,2,evening,13,True
1,2,Saturday,4,night,19,True
2,3,Saturday,4,after-noon,12,False


The dessert dataset contains restaurant data with basic information about each table of customers and a column showing whether a dessert was ordered or not.
The purposes of this notebook are:
1. to predict whether a table (a row in the data) will order a dessert.
2. to show the differences between label encoding and dummy variables.

In [4]:
# Column overview of the raw frame (dtypes and non-null counts).
df.info() # 6 columns: 3 int, 2 object and 1 bool

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
id               1000 non-null int64
day.of.week      1000 non-null object
num.of.guests    1000 non-null int64
hour             1000 non-null object
table            1000 non-null int64
dessert          1000 non-null bool
dtypes: bool(1), int64(3), object(2)
memory usage: 40.1+ KB


In [5]:
# Collect the names of the categorical (object-dtype) columns.
# select_dtypes replaces the earlier Series.compress call, which was
# deprecated in pandas 0.24 and removed in pandas 1.0.
is_cat_cols = df.select_dtypes(include=['object']).columns.tolist()
# Optional: 'table' is stored as an integer; uncomment to add it to the list.
#is_cat_cols.append('table')
print('\x1b[0;36;40m' + 'categorical columns :' + '\x1b[0m' , is_cat_cols)


[0;36;40mcategorical columns :[0m ['day.of.week', 'hour']


In [6]:
# Two independent working copies of the raw frame, one per encoding
# technique, so each can be transformed without touching `df` or the other.
dessert_label_enc, dessert_dummies = df.copy(), df.copy()

## Labelencoding

In [7]:
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder

# Replace every categorical column with integer codes. fit_transform is
# called once per column by DataFrame.apply, so each column gets its own
# mapping.
dessert_label_enc[is_cat_cols] = dessert_label_enc[is_cat_cols].apply(LabelEncoder().fit_transform)

# info() prints its report itself and returns None, so it is called on its
# own line instead of inside print() (which used to emit a stray "None").
dessert_label_enc.info()
print(dessert_label_enc.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
id               1000 non-null int64
day.of.week      1000 non-null int64
num.of.guests    1000 non-null int64
hour             1000 non-null int64
table            1000 non-null int64
dessert          1000 non-null bool
dtypes: bool(1), int64(5)
memory usage: 40.1 KB
None    id  day.of.week  num.of.guests  hour  table  dessert
0   1            1              2     1     13     True
1   2            2              4     2     19     True
2   3            2              4     0     12    False
3   4            4              3     2      5     True
4   5            1              7     3     10     True


In [8]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
dessert_label_enc_train, dessert_label_enc_test = split(
    dessert_label_enc, train_size=0.7, random_state=12345)

# Features: every column except the unique `id` and the `dessert` target.
# Columns are picked by name (not position) so the selection keeps working
# if columns are added to or reordered in the frame.
X = dessert_label_enc_train[['day.of.week', 'num.of.guests', 'hour', 'table']]
# Target as a 1-D Series: a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning, which the global warnings filter was hiding.
y = dessert_label_enc_train['dessert']

In [9]:
# Logistic regression fitted on the label-encoded training features.
dessert_label_enc_clf = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=0,
).fit(X, y)

In [10]:
# Store the model's predictions on the training rows next to the true labels,
# then show every 100th row as a spot check.
dessert_label_enc_train['prediction'] = dessert_label_enc_clf.predict(X)
dessert_label_enc_train.iloc[::100]

Unnamed: 0,id,day.of.week,num.of.guests,hour,table,dessert,prediction
860,861,0,4,1,9,False,False
191,192,6,5,1,4,True,True
715,716,2,2,1,7,False,False
471,472,5,3,2,8,True,True
98,99,2,4,0,4,False,False
571,572,0,2,3,16,False,True
227,228,2,2,2,3,True,True


In [11]:
## Validation

In [12]:
# Train confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true=dessert_label_enc_train['dessert'],
                      y_pred=dessert_label_enc_train['prediction'])
print(pd.DataFrame(cm,
             index=dessert_label_enc_clf.classes_,
             columns=dessert_label_enc_clf.classes_))

# Test confusion matrix.
# 'prediction' is dropped together with the target and id so that re-running
# this cell does not feed the previous run's predictions to the model as an
# extra feature; errors='ignore' covers the first run, when the column does
# not exist yet.
P = dessert_label_enc_test.drop(['dessert', 'id', 'prediction'],
                                axis=1, errors='ignore')
dessert_label_enc_test['prediction'] = dessert_label_enc_clf.predict(P)

cm = confusion_matrix(y_true=dessert_label_enc_test['dessert'],
                      y_pred=dessert_label_enc_test['prediction'])
print(pd.DataFrame(cm,
             index=dessert_label_enc_clf.classes_,
             columns=dessert_label_enc_clf.classes_))



       False  True 
False    165    125
True      85    325
       False  True 
False     74     63
True      32    131


## Get Dummies
`dessert_dummies` - contains the "clean" data (an untouched copy of the original `df`)

`is_cat_cols` - contains the names of the categorical columns

In [14]:
# One-hot encode the categorical columns.
# The columns are read from the raw `df` (which `dessert_dummies` was copied
# from) rather than from `dessert_dummies`, because the next cell overwrites
# `dessert_dummies` without these columns and a re-run of this cell would
# then fail with a KeyError.
dummy_cols = pd.get_dummies(df[is_cat_cols])


In [15]:
# Assemble the modelling frame: numeric features, then the dummy columns,
# then the target as the last column. The former `.ix[::]` selected every
# row (a no-op) and `.ix` itself was removed in pandas 1.0, so the dummy
# frame is joined directly.
dessert_dummies = (
    dessert_dummies[['num.of.guests', 'table']]
    .join(dummy_cols)
    .join(dessert_dummies['dessert'])
)

In [245]:
# Preview the one-hot encoded frame (numeric features, dummy columns, target).
dessert_dummies.head()

Unnamed: 0,num.of.guests,table,day.of.week_Friday,day.of.week_Monday,day.of.week_Saturday,day.of.week_Sunday,day.of.week_Thursday,day.of.week_Tuesday,day.of.week_Wednesday,hour_after-noon,hour_evening,hour_night,hour_noon,dessert
0,2,13,0,1,0,0,0,0,0,0,1,0,0,True
1,4,19,0,0,1,0,0,0,0,0,0,1,0,True
2,4,12,0,0,1,0,0,0,0,1,0,0,0,False
3,3,5,0,0,0,0,1,0,0,0,0,1,0,True
4,7,10,0,1,0,0,0,0,0,0,0,0,1,True


In [30]:
# 70/30 train/test split with the same seed as the label-encoding run.
dessert_dummies_train, dessert_dummies_test = split(
    dessert_dummies, train_size=0.7, random_state=12345)

# Features: everything except the target column.
X = dessert_dummies_train.drop('dessert', axis=1)
# Target selected by name and kept as a 1-D Series: a one-column DataFrame
# makes scikit-learn emit a DataConversionWarning, and a positional "last
# column" lookup breaks as soon as another column is appended.
y = dessert_dummies_train['dessert']

In [33]:
# Logistic regression fitted on the one-hot encoded training features.
dessert_dummies_clf = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=0,
).fit(X, y)

In [34]:
# Store the model's predictions on the training rows next to the true labels,
# then show every 100th row as a spot check.
dessert_dummies_train['prediction'] = dessert_dummies_clf.predict(X)
dessert_dummies_train.iloc[::100]

Unnamed: 0,num.of.guests,table,day.of.week_Friday,day.of.week_Monday,day.of.week_Saturday,day.of.week_Sunday,day.of.week_Thursday,day.of.week_Tuesday,day.of.week_Wednesday,hour_after-noon,hour_evening,hour_night,hour_noon,dessert,prediction
860,4,9,1,0,0,0,0,0,0,0,1,0,0,False,False
191,5,4,0,0,0,0,0,0,1,0,1,0,0,True,True
715,2,7,0,0,1,0,0,0,0,0,1,0,0,False,False
471,3,8,0,0,0,0,0,1,0,0,0,1,0,True,True
98,4,4,0,0,1,0,0,0,0,1,0,0,0,False,False
571,2,16,1,0,0,0,0,0,0,0,0,0,1,False,False
227,2,3,0,0,1,0,0,0,0,0,0,1,0,True,True


In [36]:
# Train confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true=dessert_dummies_train['dessert'],
                      y_pred=dessert_dummies_train['prediction'])
print(pd.DataFrame(cm,
             index=dessert_dummies_clf.classes_,
             columns=dessert_dummies_clf.classes_))

# Test confusion matrix.
# 'prediction' is dropped together with the target so that re-running this
# cell does not feed the previous run's predictions to the model as an extra
# feature; errors='ignore' covers the first run, when the column does not
# exist yet.
P = dessert_dummies_test.drop(['dessert', 'prediction'],
                              axis=1, errors='ignore')
dessert_dummies_test['prediction'] = dessert_dummies_clf.predict(P)

cm = confusion_matrix(y_true=dessert_dummies_test['dessert'],
                      y_pred=dessert_dummies_test['prediction'])
print(pd.DataFrame(cm,
             index=dessert_dummies_clf.classes_,
             columns=dessert_dummies_clf.classes_))

       False  True 
False    159    131
True      75    335
       False  True 
False     71     66
True      36    127


## Comparing between get dummies and label encoding

In [44]:
# Test-set classification reports for both encodings, one after the other.
# The headers are wrapped in ANSI escape codes to colour them in the output.
print('\x1b[0;36;40m' + 'Get dummies model values :' + '\x1b[0m')
print(classification_report(dessert_dummies_test['dessert'],
                            dessert_dummies_test['prediction']))

print('\x1b[0;36;40m' + 'Label encoding model values :' + '\x1b[0m')
print(classification_report(dessert_label_enc_test['dessert'],
                            dessert_label_enc_test['prediction']))

# NOTE: the conclusion below is a fixed string; it is not derived from the
# two reports printed above.
print('\x1b[7;49;93m' + 'It seems like for this specific dataset Label encoding performed better than get dummies categorical columns handling technique' + '\x1b[0m')


[0;36;40mGet dummies model values :[0m
             precision    recall  f1-score   support

      False       0.66      0.52      0.58       137
       True       0.66      0.78      0.71       163

avg / total       0.66      0.66      0.65       300

[0;36;40mLabel encoding model values :[0m
             precision    recall  f1-score   support

      False       0.70      0.54      0.61       137
       True       0.68      0.80      0.73       163

avg / total       0.69      0.68      0.68       300

[7;49;93mIt seems like for this specific dataset Label encoding performed better than get dummies categorical columns handling technique[0m


## Checking the KNN model with label encoding technique

In [110]:
from sklearn.neighbors import KNeighborsClassifier

# Fresh 70/30 split of the label-encoded data with the same seed as before.
# Re-splitting also yields frames without the 'prediction' columns that the
# logistic-regression cells added to the earlier train/test frames.
dessert_label_enc_train, dessert_label_enc_test = split(
    dessert_label_enc, train_size=0.7, random_state=12345)

# Features selected by name (everything except `id` and the target) so the
# selection does not depend on column positions.
X = dessert_label_enc_train[['day.of.week', 'num.of.guests', 'hour', 'table']]
# Target as a 1-D Series: a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning, which the global warnings filter was hiding.
y = dessert_label_enc_train['dessert']

# Number of neighbours used by the classifier.
k = 10
knn_clf = KNeighborsClassifier(n_neighbors=k)
knn_clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [111]:
# The KNN model was fitted on the label-encoded training frame, so its
# training predictions belong there. Writing them to dessert_dummies_train
# (as before) silently overwrote the dummies model's predictions.
dessert_label_enc_train['prediction'] = knn_clf.predict(X)


In [112]:
# Test-set features for the KNN model.
# 'prediction' is dropped together with the target and id so that re-running
# this cell does not feed the previous run's predictions to the model as an
# extra feature; errors='ignore' covers the first run, when the column does
# not exist yet.
M = dessert_label_enc_test.drop(['dessert', 'id', 'prediction'],
                                axis=1, errors='ignore')
dessert_label_enc_test['prediction'] = knn_clf.predict(M)

# Confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true=dessert_label_enc_test['dessert'],
                      y_pred=dessert_label_enc_test['prediction'])
print(pd.DataFrame(cm,
             index=knn_clf.classes_,
             columns=knn_clf.classes_))

print('\x1b[0;36;40m' + 'KNN Label encoding model values :' + '\x1b[0m' )
print (classification_report(y_true=dessert_label_enc_test['dessert'],
                            y_pred=dessert_label_enc_test['prediction']))

       False  True 
False     82     55
True      37    126
[0;36;40mKNN Label encoding model values :[0m
             precision    recall  f1-score   support

      False       0.69      0.60      0.64       137
       True       0.70      0.77      0.73       163

avg / total       0.69      0.69      0.69       300

