# One Hot Encoding

## OHE For HealthCare Pipeline

## Importing the Required Packages

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline

## Reading the Dataset

In [22]:
# Load heart.csv from the working directory into a DataFrame.
df = pd.read_csv('heart.csv')
# Preview roughly 2% of the rows; the fixed seed keeps the displayed sample
# stable across re-runs instead of changing on every execution.
df.sample(frac=0.02, random_state=1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3,0
66,51,1,2,100,222,0,1,143,1,1.2,1,0,2,1
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
205,52,1,0,128,255,0,1,161,1,0.0,2,1,3,0
252,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
99,53,1,2,130,246,1,0,173,0,0.0,2,3,2,1


In [23]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


## Splitting the data into Xtrain, Xtest, ytrain, ytest.

In [24]:
# Separate the predictors from the `target` label column.
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable. train_test_split is already imported in the top import
# cell, so it is not re-imported here.
# Every piece is copied so it owns its data independently of X and y.
Xtrain, Xtest, ytrain, ytest = (
    part.copy()
    for part in train_test_split(X, y, test_size=0.2, random_state=1)
)

In [25]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training predictors.
Xtrain.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,54.475207,0.694215,0.979339,131.380165,246.371901,0.14876,0.53719,149.524793,0.318182,1.02314,1.409091,0.727273,2.289256
std,9.155719,0.461694,1.024385,17.40952,51.509276,0.35659,0.52397,23.560318,0.466736,1.098264,0.612796,1.006205,0.610258
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,0.0,0.0,120.0,212.0,0.0,0.0,134.5,0.0,0.0,1.0,0.0,2.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,154.0,0.0,0.8,1.0,0.0,2.0
75%,61.0,1.0,2.0,140.0,274.0,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0
max,77.0,1.0,3.0,192.0,564.0,1.0,2.0,202.0,1.0,5.6,2.0,4.0,3.0


In [26]:
# True when at least one cell of the training predictors is missing.
Xtrain.isna().any().any()

False

In [27]:
# Columns that are standard-scaled.
numeric_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
# Columns that are one-hot encoded.
categorical_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

In [28]:
# Standardise the numeric columns, fitting the scaler on the training rows.
# StandardScaler is already imported in the top import cell.
ss = StandardScaler(with_mean=True, with_std=True)

Xnumss = pd.DataFrame(
    ss.fit_transform(Xtrain[numeric_features]),
    columns=['ss_' + x for x in numeric_features],
    index=Xtrain.index,  # reuse the row labels so the concat below aligns row-for-row
)
# Append the scaled columns, then remove the raw numeric ones.
Xtrain = pd.concat([Xtrain, Xnumss], axis=1)
Xtrain = Xtrain.drop(numeric_features, axis=1)
# Seeded preview so the displayed rows do not change between runs.
Xtrain.sample(frac=0.02, random_state=1)

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,ca,thal,ss_age,ss_trestbps,ss_chol,ss_thalach,ss_oldpeak
36,0,2,1,1,0,2,0,2,-0.05201,0.208354,1.121109,0.870856,-0.933529
227,1,0,0,1,1,1,0,3,-2.131517,-0.65503,-0.941037,-0.830433,0.526335
101,1,3,0,0,0,0,0,3,0.495228,2.683386,0.459666,-0.192449,2.898614
117,1,3,0,0,0,1,0,3,0.166885,-0.65503,-1.038308,0.530598,0.80006
127,0,2,0,1,0,2,1,2,1.37081,1.186855,0.595846,0.95592,-0.933529


- OneHotEncode all of the categorical features in Xtrain, include transformed categorical features in Xtrain, and drop original categorical features in Xtrain

In [29]:
# One-hot encode the categorical columns, fitting the encoder on the training rows.
# OneHotEncoder is already imported in the top import cell.
ohe = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
Xcat = pd.DataFrame(
    ohe.fit_transform(Xtrain[categorical_features]),
    columns=ohe.get_feature_names(),
    index=Xtrain.index,  # reuse the row labels so the concat below aligns row-for-row
)
# Append the indicator columns, then remove the raw categorical ones.
# The drop is a reassignment (not inplace) to match the scaling cell.
Xtrain = pd.concat([Xtrain, Xcat], axis=1)
Xtrain = Xtrain.drop(categorical_features, axis=1)
# Seeded preview so the displayed rows do not change between runs.
Xtrain.sample(frac=0.02, random_state=1)

Unnamed: 0,ss_age,ss_trestbps,ss_chol,ss_thalach,ss_oldpeak,x0_0,x0_1,x1_0,x1_1,x1_2,...,x5_2,x6_0,x6_1,x6_2,x6_3,x6_4,x7_0,x7_1,x7_2,x7_3
234,1.699153,-0.079441,1.471285,-1.723609,1.256267,0,1,1,0,0,...,0,0,0,0,1,0,0,0,1,0
33,-0.05201,-0.367235,0.518029,0.105276,-0.477321,0,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
297,0.495228,1.877561,-1.36903,-2.531721,-0.021114,0,1,1,0,0,...,0,0,0,1,0,0,0,1,0,0
139,1.042467,-0.194559,0.323487,-1.893738,-0.751046,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1
134,-1.474831,-0.309676,1.160018,0.57313,-0.933529,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0


## Fitting a Logistic Regression Model to training data

In [30]:
# Fit a logistic regression with default hyper-parameters on the transformed
# training data. LogisticRegression is already imported in the top import cell.
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

- Standard Scale all of the numeric features in Xtest, include transformed numeric features in Xtest, and drop original numeric features in Xtest

In [31]:
# Scale the test rows with the scaler fitted on the training data
# (transform only, no refit).
Xnumss = pd.DataFrame(
    ss.transform(Xtest[numeric_features]),
    columns=['ss_' + x for x in numeric_features],
    index=Xtest.index,  # reuse the row labels so the concat below aligns row-for-row
)
# Append the scaled columns, then remove the raw numeric ones.
Xtest = pd.concat([Xtest, Xnumss], axis=1)
Xtest = Xtest.drop(numeric_features, axis=1)
# Seeded preview so the displayed rows do not change between runs.
Xtest.sample(frac=0.02, random_state=1)

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,ca,thal,ss_age,ss_trestbps,ss_chol,ss_thalach,ss_oldpeak
179,1,0,0,0,1,1,1,1,0.276333,1.071737,0.576391,-1.596013,-0.38608


- OneHotEncode all of the categorical features in Xtest, include transformed categorical features in Xtest, and drop original categorical features in Xtest

In [32]:
# Encode the test rows with the encoder fitted on the training data
# (transform only, no refit).
Xcat = pd.DataFrame(
    ohe.transform(Xtest[categorical_features]),
    columns=ohe.get_feature_names(),
    index=Xtest.index,
)
# Swap the raw categorical columns for their one-hot counterparts.
Xtest = pd.concat([Xtest, Xcat], axis=1).drop(columns=categorical_features)
Xtest.head()

Unnamed: 0,ss_age,ss_trestbps,ss_chol,ss_thalach,ss_oldpeak,x0_0,x0_1,x1_0,x1_1,x1_2,...,x5_2,x6_0,x6_1,x6_2,x6_3,x6_4,x7_0,x7_1,x7_2,x7_3
204,0.823571,1.647326,-1.602481,-0.192449,4.723443,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
159,0.166885,-0.079441,-0.49359,0.57313,-0.933529,0,1,0,1,0,...,1,1,0,0,0,0,0,0,0,1
219,-0.708697,-0.079441,0.187307,0.020212,-0.933529,0,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
174,0.604676,-0.079441,-0.785404,-0.745368,1.256267,0,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
184,-0.489801,1.071737,-0.065598,-0.915497,1.43875,0,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1


## Predict and Evaluate Logistic Regression Model on Xtest

In [33]:
# Predict labels for the transformed test rows with the fitted model.
ypred = lr.predict(Xtest)

# Report accuracy, the confusion matrix and the per-class precision/recall/F1.
# `metrics` is already imported in the top import cell.
print(metrics.accuracy_score(ytest, ypred))
print(metrics.confusion_matrix(ytest, ypred))
print(metrics.classification_report(ytest, ypred))

0.7868852459016393
[[21  9]
 [ 4 27]]
              precision    recall  f1-score   support

           0       0.84      0.70      0.76        30
           1       0.75      0.87      0.81        31

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.78        61
weighted avg       0.79      0.79      0.79        61



- Read in heart.csv in to a dataframe called df
- Split the data into Xtrain, Xtest, ytrain, ytest - with 20% in test, and random_state=1
- Create two lists called numeric_features (with age, trestbps, chol, thalach, oldpeak), and categorical_features (with sex, cp, fbs, restecg, exang, slope, ca, thal)

In [34]:
# Start again from the raw file so the pipeline sees untransformed columns.
df = pd.read_csv('heart.csv')
X = df.drop(columns=['target'])
y = df['target']

# Same 80/20 split and seed as before; each piece is copied so it owns its data.
Xtrain, Xtest, ytrain, ytest = (
    part.copy()
    for part in train_test_split(X, y, test_size=0.2, random_state=1)
)

# Column groups routed to the scaler and the encoder respectively.
numeric_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
categorical_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

- Create a pipeline called "numeric_transformer" with a StandardScaler step called "ss" (use the same parameters that you used in Part A above)

In [35]:
# Numeric branch: a single standardisation step named 'ss', configured like
# the standalone scaler used earlier in the notebook.
# Pipeline and StandardScaler are already imported in the top import cell.
numeric_transformer = Pipeline(
    steps=[('ss', StandardScaler(with_mean=True, with_std=True))]
)

- Create a pipeline called "categorical_transformer" with a OneHotEncoder step called "ohe" (use the same parameters that you used in Part A above)

In [36]:
# Categorical branch: a single one-hot encoding step named 'ohe', configured
# like the standalone encoder used earlier in the notebook.
categorical_transformer = Pipeline(
    steps=[
        ('ohe', OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')),
    ]
)

- Create a column transformer called "preprocessor" with two transformers
    - (a) the first transformer called "num" which uses the numeric_transformer you defined above on the numeric_features
    - (b) the second transformer called "cat" which uses the categorical_transformer you defined above on the categorical_features

In [37]:
# Route each column group to its own transformer.
# ColumnTransformer is already imported in the top import cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',  # columns in neither list are discarded
)

- Create a pipeline called "clf" with two steps: 
    - (a) the first step called "pp" which invokes the preprocessor you defined above
    - (b) the second step called "lr" which invokes a logistic regression model (use the same parameters that you used in Part A above)

In [38]:
# End-to-end model: preprocessing ('pp') followed by a default logistic
# regression ('lr'). LogisticRegression is already imported in the top import cell.
clf = Pipeline(
    steps=[
        ('pp', preprocessor),
        ('lr', LogisticRegression()),
    ]
)

## Fitting the clf pipeline to the training data

In [39]:
# Fit the preprocessing steps and the classifier together on the raw training split.
clf.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('ss',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['age', 'trestbps', 'chol',
                                                   'thalach', 'oldpeak']),
                                                 (

## Predicting and Evaluating the clf pipeline on Xtest

In [40]:
# The pipeline applies the fitted preprocessing to the raw test rows before predicting.
ypred = clf.predict(Xtest)

# Report accuracy, the confusion matrix and the per-class precision/recall/F1.
# `metrics` is already imported in the top import cell.
print(metrics.accuracy_score(ytest, ypred))
print(metrics.confusion_matrix(ytest, ypred))
print(metrics.classification_report(ytest, ypred))

0.7868852459016393
[[21  9]
 [ 4 27]]
              precision    recall  f1-score   support

           0       0.84      0.70      0.76        30
           1       0.75      0.87      0.81        31

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.78        61
weighted avg       0.79      0.79      0.79        61

