Feature engineering is the preprocessing step of machine learning (ML) that transforms raw data into features that can be used to build a predictive model.

In [1]:
import pandas as pd
# Load the stroke dataset from the working directory and preview its first five rows.
DATA_PATH = "healthcare-dataset-stroke-data.csv"
data = pd.read_csv(DATA_PATH)
data.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# List each column's dtype; the `object` columns are the categorical ones encoded below.
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

#### One-hot encoding using the pandas library
* `pd.get_dummies` returns a new DataFrame with the encoded values
* Missing values do not raise an error: by default a row with a missing value gets 0 in every indicator column; pass `dummy_na=True` to create a separate column for missing values
* It automatically generates column names based on the unique values in the categorical column

In [5]:
# One-hot encode `gender` with pandas: one integer 0/1 indicator column per unique value.
dummies = pd.get_dummies(data=data['gender'], dtype='int')
dummies

Unnamed: 0,Female,Male,Other
0,0,1,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
5105,1,0,0
5106,1,0,0
5107,1,0,0
5108,0,1,0


In [6]:
# Place the indicator columns next to the original columns (both frames share the same row index).
new_data = pd.concat(objs=[data, dummies], axis='columns')
new_data.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male,Other
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,0,1,0
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,1,0,0
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,0,1,0
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,1,0,0
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,1,0,0


In [7]:
# Drop the raw `gender` column and the `Other` indicator; a row that is neither
# Female nor Male is still identifiable because both remaining indicators are 0.
# The frame is rebuilt from `data` and `dummies` instead of from `new_data` itself,
# so this cell can be re-run safely: dropping from an already-trimmed `new_data`
# would raise KeyError because the two columns no longer exist.
new_data = pd.concat([data, dummies], axis=1).drop(columns=['Other', 'gender'])
new_data

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male
0,9046,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,0,1
1,51676,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,1,0
2,31112,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,0,1
3,60182,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,1,0
4,1665,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0,1,0
5106,44873,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0,1,0
5107,19723,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0,1,0
5108,37544,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0,0,1


In [8]:
## Using scikit-learn
# fit_transform returns a sparse matrix, which is more memory efficient than a dense array
# NOTE(review): the original note says missing values must be handled before one-hot encoding;
#   newer scikit-learn releases may instead encode NaN as its own category - confirm for the installed version
# the encoded output carries no column names; they are read separately via encoder.get_feature_names_out()

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

In [9]:
# Fit the encoder on the single `ever_married` column (selected as a 2-D frame),
# then convert the sparse result to a dense array so it can be displayed.
ever_married_sparse = encoder.fit_transform(data[['ever_married']])
encoded_data = ever_married_sparse.toarray()
encoded_data

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [10]:
# Label the dense array with the feature names the fitted encoder generated.
feature_names = encoder.get_feature_names_out()
pd.DataFrame(data=encoded_data, columns=feature_names)

Unnamed: 0,ever_married_No,ever_married_Yes
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
5105,0.0,1.0
5106,0.0,1.0
5107,0.0,1.0
5108,0.0,1.0


### Label encoding
Label encoding converts categorical data into a numerical format: each unique category is assigned an integer label (0, 1, 2, …) according to the sorted (alphabetical) order of the categories.

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Take an independent (deep) copy so the encoding below leaves `data` untouched.
data_copy = data.copy(deep=True)

In [14]:
# Label-encode `work_type`: each category is replaced by an integer code.
# The encoder is fitted on the untouched `data` column rather than on `data_copy`
# itself; fitting on the column being overwritten means a re-run of this cell
# would refit on the integer codes and `le_work.classes_` would lose the
# original category names.
le_work = LabelEncoder()
data_copy['work_type'] = le_work.fit_transform(data['work_type'])

In [15]:
# Display the frame; `work_type` now holds the integer codes assigned by `le_work`.
data_copy

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,2,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,3,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,2,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,2,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,3,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,2,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,3,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,3,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,2,Rural,166.29,25.6,formerly smoked,0


In [53]:
# Fresh deep copy of the raw frame for the cleaning + multi-column encoding example.
data_copy3 = data.copy(deep=True)

In [54]:
import pandas as pd

In [55]:
from sklearn.preprocessing import LabelEncoder

In [56]:

# Impute missing values on a separate copy so `data_copy3` keeps the raw NaNs.
data_cleaned = data_copy3.copy()

# Numeric columns: replace each NaN with the mean of its own column.
numeric_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
numeric_means = data_cleaned[numeric_columns].mean()
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].fillna(value=numeric_means)

# Categorical (object) columns: replace each NaN with the placeholder label 'Unknown'.
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].fillna(value='Unknown')




In [57]:
# Display the imputed frame; rows that had a missing `bmi` now show the column mean.
data_cleaned

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


In [58]:
# Build one LabelEncoder per categorical column, keyed by column name, so each
# column can be decoded later with encoder[column].inverse_transform(...).
# NOTE: this rebinds `encoder`, which earlier held the OneHotEncoder instance;
# re-running the one-hot cells afterwards requires re-creating that encoder first.
#
# Columns and values are read from the untouched `data_copy3` (with the same
# 'Unknown' fill applied to `data_cleaned`) instead of from `data_cleaned` itself.
# Once this cell has run, `data_cleaned` no longer exposes the original object
# columns, so a re-run driven by `data_cleaned` would rebuild `encoder` without
# 'gender' / 'work_type' and break the inverse_transform cells.
encoder = {}
for column in data_copy3.select_dtypes(include=['object']).columns:
    encoder[column] = LabelEncoder()
    data_cleaned[column] = encoder[column].fit_transform(
        data_copy3[column].fillna('Unknown')
    )

In [59]:
# Display the frame after label encoding; the former object columns now hold integer codes.
data_cleaned

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [63]:


# Decode the integer `gender` codes back to their labels with the encoder fitted for that column.
gender_encoder = encoder['gender']
data_cleaned['gender_original'] = gender_encoder.inverse_transform(data_cleaned['gender'])



In [64]:
# Show the integer code next to the decoded label to check the round trip.
data_cleaned.loc[:, ['gender', 'gender_original']]

Unnamed: 0,gender,gender_original
0,1,Male
1,0,Female
2,1,Male
3,0,Female
4,0,Female
...,...,...
5105,0,Female
5106,0,Female
5107,0,Female
5108,1,Male


In [65]:
# Decode the integer `work_type` codes back to their labels with the encoder fitted for that column.
work_type_encoder = encoder['work_type']
data_cleaned['worktype_original'] = work_type_encoder.inverse_transform(data_cleaned['work_type'])

In [66]:
# Show the integer code next to the decoded label to check the round trip.
data_cleaned.loc[:, ['work_type', 'worktype_original']]

Unnamed: 0,work_type,worktype_original
0,2,Private
1,3,Self-employed
2,2,Private
3,2,Private
4,3,Self-employed
...,...,...
5105,2,Private
5106,3,Self-employed
5107,3,Self-employed
5108,2,Private
