# Missing values as features

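The fact that a value is missing can itself be informative. During preprocessing we can therefore add a binary indicator column that flags whether a value was missing and use that indicator as an extra feature.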

## Importing and loading data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# NOTE(review): with no category/module arguments this filter suppresses every
# warning, deprecation notices from pandas/numpy included — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): `plt` is not used in the cells visible in this section — confirm
# it is needed further down before keeping the import.
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Print the library versions so the environment is recorded alongside the results.
print(pd.__version__)
print(np.__version__)

2.1.3
1.26.1


In [3]:
# Read the stroke-prediction CSV (path is relative to the notebook's working directory)
csv_path = 'datasets/stroke prediction.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [4]:
# Summarise the frame: the (rows, columns) tuple first, then the column labels
frame_shape = data.shape
column_labels = data.columns
print('Shape:', frame_shape)
print('Columns:', column_labels)

Shape: (43400, 12)
Columns: Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [5]:
# Count the missing entries in every column (isna is the alias of isnull)
missing_per_column = data.isna().sum()
print(missing_per_column)

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64


In [7]:
# Build a missing-value indicator for smoking_status: 1 where the entry is
# missing, 0 where a value is present
smoking_missing = data['smoking_status'].isna()
data['smoking_status_NA'] = smoking_missing.astype(int)

# Show the indicator next to the original column to verify the encoding
data[['smoking_status_NA', 'smoking_status']].head()

Unnamed: 0,smoking_status_NA,smoking_status
0,1,
1,0,never smoked
2,1,
3,0,formerly smoked
4,1,
