## Impute Missing values
Prepare By: Ejaz-ur-Rehman\
Date: 23-07-2025\
Email ID: ijazfinance@gmail.com

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# load the dataset
data = sns.load_dataset('titanic')
# display the first few rows of the dataset
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)    

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [5]:
# drop the 'deck' column as it has too many missing values
data.drop(columns=['deck'], inplace=True)


In [6]:
# Impute missing values in 'age' column with the mean
data['age'].fillna(data['age'].mean(), inplace=True)    
# Impute missing values in 'embarked' column with the mode
data.fillna({'age': data['age'].mean(), 'embarked': data['embarked'].mode()[0]}, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].mean(), inplace=True)


In [7]:
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)  


embark_town    2
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
alive          0
alone          0
dtype: int64

In [8]:
data['embark_town'] = data['embark_town'].fillna(data['embark_town'].mode()[0])
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

## What is KNN? Imputing through KNN:
- K-Nearest Neighbors (KNN) is a non-parametric, instance-based machine learning algorithm used for:
  - Classification (e.g., predicting categories like spam/ham)
  - Regression (predicting continuous values)
  - Imputation (filling missing values based on similar data points)
- Basic Idea of KNN:
  - "KNN predicts the value of a data point by looking at the 'K' most similar points (its neighbors) in the dataset."
- Imputing through KNN: When a value is missing, KNN finds the K most similar data points and uses their values to estimate the missing value.
  

In [9]:
# impute missing values using KNN
from sklearn.impute import KNNImputer
# call the KNN Class with the number of neighbors
imputer = KNNImputer(n_neighbors=4)

data[['age']] = imputer.fit_transform(data[['age']])
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# load the dataset
df = sns.load_dataset('titanic')
# display the first few rows of the dataset
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [11]:
# chek the missing values in the dataset
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object using labelencoder = LabelEncoder() in for loop for categorical columns
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'deck', 'class' , 'embark_town', 'alive']
# dictionary to store the label encoders
label_encoders = {}

# Loop through the columns to encode
for column in columns_to_encode:
    le = LabelEncoder() # Create a new LabelEncoder instance for the column
    # Fit and transform the column, converting it to string type to avoid errors with NaN
    df[column] = le.fit_transform(df[column]) # Fit and transform the column
    # Store the label encoder in the dictionary
    label_encoders[column] = le
# Display the first few rows of the encoded dataset
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,7,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,2,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,7,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,7,2,0,True
