## Impute Missing values
Prepare By: Ejaz-ur-Rehman\
Date: 23-07-2025\
Email ID: ijazfinance@gmail.com

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [49]:
# load the dataset
data = sns.load_dataset('titanic')
# display the first few rows of the dataset
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [50]:
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)    

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [51]:
# drop the 'deck' column as it has too many missing values
data.drop(columns=['deck'], inplace=True)


In [52]:
# Impute missing values in 'age' column with the mean
data['age'].fillna(data['age'].mean(), inplace=True)    
# Impute missing values in 'embarked' column with the mode
data.fillna({'age': data['age'].mean(), 'embarked': data['embarked'].mode()[0]}, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].mean(), inplace=True)


In [53]:
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)  


embark_town    2
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
alive          0
alone          0
dtype: int64

In [54]:
data['embark_town'] = data['embark_town'].fillna(data['embark_town'].mode()[0])
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

## What is KNN? Imputing through KNN:
- K-Nearest Neighbors (KNN) is a non-parametric, instance-based machine learning algorithm used for:
  - Classification (e.g., predicting categories like spam/ham)
  - Regression (predicting continuous values)
  - Imputation (filling missing values based on similar data points)
- Basic Idea of KNN:
  - "KNN predicts the value of a data point by looking at the 'K' most similar points (its neighbors) in the dataset."
- Imputing through KNN: When a value is missing, KNN finds the K most similar data points and uses their values to estimate the missing value.
  

In [55]:
# impute missing values using KNN
from sklearn.impute import KNNImputer
# call the KNN Class with the number of neighbors
imputer = KNNImputer(n_neighbors=4)

data[['age']] = imputer.fit_transform(data[['age']])
# check the missing values in teh dataset
data.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# load the dataset
df = sns.load_dataset('titanic')
# display the first few rows of the dataset
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [57]:
# chek the missing values in the dataset
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

## Usage of LabelEncoder during Impute

In [58]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object using labelencoder = LabelEncoder() in for loop for categorical columns
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'deck', 'class' , 'embark_town', 'alive']
# dictionary to store the label encoders
label_encoders = {}

# Loop through the columns to encode
for column in columns_to_encode:
    # Create a new LabelEncoder instance for the column
    le = LabelEncoder() 
    # Fit and transform the column, converting it to string type to avoid errors with NaN
    df[column] = le.fit_transform(df[column]) # Fit and transform the column
    # Store the label encoder in the dictionary
    label_encoders[column] = le
# Display the first few rows of the encoded dataset
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,7,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,2,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,7,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,7,2,0,True


In [59]:
# impute the missing values using IterativeImputer
# call the IterativeImputer class with the max_iter = 10 and random_state = 42
imputer = IterativeImputer(max_iter=10, random_state=42)
# columns to impute
columns_to_impute = ['age', 'embarked', 'deck', 'embark_town']
# loop to impute ecach column
for column in columns_to_impute:
    df[column] = imputer.fit_transform(df[[column]])  # Fit and transform the column
# check the missing values in the dataset after imputation
df.isnull().sum().sort_values(ascending=False)
# Display the first few rows of the dataset after imputation
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2.0,2,1,True,7.0,2.0,0,False
1,1,1,0,38.0,1,0,71.2833,0.0,0,2,False,2.0,0.0,1,False
2,1,3,0,26.0,0,0,7.925,2.0,2,2,False,7.0,2.0,1,True
3,1,1,0,35.0,1,0,53.1,2.0,0,2,False,2.0,2.0,1,False
4,0,3,1,35.0,0,0,8.05,2.0,2,1,True,7.0,2.0,0,True


## Usage of Inverse Transform during Impute

In [60]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Sample dataframe loading (replace with your own data)
# df = pd.read_csv('your_dataset.csv')  # Uncomment if needed

# Step 1: Define columns to encode
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive']

# Step 2: Create a dictionary to store LabelEncoders
label_encoders = {}

# Step 3: Encode the columns
for column in columns_to_encode:
    le = LabelEncoder()
    # Convert to string first to avoid issues with NaNs or non-string values
    df[column] = df[column].astype(str)
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Save the encoder for later inverse transformation

# Step 4: Display encoded data (optional)
print("Encoded Data:\n", df.head())

# Step 5: Inverse transform the encoded columns to original values
for column in columns_to_encode:
    le = label_encoders[column]  # Retrieve the label encoder for the column
    # Convert values to int (safe if no NaNs)
    df[column] = le.inverse_transform(df[column].astype(int))

# Step 6: Display the data after inverse transformation
print("\nDecoded Data:\n", df.head())


Encoded Data:
    survived  pclass  sex   age  sibsp  parch     fare  embarked  class  who  \
0         0       3    1  22.0      1      0   7.2500         2      2    1   
1         1       1    0  38.0      1      0  71.2833         0      0    2   
2         1       3    0  26.0      0      0   7.9250         2      2    2   
3         1       1    0  35.0      1      0  53.1000         2      0    2   
4         0       3    1  35.0      0      0   8.0500         2      2    1   

   adult_male  deck  embark_town  alive  alone  
0        True     7            2      0  False  
1       False     2            0      1  False  
2       False     7            2      1   True  
3       False     2            2      1  False  
4        True     7            2      0   True  

Decoded Data:
    survived  pclass sex   age  sibsp  parch     fare embarked class who  \
0         0       3   1  22.0      1      0   7.2500      2.0     2   1   
1         1       1   0  38.0      1      0  71.28

## Regression Imputation
- Regression imputation is a statistical method used to fill in missing data based on the relationship between the variable with missing values and other observed variables. Instead of replacing missing values with a mean or median, regression imputation uses a regression model to predict the missing values.

In [61]:
# load the dataset
df = sns.load_dataset('titanic')

# check the missing values in the dataset
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [62]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = sns.load_dataset('titanic')

# Drop 'deck' due to too many missing values
df.drop(columns=['deck'], inplace=True)

# Categorical columns to encode (deck removed, duplicate 'class' removed)
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']

# Dictionary to store the label encoders
label_encoders = {}

# Encode categorical columns
for column in columns_to_encode:
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le
    else:
        print(f"Column '{column}' not found. Skipping.")

# Optional: Inverse transform (for verification)
for column in columns_to_encode:
    le = label_encoders[column]
    df[column] = le.inverse_transform(df[column].astype(int))

# split the dataset into two parts: one with missing values and one without
df_with_missing = df[df['age'].isnull()]
# drop the rows with missing values
df_without_missing = df.dropna()

print("The shape of the original dataset:", df.shape)
print("The shape of the dataset with missing values:", df_with_missing.shape)
print("The shape of the dataset without missing values:", df_without_missing.shape)

# display the dataset with missing values
print("\nDataset with missing values:\n", df_with_missing.head())

# display the dataset without missing values
print("\nDataset without missing values:\n", df_without_missing.head())

# check the name of the columns in the dataset
print("\nColumns in the dataset:", df.columns.tolist())


The shape of the original dataset: (891, 14)
The shape of the dataset with missing values: (177, 14)
The shape of the dataset without missing values: (714, 14)

Dataset with missing values:
     survived  pclass     sex  age  sibsp  parch     fare embarked   class  \
5          0       3    male  NaN      0      0   8.4583        Q   Third   
17         1       2    male  NaN      0      0  13.0000        S  Second   
19         1       3  female  NaN      0      0   7.2250        C   Third   
26         0       3    male  NaN      0      0   7.2250        C   Third   
28         1       3  female  NaN      0      0   7.8792        Q   Third   

      who  adult_male  embark_town alive  alone  
5     man        True   Queenstown    no   True  
17    man        True  Southampton   yes   True  
19  woman       False    Cherbourg   yes   True  
26    man        True    Cherbourg    no   True  
28  woman       False   Queenstown   yes   True  

Dataset without missing values:
    survived 

In [63]:
# 1. Load and clean the data
df = sns.load_dataset('titanic')
df.drop(columns=['deck'], inplace=True)

# 2. Encode categorical features
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']
label_encoders = {}

for column in columns_to_encode:
    if column in df.columns:
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le

# 3. Drop rows with missing target or features
df = df.dropna(subset=['age'])  # assuming you're predicting age

# 4. Define X and y
X = df.drop(columns=['age'])  # features
y = df['age']                 # target

# Optional: Fill any remaining missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# 5. Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 7. Evaluate
y_pred = rf_model.predict(X_test)
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))



R2: 0.33769388288226154
MAE: 8.666661815622195
MSE: 122.79433625923292
MAPE: 0.40839466096086574


In [64]:
# check the missing values in the dataset in each column
import seaborn as sns   
df_with_missing.isnull().sum().sort_values(ascending=False)


age            177
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
embark_town      0
alive            0
alone            0
dtype: int64

In [70]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = sns.load_dataset('titanic')

# Drop 'deck' due to many missing values
df.drop(columns=['deck'], inplace=True)

# Drop rows where target variable 'age' is missing for regression later
df = df[df['age'].notnull() | df[['survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive', 'alone']].notnull().all(axis=1)]

# Encode categorical columns
columns_to_encode = ['sex', 'embarked', 'class', 'who', 'embark_town', 'alive']
label_encoders = {}

for col in columns_to_encode:
    le = LabelEncoder()
    df[col] = df[col].astype(str)  # convert to string in case of NaN
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Convert boolean to int
df['adult_male'] = df['adult_male'].astype(int)
df['alone'] = df['alone'].astype(int)

# Separate data for regression imputation
df_missing_age = df[df['age'].isnull()]
df_not_missing_age = df[df['age'].notnull()]

# Features to predict 'age'
features = ['pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who',
            'adult_male', 'embark_town', 'alive', 'alone']

# Split features and target
X_train = df_not_missing_age[features]
y_train = df_not_missing_age['age']

# Train regression model
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Predict missing ages
X_missing = df_missing_age[features]
predicted_ages = regressor.predict(X_missing)

# Fill missing ages
df.loc[df['age'].isnull(), 'age'] = predicted_ages

# Confirm imputation
print("Missing 'age' values after imputation:\n", df['age'].isnull().sum())
print("Final dataset shape:", df.shape)
print(df.head())


Missing 'age' values after imputation:
 0
Final dataset shape: (891, 14)
   survived  pclass  sex   age  sibsp  parch     fare  embarked  class  who  \
0         0       3    1  22.0      1      0   7.2500         2      2    1   
1         1       1    0  38.0      1      0  71.2833         0      0    2   
2         1       3    0  26.0      0      0   7.9250         2      2    2   
3         1       1    0  35.0      1      0  53.1000         2      0    2   
4         0       3    1  35.0      0      0   8.0500         2      2    1   

   adult_male  embark_town  alive  alone  
0           1            2      0      0  
1           0            0      1      0  
2           0            2      1      1  
3           0            2      1      0  
4           1            2      0      1  


In [80]:
predicted_ages

array([31.71651389, 35.40049714, 19.26069444, 34.46481349, 21.75216667,
       27.02486843, 34.88666667, 19.74383333, 21.23170238, 33.05079164,
       31.41960196, 34.93350794, 19.74383333, 24.314     , 33.575     ,
       39.065     , 26.52916667, 27.02486843, 31.41960196, 19.98833333,
       31.41960196, 31.41960196, 27.02486843, 28.02492727, 29.9945119 ,
       31.41960196, 46.95380556, 28.241     , 31.0735    , 30.05747556,
       24.17613109, 19.47678571, 24.35266667, 58.26725397, 26.38416667,
       19.65678571, 30.15      , 45.5       , 29.27666667, 46.95380556,
       19.74383333, 19.47678571, 38.76970437, 27.02486843, 25.64      ,
       30.2937381 , 25.6395    , 29.27666667, 30.05747556, 28.34928571,
       46.95380556, 27.46945833, 52.4625    , 19.74383333, 34.47452025,
       58.8085873 , 39.065     , 39.56333333, 19.74383333, 26.87383333,
       32.27541667, 31.41960196, 31.15619048, 19.47678571, 25.71466667,
       35.60333333, 27.02486843, 26.1075    , 53.83      , 34.46