# Week-2

## Task-1

In this example, we demonstrate how to perform KNN imputation using both a custom implementation and the built-in `KNNImputer` from `scikit-learn`.

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import pairwise_distances

### Creating the DataFrame and identifying the columns with missing values

In [2]:
# Column labels for the toy body-measurement table built below.
columns = ['weight', 'height', 'BMI', 'overweight']

# Six observations; three cells (one each in weight, BMI and overweight)
# are NaN so that there is something to impute.
X = [
    [70.1, 154.9, 29.215, 2],
    [64.9, 157.8, np.nan, 2],
    [np.nan, 164.7, 24.73633318, 1],
    [67.5, 169.9, 23.38390377, 1],
    [68.4, 154.9, 28.5071149, np.nan],
    [77.7, 173.8, 25.72299152, 2],
]

df = pd.DataFrame(data=X, columns=columns)

# Labels of every column holding at least one NaN, in column order.
missing_columns = df.columns[df.isna().any(axis=0)]

### Custom KNN Imputer Function

In [3]:
def custom_knn_imputer(df, k=2):
    """Fill missing values with the mean of the ``k`` nearest donor rows.

    For every column, each row whose entry is missing receives the plain
    (unweighted) mean of that column taken over the ``k`` closest rows in
    which the column is observed ("donors"). Closeness is the NaN-aware
    Euclidean distance computed over all the other columns. This is the same
    neighbour-selection rule scikit-learn's ``KNNImputer`` uses with
    ``weights='uniform'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaNs. It is not modified. Any index
        is accepted; rows are addressed by position internally.
    k : int, default 2
        Number of donor rows to average. Fewer are used when fewer donors
        are reachable.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every missing entry that has at least one
        reachable donor is filled. Entries without any donor stay NaN.
    """
    df_filled = df.copy()

    for col_pos, col in enumerate(df.columns):
        observed = df[col].notna().to_numpy()
        missing_pos = np.flatnonzero(~observed)
        if missing_pos.size == 0 or not observed.any():
            # Nothing to fill, or nobody to borrow a value from.
            continue

        # Distances and donor values are read from the untouched input so
        # that values imputed earlier never influence later imputations.
        features = df.drop(columns=[col])
        donor_values = df[col].to_numpy(dtype=float, na_value=np.nan)

        # One call per column: row i holds the distances from the i-th
        # incomplete row to every row of the frame.
        distances = pairwise_distances(features.iloc[missing_pos], features,
                                       metric='nan_euclidean')

        for row_pos, row_distances in zip(missing_pos, distances):
            # Only rows that hold a value for `col` can donate; this also
            # excludes the row itself. A NaN distance means the two rows
            # share no observed feature, so that row is unreachable.
            candidates = np.flatnonzero(observed & ~np.isnan(row_distances))
            # Stable sort keeps ties in row order, making results repeatable.
            order = np.argsort(row_distances[candidates], kind='stable')
            nearest = candidates[order[:max(k, 0)]]
            if nearest.size:
                df_filled.iloc[row_pos, col_pos] = donor_values[nearest].mean()

    return df_filled

### Applying Custom KNN Imputer

In [4]:
# Impute the toy frame with the hand-written routine, averaging the two
# nearest rows (the same neighbour count the built-in imputer is given below).
df_custom_imputed = custom_knn_imputer(df, k=2)
print(f'\nCustom KNN Imputed DataFrame:\n\n{df_custom_imputed}')


Custom KNN Imputed DataFrame:

   weight  height        BMI  overweight
0    70.1   154.9  29.215000         2.0
1    64.9   157.8  28.861057         2.0
2    66.2   164.7  24.736333         1.0
3    67.5   169.9  23.383904         1.0
4    68.4   154.9  28.507115         2.0
5    77.7   173.8  25.722992         2.0


### Using the Built-in `KNNImputer` Class

In [5]:
# Reference result: scikit-learn's imputer with two equally weighted
# neighbours and NaN-aware Euclidean distance.
knn_imputer = KNNImputer(
    n_neighbors=2,
    metric='nan_euclidean',
    weights='uniform',
)
# fit_transform returns a bare array, so the column labels are re-attached.
df_inbuilt_imputed = pd.DataFrame(
    data=knn_imputer.fit_transform(df),
    columns=columns,
)
print(f'\nInbuilt KNN Imputed DataFrame:\n\n{df_inbuilt_imputed}')


Inbuilt KNN Imputed DataFrame:

   weight  height        BMI  overweight
0    70.1   154.9  29.215000         2.0
1    64.9   157.8  28.861057         2.0
2    66.2   164.7  24.736333         1.0
3    67.5   169.9  23.383904         1.0
4    68.4   154.9  28.507115         2.0
5    77.7   173.8  25.722992         2.0


### Comparing the Outcomes

In [6]:
# The two frames come from independent floating-point computations, so they
# are compared with a numerical tolerance rather than bit-for-bit: a
# difference in the last digit should not be reported as a mismatch.
# Shape and labels must still match exactly.
comparison = bool(
    df_custom_imputed.shape == df_inbuilt_imputed.shape
    and df_custom_imputed.index.equals(df_inbuilt_imputed.index)
    and df_custom_imputed.columns.equals(df_inbuilt_imputed.columns)
    and np.allclose(
        df_custom_imputed.to_numpy(dtype=float),
        df_inbuilt_imputed.to_numpy(dtype=float),
        equal_nan=True,  # a cell left unfilled by both counts as equal
    )
)
print(f'\nImputed DF equal? :{comparison}')


Imputed DF equal? :True


## Task-2


### Importing Libraries and Loading Dataset

### Identifying Missing Values

In [8]:
# Load the Week-2 Iris CSV and list the columns that contain at least one
# missing entry. 'Id' is removed from that list if it shows up there.
df_iris = pd.read_csv('../Datasets/Week-2/iris.csv')
missing_columns = (
    df_iris.columns[df_iris.isna().any(axis=0)]
    .drop(['Id'], errors='ignore')
)
print(f'Columns with missing values:{missing_columns}')

Columns with missing values:Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')


### Defining Custom KNN Imputer Function

In [9]:
def custom_knn_imputer(df, k=3, id_columns=('Id',)):
    """Fill missing values with the mean of the ``k`` nearest donor rows.

    For every non-identifier column, each row whose entry is missing
    receives the plain (unweighted) mean of that column taken over the ``k``
    closest rows in which the column is observed ("donors"). Closeness is
    the NaN-aware Euclidean distance computed over the other non-identifier
    columns. This is the same neighbour-selection rule scikit-learn's
    ``KNNImputer`` uses with ``weights='uniform'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-identifier columns are numeric and may contain
        NaNs. It is not modified. Any index is accepted; rows are addressed
        by position internally.
    k : int, default 3
        Number of donor rows to average. Fewer are used when fewer donors
        are reachable.
    id_columns : str or iterable of str, default ('Id',)
        Columns that are neither imputed nor used for distances. Names that
        do not exist in ``df`` are ignored, so the function also works on
        frames without an ``'Id'`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every missing entry that has at least one
        reachable donor is filled. Entries without any donor, and all
        identifier columns, are left as they were.
    """
    if isinstance(id_columns, str):
        id_columns = (id_columns,)

    df_filled = df.copy()
    ignored = [name for name in id_columns if name in df.columns]
    # Distances and donor values are read from the untouched input so that
    # values imputed earlier never influence later imputations.
    values = df.drop(columns=ignored)

    for col in values.columns:
        observed = values[col].notna().to_numpy()
        missing_pos = np.flatnonzero(~observed)
        if missing_pos.size == 0 or not observed.any():
            # Nothing to fill, or nobody to borrow a value from.
            continue

        features = values.drop(columns=[col])
        donor_values = values[col].to_numpy(dtype=float, na_value=np.nan)
        col_pos = df_filled.columns.get_loc(col)

        # One call per column: row i holds the distances from the i-th
        # incomplete row to every row of the frame.
        distances = pairwise_distances(
            features.iloc[missing_pos],
            features,
            metric='nan_euclidean')

        for row_pos, row_distances in zip(missing_pos, distances):
            # Only rows that hold a value for `col` can donate; this also
            # excludes the row itself. A NaN distance means the two rows
            # share no observed feature, so that row is unreachable.
            candidates = np.flatnonzero(observed & ~np.isnan(row_distances))
            # Stable sort keeps ties in row order, making results repeatable.
            order = np.argsort(row_distances[candidates], kind='stable')
            nearest = candidates[order[:max(k, 0)]]
            if nearest.size:
                df_filled.iloc[row_pos, col_pos] = donor_values[nearest].mean()

    return df_filled

### Applying Custom KNN Imputer

In [10]:
# Identifier column(s) that the built-in imputer cell must remove before fitting.
dropped_columns = ['Id']

# Impute the Iris frame with the hand-written routine (three neighbours) and
# preview the first five rows.
df_custom_imputed = custom_knn_imputer(df_iris, k=3)
print('\nCustom KNN Imputed DataFrame:\n', df_custom_imputed.head(5))


Custom KNN Imputed DataFrame:
    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0   1            5.1           3.5            1.4           0.2
1   2            4.9           3.0            1.4           0.2
2   3            4.7           3.2            1.3           0.2
3   4            4.6           3.1            1.5           0.2
4   5            5.0           3.6            1.4           0.2


### Using the Built-in `KNNImputer` Class

In [11]:
# Reference result: scikit-learn's imputer with three equally weighted
# neighbours and NaN-aware Euclidean distance.
knn_imputer = KNNImputer(
    n_neighbors=3,
    metric='nan_euclidean',
    weights='uniform',
)
# The identifier column is removed before fitting so it cannot affect the
# distances; fit_transform returns a bare array, so labels are re-attached.
df_inbuilt_imputed = pd.DataFrame(
    data=knn_imputer.fit_transform(df_iris.drop(columns=dropped_columns)),
    columns=df_iris.columns.drop(dropped_columns),
)
# Put the untouched identifier back as the first column.
df_inbuilt_imputed.insert(loc=0, column='Id', value=df_iris['Id'])
print(f'\nInbuilt KNN Imputed DataFrame:\n{df_inbuilt_imputed.head()}')


Inbuilt KNN Imputed DataFrame:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0   1            5.1           3.5            1.4           0.2
1   2            4.9           3.0            1.4           0.2
2   3            4.7           3.2            1.3           0.2
3   4            4.6           3.1            1.5           0.2
4   5            5.0           3.6            1.4           0.2
