# Fill Missing Values using KNN Imputation

In [2]:
import os
import pandas as pd
from sklearn.impute import KNNImputer

# Define the input and output folder paths
input_folder = "Input_Folder_for_GNN"
output_folder = "Input_Folder_without_Nan_KNN"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Define the KNN Imputer (each missing value is estimated from 5 neighbours)
imputer = KNNImputer(n_neighbors=5)

# Process each CSV file in the input folder. os.listdir() returns entries in
# arbitrary order, so sort them to make the run (and its log) reproducible.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".csv"):  # Ensure only CSV files are processed
        continue

    file_path = os.path.join(input_folder, file_name)

    # Read the dataset
    data = pd.read_csv(file_path)

    # Check if 'From Date' column exists
    if 'From Date' not in data.columns:
        print(f"'From Date' column not found in {file_name}, skipping file.")
        continue

    # Save the 'From Date' column separately; it is not a feature to impute
    from_date = data['From Date']

    # Every remaining column is handed to the imputer.
    # NOTE(review): this assumes all other columns are numeric - confirm.
    numeric_data = data.drop(columns=['From Date'])

    # KNNImputer discards columns that contain no observed value at fit time,
    # so its output would have fewer columns than `numeric_data` and building
    # the DataFrame with the original column names would raise a ValueError.
    # Impute only the columns that have something to learn from.
    has_values = numeric_data.notna().any(axis=0)
    observed_columns = has_values[has_values].index
    empty_columns = has_values[~has_values].index

    print(f"Imputing missing values in {file_name} using KNN Imputer...")
    if len(empty_columns) > 0:
        print(
            f"Columns with no observed values in {file_name} "
            f"cannot be imputed and are left empty: {list(empty_columns)}"
        )

    if len(observed_columns) > 0:
        # Perform KNN imputation on the columns that have observed values
        imputed_data = imputer.fit_transform(numeric_data[observed_columns])
        imputed_df = pd.DataFrame(
            imputed_data,
            columns=observed_columns,
            index=numeric_data.index,
        )
        # Restore the original column layout; wholly empty columns come back
        # as NaN in their original positions.
        imputed_df = imputed_df.reindex(columns=numeric_data.columns)
    else:
        # Nothing can be fitted (e.g. a file with a header but no rows);
        # pass the data through unchanged instead of crashing.
        imputed_df = numeric_data.copy()

    # Combine the 'From Date' column back with the imputed data
    imputed_df.insert(0, 'From Date', from_date)

    # Report remaining missing values and save to the output folder
    print(imputed_df.isnull().sum())
    output_file_path = os.path.join(output_folder, file_name)
    imputed_df.to_csv(output_file_path, index=False)

print(f"KNN Imputation completed. Files saved in '{output_folder}'.")


Imputing missing values in UP002.csv using KNN Imputer...
From Date        0
PM2.5 (ug/m3)    0
PM10 (ug/m3)     0
NO (ug/m3)       0
NO2 (ug/m3)      0
SO2 (ug/m3)      0
CO (mg/m3)       0
Ozone (ug/m3)    0
RH (%)           0
WS (m/s)         0
WD (degree)      0
AT (degree C)    0
dtype: int64
Imputing missing values in UP003.csv using KNN Imputer...
From Date        0
PM2.5 (ug/m3)    0
PM10 (ug/m3)     0
NO (ug/m3)       0
NO2 (ug/m3)      0
SO2 (ug/m3)      0
CO (mg/m3)       0
Ozone (ug/m3)    0
RH (%)           0
WS (m/s)         0
WD (degree)      0
AT (degree C)    0
dtype: int64
Imputing missing values in UP004.csv using KNN Imputer...
From Date        0
PM2.5 (ug/m3)    0
PM10 (ug/m3)     0
NO (ug/m3)       0
NO2 (ug/m3)      0
SO2 (ug/m3)      0
CO (mg/m3)       0
Ozone (ug/m3)    0
RH (%)           0
WS (m/s)         0
WD (degree)      0
AT (degree C)    0
dtype: int64
Imputing missing values in UP008.csv using KNN Imputer...
From Date        0
PM2.5 (ug/m3)    0
PM10 (u