Steps for Handling the missing value

1. Import Libraries
2. Load data
3. Seprate Input and Output attributes
4. Find the missing values and handle it in either way
    a. Removing data
    b. Imputation

In [1]:
# Step 1: Import Libraries

import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Step 2: Load Data
        
datasets = pd.read_csv('/content/drive/MyDrive/Dataset/Exercise-CarData.csv') 
print("\nData :\n",datasets)
print("\nData statistics\n",datasets.describe())


Data :
       Unnamed: 0  Price   Age     KM  ... Automatic    CC  Doors  Weight
0              0  13500  23.0  46986  ...         0  2000  three    1165
1              1  13750  23.0  72937  ...         0  2000      3    1165
2              2  13950  24.0  41711  ...         0  2000      3    1165
3              3  14950  26.0  48000  ...         0  2000      3    1165
4              4  13750  30.0  38500  ...         0  2000      3    1170
...          ...    ...   ...    ...  ...       ...   ...    ...     ...
1431        1431   7500   NaN  20544  ...         0  1300      3    1025
1432        1432  10845  72.0     ??  ...         0  1300      3    1015
1433        1433   8500   NaN  17016  ...         0  1300      3    1015
1434        1434   7250  70.0     ??  ...         0  1300      3    1015
1435        1435   6950  76.0      1  ...         0  1600      5    1114

[1436 rows x 11 columns]

Data statistics
         Unnamed: 0         Price  ...           CC      Weight
count  1

In [4]:
# Step 3: Seprate Input and Output attributes

# All rows, all columns except last 
X = datasets.iloc[:, :-1].values 
  
# Only last column  
Y = datasets.iloc[:, -1].values 

print("\n\nInput : \n", X) 
print("\n\nOutput: \n", Y) 



Input : 
 [[0 13500 23.0 ... 0 2000 'three']
 [1 13750 23.0 ... 0 2000 '3']
 [2 13950 24.0 ... 0 2000 '3']
 ...
 [1433 8500 nan ... 0 1300 '3']
 [1434 7250 70.0 ... 0 1300 '3']
 [1435 6950 76.0 ... 0 1600 '5']]


Output: 
 [1165 1165 1165 ... 1015 1015 1114]


In [5]:
# Step 4: Find the missing values and handle it in either way

# 4a. Removing the row with all null values

datasets.dropna(axis=0,how='all',inplace=True)
print("\nNew Data :",datasets)


#4b. Removing the row with any one null values

datasets.dropna(axis=0,how='any',inplace=True)


New Data :       Unnamed: 0  Price   Age     KM  ... Automatic    CC  Doors  Weight
0              0  13500  23.0  46986  ...         0  2000  three    1165
1              1  13750  23.0  72937  ...         0  2000      3    1165
2              2  13950  24.0  41711  ...         0  2000      3    1165
3              3  14950  26.0  48000  ...         0  2000      3    1165
4              4  13750  30.0  38500  ...         0  2000      3    1170
...          ...    ...   ...    ...  ...       ...   ...    ...     ...
1431        1431   7500   NaN  20544  ...         0  1300      3    1025
1432        1432  10845  72.0     ??  ...         0  1300      3    1015
1433        1433   8500   NaN  17016  ...         0  1300      3    1015
1434        1434   7250  70.0     ??  ...         0  1300      3    1015
1435        1435   6950  76.0      1  ...         0  1600      5    1114

[1436 rows x 11 columns]


In [6]:
updated_df = datasets;
updated_df['Age']=updated_df['Age'].fillna(updated_df['Age'].mean())
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1111 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1111 non-null   int64  
 1   Price       1111 non-null   int64  
 2   Age         1111 non-null   float64
 3   KM          1111 non-null   object 
 4   FuelType    1111 non-null   object 
 5   HP          1111 non-null   object 
 6   MetColor    1111 non-null   float64
 7   Automatic   1111 non-null   int64  
 8   CC          1111 non-null   int64  
 9   Doors       1111 non-null   object 
 10  Weight      1111 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 104.2+ KB


In [7]:
datasets

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [9]:
updated_df = datasets;
updated_df['Price']=updated_df['Price'].fillna(updated_df['Price'].mean())
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1111 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1111 non-null   int64  
 1   Price       1111 non-null   int64  
 2   Age         1111 non-null   float64
 3   KM          1111 non-null   object 
 4   FuelType    1111 non-null   object 
 5   HP          1111 non-null   object 
 6   MetColor    1111 non-null   float64
 7   Automatic   1111 non-null   int64  
 8   CC          1111 non-null   int64  
 9   Doors       1111 non-null   object 
 10  Weight      1111 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 104.2+ KB


In [10]:
datasets

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [11]:
new_X = datasets.iloc[:, :-1].values 
# Only last column  
new_Y = datasets.iloc[:, -1].values 

#  **Using SimpleImputer from Scikit-Learn Library **

In [12]:
# Step 1: Import Libraries

import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer 

In [13]:
# Step 2: Load Data
        
datasets = pd.read_csv('/content/drive/MyDrive/Dataset/Exercise-CarData.csv') 
print("\nData :\n",datasets)
print("\nData statistics\n",datasets.describe())


Data :
       Unnamed: 0  Price   Age     KM  ... Automatic    CC  Doors  Weight
0              0  13500  23.0  46986  ...         0  2000  three    1165
1              1  13750  23.0  72937  ...         0  2000      3    1165
2              2  13950  24.0  41711  ...         0  2000      3    1165
3              3  14950  26.0  48000  ...         0  2000      3    1165
4              4  13750  30.0  38500  ...         0  2000      3    1170
...          ...    ...   ...    ...  ...       ...   ...    ...     ...
1431        1431   7500   NaN  20544  ...         0  1300      3    1025
1432        1432  10845  72.0     ??  ...         0  1300      3    1015
1433        1433   8500   NaN  17016  ...         0  1300      3    1015
1434        1434   7250  70.0     ??  ...         0  1300      3    1015
1435        1435   6950  76.0      1  ...         0  1600      5    1114

[1436 rows x 11 columns]

Data statistics
         Unnamed: 0         Price  ...           CC      Weight
count  1

In [14]:
# Step 3: Seprate Input and Output attributes

# All rows, all columns except last 
X = datasets.iloc[:, :-1].values 
  
# Only last column  
Y = datasets.iloc[:, -1].values 

print("\n\nInput : \n", X) 
print("\n\nOutput: \n", Y) 



Input : 
 [[0 13500 23.0 ... 0 2000 'three']
 [1 13750 23.0 ... 0 2000 '3']
 [2 13950 24.0 ... 0 2000 '3']
 ...
 [1433 8500 nan ... 0 1300 '3']
 [1434 7250 70.0 ... 0 1300 '3']
 [1435 6950 76.0 ... 0 1600 '5']]


Output: 
 [1165 1165 1165 ... 1015 1015 1114]


In [15]:
# Step 4: Find the missing values and handle it in either way

# 4a. Removing the row with all null values

datasets.dropna(axis=0,how='all',inplace=True)
print("\nNew Data :",datasets)


#4b. Removing the row with any one null values

datasets.dropna(axis=0,how='any',inplace=True)


New Data :       Unnamed: 0  Price   Age     KM  ... Automatic    CC  Doors  Weight
0              0  13500  23.0  46986  ...         0  2000  three    1165
1              1  13750  23.0  72937  ...         0  2000      3    1165
2              2  13950  24.0  41711  ...         0  2000      3    1165
3              3  14950  26.0  48000  ...         0  2000      3    1165
4              4  13750  30.0  38500  ...         0  2000      3    1170
...          ...    ...   ...    ...  ...       ...   ...    ...     ...
1431        1431   7500   NaN  20544  ...         0  1300      3    1025
1432        1432  10845  72.0     ??  ...         0  1300      3    1015
1433        1433   8500   NaN  17016  ...         0  1300      3    1015
1434        1434   7250  70.0     ??  ...         0  1300      3    1015
1435        1435   6950  76.0      1  ...         0  1600      5    1114

[1436 rows x 11 columns]


In [16]:
# 4b. Imputation (Replacing null values with mean value of that attribute)

# All rows, all columns except last 
new_X = datasets.iloc[:, :-1].values 
  
# Only last column  
new_Y = datasets.iloc[:, -1].values 


updated_df['Age'].fillna(updated_df['Age'].mean())

# Using Imputer function to replace NaN values with mean of that parameter value 
imputer = SimpleImputer(missing_values = np.nan,strategy = "mean")

# Fitting the data, function learns the stats 
imputer = imputer.fit(new_X[:, 1:3]) 
  
# fit_transform() will execute those stats on the input ie. X[:, 1:3] 
new_X[:, 1:3] = imputer.transform(new_X[:, 1:3]) 
  
# filling the missing value with mean 
print("\n\nNew Input with Mean Value for NaN : \n\n", new_X) 



New Input with Mean Value for NaN : 

 [[0 13500.0 23.0 ... 0 2000 'three']
 [1 13750.0 23.0 ... 0 2000 '3']
 [3 14950.0 26.0 ... 0 2000 '3']
 ...
 [1430 8450.0 80.0 ... 0 1300 '3']
 [1432 10845.0 72.0 ... 0 1300 '3']
 [1435 6950.0 76.0 ... 0 1600 '5']]
