Steps for handling missing values

1. Import Libraries
2. Load data
3. Separate Input and Output attributes
4. Find the missing values and handle them in one of two ways
    a. Removing data
    b. Imputation


In [29]:
# Step 1: Import Libraries

import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer 

# Step 2: Load Data

# Read the car data set from the CSV file in the working directory.
datasets = pd.read_csv('Exercise-CarData.csv')

# Show the raw frame first, then the summary produced by describe().
print("\nData :\n", datasets)
summary_stats = datasets.describe()
print("\nData statistics\n", summary_stats)


Data :
       Unnamed: 0  Price   Age     KM FuelType   HP  MetColor  Automatic    CC  \
0              0  13500  23.0  46986   Diesel   90       1.0          0  2000   
1              1  13750  23.0  72937   Diesel   90       1.0          0  2000   
2              2  13950  24.0  41711   Diesel   90       NaN          0  2000   
3              3  14950  26.0  48000   Diesel   90       0.0          0  2000   
4              4  13750  30.0  38500   Diesel   90       0.0          0  2000   
...          ...    ...   ...    ...      ...  ...       ...        ...   ...   
1431        1431   7500   NaN  20544   Petrol   86       1.0          0  1300   
1432        1432  10845  72.0     ??   Petrol   86       0.0          0  1300   
1433        1433   8500   NaN  17016   Petrol   86       0.0          0  1300   
1434        1434   7250  70.0     ??      NaN   86       1.0          0  1300   
1435        1435   6950  76.0      1   Petrol  110       0.0          0  1600   

      Doors  Weigh

In [32]:
# The strings '??' and '????' are treated as missing-value placeholders:
# swap every exact occurrence for NaN so pandas recognises it as missing.
placeholder_tokens = ['??', '????']
datasets = datasets.replace(placeholder_tokens, np.nan)

# Preview the first ten rows after the replacement.
print(datasets.head(10))


   Unnamed: 0  Price   Age     KM FuelType   HP  MetColor  Automatic    CC  \
0           0  13500  23.0  46986   Diesel   90       1.0          0  2000   
1           1  13750  23.0  72937   Diesel   90       1.0          0  2000   
2           2  13950  24.0  41711   Diesel   90       NaN          0  2000   
3           3  14950  26.0  48000   Diesel   90       0.0          0  2000   
4           4  13750  30.0  38500   Diesel   90       0.0          0  2000   
5           5  12950  32.0  61000   Diesel   90       0.0          0  2000   
6           6  16900  27.0    NaN   Diesel  NaN       NaN          0  2000   
7           7  18600  30.0  75889      NaN   90       1.0          0  2000   
8           8  21500  27.0  19700   Petrol  192       0.0          0  1800   
9           9  12950  23.0  71138   Diesel  NaN       NaN          0  1900   

   Doors  Weight  
0  three    1165  
1      3    1165  
2      3    1165  
3      3    1165  
4      3    1170  
5      3    1170  
6      3

In [33]:
# Step 3: Separate Input and Output attributes

# Output (Y): the last column only.
Y = datasets.iloc[:, -1].to_numpy()

# Input (X): all rows, every column except the last.
X = datasets.iloc[:, :-1].to_numpy()

print("\n\nInput : \n", X)
print("\n\nOutput: \n", Y)



Input : 
 [[0 13500 23.0 ... 0 2000 'three']
 [1 13750 23.0 ... 0 2000 '3']
 [2 13950 24.0 ... 0 2000 '3']
 ...
 [1433 8500 nan ... 0 1300 '3']
 [1434 7250 70.0 ... 0 1300 '3']
 [1435 6950 76.0 ... 0 1600 '5']]


Output: 
 [1165 1165 1165 ... 1015 1015 1114]


In [51]:
# Step 4: Find the missing values and handle them in one of two ways

# 4a. Removing data: dropna() with its default arguments discards every row
# that contains at least one missing value (not only rows that are entirely
# null). The result is rebound to `datasets` instead of mutating in place.
datasets = datasets.dropna()
print("\nNew Data :", datasets)



New Data :       Unnamed: 0  Price   Age     KM FuelType   HP  MetColor  Automatic    CC  \
0              0  13500  23.0  46986   Diesel   90       1.0          0  2000   
1              1  13750  23.0  72937   Diesel   90       1.0          0  2000   
3              3  14950  26.0  48000   Diesel   90       0.0          0  2000   
4              4  13750  30.0  38500   Diesel   90       0.0          0  2000   
5              5  12950  32.0  61000   Diesel   90       0.0          0  2000   
...          ...    ...   ...    ...      ...  ...       ...        ...   ...   
1423        1423   7950  80.0  35821   Petrol   86       0.0          1  1300   
1424        1424   7750  73.0  34717   Petrol   86       0.0          0  1300   
1429        1429   8950  78.0  24000   Petrol   86       1.0          1  1300   
1430        1430   8450  80.0  23000   Petrol   86       0.0          0  1300   
1435        1435   6950  76.0      1   Petrol  110       0.0          0  1600   

      Doors  We

In [56]:
# Keep only the numeric columns, using the public select_dtypes() API rather
# than the private DataFrame._get_numeric_data() helper. 'bool' is listed next
# to 'number' because the private helper also keeps boolean columns.
# NOTE(review): columns that still hold strings at this point are dropped here;
# the array printed after this step has no KM, FuelType, HP or Doors values.
# Confirm that losing KM and HP is intended.
datasets = datasets.select_dtypes(include=['number', 'bool'])


In [62]:
# 4b. Imputation (replacing null values with the mean value of that attribute)

# Target: the last column only.
new_Y = datasets.iloc[:, -1].to_numpy()

# Features: all rows, every column except the last.
new_X = datasets.iloc[:, :-1].to_numpy()

print(new_X)
print(new_Y)


[[0.000e+00 1.350e+04 2.300e+01 1.000e+00 0.000e+00 2.000e+03]
 [1.000e+00 1.375e+04 2.300e+01 1.000e+00 0.000e+00 2.000e+03]
 [3.000e+00 1.495e+04 2.600e+01 0.000e+00 0.000e+00 2.000e+03]
 ...
 [1.429e+03 8.950e+03 7.800e+01 1.000e+00 1.000e+00 1.300e+03]
 [1.430e+03 8.450e+03 8.000e+01 0.000e+00 0.000e+00 1.300e+03]
 [1.435e+03 6.950e+03 7.600e+01 0.000e+00 0.000e+00 1.600e+03]]
[1165 1165 1165 ... 1065 1015 1114]


In [65]:

# Build an imputer that replaces NaN entries with the mean of their column.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Learn the per-column means from columns 1-5 of new_X. The slice matches the
# one passed to transform() later (1:6), so the fitted statistics line up with
# the columns they fill in; a wider slice such as 1:11 only works while numpy
# silently clamps it to the array's actual width.
imputer = imputer.fit(new_X[:, 1:6])
print(imputer)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)


In [67]:

# transform() applies the means learned by fit() to columns 1-5 of new_X;
# the filled-in values are then written back into those same columns.
imputed_columns = imputer.transform(new_X[:, 1:6])
new_X[:, 1:6] = imputed_columns

# Show the inputs after any NaN entries have been replaced by the column mean.
print("\n\nNew Input with Mean Value for NaN : \n\n", new_X)





New Input with Mean Value for NaN : 

 [[0.000e+00 1.350e+04 2.300e+01 1.000e+00 0.000e+00 2.000e+03]
 [1.000e+00 1.375e+04 2.300e+01 1.000e+00 0.000e+00 2.000e+03]
 [3.000e+00 1.495e+04 2.600e+01 0.000e+00 0.000e+00 2.000e+03]
 ...
 [1.429e+03 8.950e+03 7.800e+01 1.000e+00 1.000e+00 1.300e+03]
 [1.430e+03 8.450e+03 8.000e+01 0.000e+00 0.000e+00 1.300e+03]
 [1.435e+03 6.950e+03 7.600e+01 0.000e+00 0.000e+00 1.600e+03]]
