Preprocessing
- unwanted data
    - columns
        - every value is unique (e.g. an ID column) -> drop
    - dependent variable has missing entries
        - drop the whole row
- independent column missing data (imputation)
    - more than 50% missing -> drop the column
    - less than 50% missing
        - numerical data (mean, median, algo)
            - `SimpleImputer` from `sklearn.impute` package
        - categorical data (mode, algo)
            - `SimpleImputer` from `sklearn.impute` package
- text data (encoding)
    - Text -> Numbers
        - Label Encoding
            - Dependent Column -> `LabelEncoder` from `sklearn.preprocessing` package
            - Independent Column -> `OrdinalEncoder` from `sklearn.preprocessing` package
        - One Hot Encoding
            - Dummy Variable = [0, 0, 1]
                - `OneHotEncoder` from `sklearn.preprocessing` package
- normalisation (feature scaling)
    - min-max scaling
    - standardisation (z-score)
- skewness
    - log transform
    - Box-Cox transform

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer  

In [None]:
# Lift the column display limit so wide frames are rendered without truncated columns.
pd.set_option(
    'display.max_columns',
    None,
)

In [37]:
# Source CSV for the automobile dataset; its first column is used as the row index.
AUTOMOBILE_CSV_URL = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'
df = pd.read_csv(AUTOMOBILE_CSV_URL, index_col=0)

# Preview the first rows.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [38]:
# The file uses '?' as its missing-value marker; convert it to NaN so that
# pandas' null checks can see those entries.
df = df.replace('?', np.nan)

# Number of missing entries per column.
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [41]:
# Per-column NaN counts, kept in a variable so they can be filtered below.
missing_values_cols = df.isna().sum()

# Names of the columns that have at least one missing entry.
has_missing = missing_values_cols.gt(0)
missing_values_cols.loc[has_missing].index.tolist()

['normalized-losses',
 'num-of-doors',
 'bore',
 'stroke',
 'horsepower',
 'peak-rpm',
 'price']

In [14]:
# Rows with no `price` are removed rather than imputed (per the outline, `price`
# is treated as the dependent variable).
# `subset` is passed as a list: a bare string label is only accepted by newer
# pandas releases, while a list works on every version.
df = df.dropna(subset=['price'])

In [18]:
# Feature columns previously reported as containing missing values
# (everything in that list except the target `price`).
cols_with_missing = ['normalized-losses','num-of-doors','bore',
                     'stroke','horsepower','peak-rpm']

# Show only those columns. The earlier bare `df.isnull().sum()` line was removed:
# only a cell's last expression is displayed, so its result was discarded.
df[cols_with_missing]

Unnamed: 0,normalized-losses,num-of-doors,bore,stroke,horsepower,peak-rpm
0,,two,3.47,2.68,111,5000
1,,two,3.47,2.68,111,5000
2,,two,2.68,3.47,154,5000
3,164,four,3.19,3.40,102,5500
4,164,four,3.19,3.40,115,5500
...,...,...,...,...,...,...
200,95,four,3.78,3.15,114,5400
201,95,four,3.78,3.15,160,5300
202,95,four,3.58,2.87,134,5500
203,95,four,3.01,3.40,106,4800


In [35]:
# Two imputers: the default SimpleImputer fills with the column mean (numeric data),
# the second fills with the most frequent value (categorical data).
num_si = SimpleImputer()
cat_si = SimpleImputer(strategy='most_frequent')

num_cols = ['normalized-losses','bore','stroke','horsepower','peak-rpm']

# These columns appear to have been read as text because of the '?' placeholder
# (e.g. `normalized-losses` shows '?' in the raw preview). Parse them as numbers
# explicitly instead of relying on implicit conversion inside the imputer; a value
# that is not numeric raises here with a clear error. NaN entries are preserved.
df[num_cols] = df[num_cols].apply(pd.to_numeric)

# Replace the remaining NaN entries with each column's mean.
df[num_cols] = num_si.fit_transform(df[num_cols])

In [34]:
# Fill missing `num-of-doors` entries with the most frequent value.
# The column is selected as a one-column frame because the imputer works on 2-D input.
door_col = ['num-of-doors']
doors_filled = cat_si.fit_transform(df[door_col])
df[door_col] = doors_filled

In [32]:
# Sanity check: (rows, columns) of the one-column `num-of-doors` selection.
df.loc[:, ['num-of-doors']].shape

(201, 1)