Preprocessing
- unwanted data
    - columns
        - all values unique (e.g. ID-like columns) -> drop
    - dependent variable has missing entries
        - drop the whole row
- independent column missing data (imputation)
    - more than 50% missing -> drop
    - less than 50% missing
        - numerical data (mean, median, algo)
            - `SimpleImputer` from `sklearn.impute` package
        - categorical data (mode, algo)
            - `SimpleImputer` from `sklearn.impute` package
- text data (encoding)
    - Text -> Numbers
        - Label Encoding
            - Dependent Column -> `LabelEncoder` from `sklearn.preprocessing` package
            - Independent Column -> `OrdinalEncoder` from `sklearn.preprocessing` package
        - One Hot Encoding
            - Dummy Variable = [0, 0, 1]
                - `OneHotEncoder` from `sklearn.preprocessing` package
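- normalisation
    - min-max (`MinMaxScaler` from `sklearn.preprocessing` package)
    - standard (`StandardScaler` from `sklearn.preprocessing` package)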
- skewness
    - log
    - box-cox

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer  

In [None]:
# Lift the column limit so displayed DataFrames show every column.
pd.options.display.max_columns = None

In [None]:
# Load the automobile dataset from GitHub; the first CSV column becomes the
# row index. The last expression previews the first five rows.
DATA_URL = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv'
df = pd.read_csv(DATA_URL, index_col=0)
df.head(5)

In [None]:
# Treat the literal '?' as the missing-value marker: swap it for NaN in place,
# then display the number of missing entries per column.
df.replace(to_replace='?', value=np.nan, inplace=True)
df.isna().sum()

In [None]:
# Per-column count of missing entries, kept for later inspection.
missing_values_cols = df.isna().sum()
# Display only the names of columns that have at least one missing entry.
missing_values_cols.loc[missing_values_cols.gt(0)].index.tolist()

In [None]:
# Rows with no value in the dependent column ('price') are dropped entirely.
# `subset` is passed as a list: a bare string is only accepted by newer pandas
# releases, and the list form matches the pipeline section further down.
df.dropna(subset=['price'], inplace=True)

In [None]:
# Hand-listed columns to inspect for missing values. The former leading
# `df.isnull().sum()` was removed: as a non-final expression in the cell its
# result was computed and silently discarded.
cols_with_missing = [
    'normalized-losses', 'num-of-doors', 'bore',
    'stroke', 'horsepower', 'peak-rpm',
]
# Display just those columns.
df[cols_with_missing]

In [None]:
# Columns imputed with the numeric imputer.
num_cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']

# One imputer per kind of column: mean for numeric data (the SimpleImputer
# default strategy, spelled out here) and most frequent value for categorical.
num_si = SimpleImputer(strategy='mean')
cat_si = SimpleImputer(strategy='most_frequent')

# Fit on the numeric columns and write the imputed values back into the frame.
imputed_numeric = num_si.fit_transform(df[num_cols])
df[num_cols] = imputed_numeric

In [None]:
# Fill missing 'num-of-doors' entries with the most frequent value. The column
# is selected with a list so the imputer receives a 2-D (single-column) frame.
door_col = ['num-of-doors']
df[door_col] = cat_si.fit_transform(df[door_col])

In [None]:
# Display the (rows, columns) shape of the single-column selection.
df.loc[:, ['num-of-doors']].shape

In [None]:
# Categorical feature columns: every object-dtype column except the target.
# 'price' is excluded by name rather than by slicing off the last entry, so
# the result does not depend on where 'price' sits among the object columns
# (or on whether it is object-typed at all).
cat_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'price'
]
# Display the categorical part of the frame.
df[cat_cols]

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
# Number of distinct values in each categorical column.
unique_counts = df[cat_cols].nunique()

# Columns with at most two distinct values are ordinal-encoded; the remaining
# ones are one-hot encoded.
ord_cols = [name for name in cat_cols if unique_counts[name] <= 2]
hot_encode_cols = [name for name in cat_cols if unique_counts[name] > 2]

# Report the cardinality of every column, then the two resulting groups.
for col, count in unique_counts.items():
    print(f'{col:20}{count}')
print(ord_cols)
print(hot_encode_cols)

In [None]:
# ordinal encoding: fit the encoder on the low-cardinality columns and
# overwrite them with their numeric codes
ordEnc = OrdinalEncoder()
encoded_values = ordEnc.fit_transform(df[ord_cols])
df[ord_cols] = encoded_values
# Preview the frame after encoding.
df.head(5)

In [None]:
# Display the list of columns earmarked for one-hot encoding.
hot_encode_cols

In [None]:
# One-hot encode 'body-style' as a demonstration: the first category is
# dropped and a dense array is returned. The result is only displayed,
# not written back to the frame.
make_hot_enc = OneHotEncoder(sparse_output=False, drop='first')
body_style = df[['body-style']]
make_hot_enc.fit_transform(body_style)

pipeline implementation

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Reload the raw data for the pipeline section. '?' is declared as a
# missing-value marker at parse time (`na_values`), so numeric columns that
# contain it are read as numbers with NaN instead of as strings; this also
# makes the target column 'price' numeric.
df = pd.read_csv(
    'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/automobile.csv',
    index_col=0,
    na_values='?',
)
# Rows without a target value cannot be used, so drop them.
df.dropna(subset=['price'], inplace=True)

In [None]:
# Separate the dependent column ('price') from the feature matrix.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Split the features by dtype: numeric columns versus everything else.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(exclude=np.number).columns)

# Among the non-numeric columns, those with at most two distinct values go to
# the ordinal encoder and the rest to the one-hot encoder.
category_counts = X[cat_cols].nunique()
ord_cols = [name for name in cat_cols if category_counts[name] <= 2]
hot_encode_cols = [name for name in cat_cols if category_counts[name] > 2]

print(num_cols)
print(ord_cols)
print(hot_encode_cols)

In [None]:
# Explicit column assignment for the preprocessing pipelines; these lists
# replace any previously computed `num_cols` / `hot_encode_cols`.
num_cols = [
    'symboling',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'compression-ratio',
    'city-mpg',
    'highway-mpg',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'normalized-losses',
]

hot_encode_cols = [
    'make',
    'body-style',
    'drive-wheels',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
]

In [None]:
# creating pipeline for preprocessing numeric columns:
# impute missing values (SimpleImputer defaults), then standardise
numeric_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)
# Display the pipeline.
num_pipeline

In [None]:
# Low-cardinality categorical columns: fill gaps with the most frequent
# value, then map categories to numeric codes.
ord_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
])

# Remaining categorical columns: same imputation, then dense one-hot encoding
# with the first category of each column dropped.
hot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse_output=False, drop='first')),
])

# compose the three pipelines, each applied to its own group of columns
column_groups = [
    ('ord', ord_pipeline, ord_cols),
    ('hot', hot_pipeline, hot_encode_cols),
    ('num', num_pipeline, num_cols),
]
preprocessor = ColumnTransformer(transformers=column_groups)

# Display the composed transformer.
preprocessor

In [None]:
# Fit the composed preprocessor on the feature matrix and display the
# transformed result.
X_prepared = preprocessor.fit_transform(X)
X_prepared