In [None]:
# Show ALL expression outputs in a cell, not only the last result.
# The setting is applied on the InteractiveShell class, so it holds for the rest of the session.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
relative_filepath = "../../"

In [None]:
# Set relative path mapping for module imports:
# appending `relative_filepath` to sys.path lets project-local packages
# (e.g. the `helpers` package imported further down) be resolved from this notebook.
import sys

sys.path.append(relative_filepath)

# Debug aid: uncomment to list the resulting module search path.
# for path in sys.path:
#     print(path)

In [None]:
# External Dependencies
import numpy as np
import pandas as pd

In [None]:
# Read in pickled combined data
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
X_y_data = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_data.pkl")

# Read in pickled train data
X_y_train = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_train.pkl")

# Read in pickled test data
X_y_test = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_test.pkl")

# Recap data structure (first rows, then (rows, columns) of the combined frame)
X_y_data.head()
X_y_data.shape

In [None]:
import json

# Load the missing-data dictionary from the reports folder.
# The context manager closes the file handle even if JSON parsing fails
# (a bare json.load(open(...)) leaves the handle open until garbage collection).
with open(relative_filepath + "reports/dicts/dict_ml_missing_data.json", encoding="utf-8") as json_file:
    dict_ml_missing_data = json.load(json_file)

In [None]:
# Values for config dict
# The combined, train and test frames loaded from the pickles, collected in one list.
input_dfs = [X_y_data,
             X_y_train,
             X_y_test]

# Name of the target column — presumably a column of the frames above; TODO confirm.
target = "classLabel"

## Data Cleaning Checklist

In [None]:
https://elitedatascience.com/data-cleaning

Remove Unwanted observations
    Duplicate observations
    Irrelevant observations
    
Fix Structural Errors

Filter Unwanted Outliers

Handle Missing Data
    Missing categorical data
    Missing numeric data

In [None]:
# Imports

In [None]:
# Input data
X_train, y_train

In [None]:
# Finalised preprocessing handlers

# Numeric  handlers
def num_imputation_handler(X):
    """Stub for numeric imputation; not implemented yet, returns None."""
    return None

def power_transform_handler(X):
    """Stub for power transforms; not implemented yet, returns None."""
    return None

def outlier_handler(X):
    """Stub for outlier handling; not implemented yet, returns None."""
    return None

# Categorical handlers
def cat_imputation_handler(X):
    """Stub for categorical imputation; not implemented yet, returns None."""
    return None

def label_encoding_handler(df):
    """Stub for label encoding; not implemented yet, returns None."""
    return None

def one_hot_encoding_handler(df):
    """Stub for one-hot encoding; not implemented yet, returns None."""
    return None

def ordinal_encoding_handler(df):
    """Stub for ordinal encoding; not implemented yet, returns None."""
    return None

def target_encoding_handler(df):
    """Stub for target encoding; not implemented yet, returns None."""
    return None

# Text handlers
def vectorizer_handler(df):
    """Stub for text vectorisation; not implemented yet, returns None."""
    return None

# Model input handlers
def scaling_handler(X):
    """Stub for feature scaling; not implemented yet, returns None."""
    return None

def imbalance_handler(df):
    """Stub for class-imbalance handling; not implemented yet, returns None."""
    return None

In [None]:
# Determine categorical and numerical features from the column dtypes.
# NOTE(review): `X` is not assigned in any earlier cell of this notebook (it is first
# built much further down) — confirm which frame is meant before running this cell.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

In [None]:
## FILL WITH RELEVANT ##
# Template cell: FUNCTION is a placeholder name that is not defined in the visible notebook;
# replace each occurrence with the matching handler before running.
# NOTE(review): FunctionTransformer is only imported in a later cell — confirm execution order.

# Column dtypes selector
# (the selector lambdas below look these lists up when they are called, not when they are created)
numerical_cols = []
# imputation_cols = []
# power_transform_cols = []
# outlier_cols = []
# scaling_cols = []
categorical_cols = []
text_cols = []

# Function transformers for numeric pipeline
get_numeric_data = FunctionTransformer(lambda x: x[numerical_cols], validate=False)
apply_num_imputations = FunctionTransformer(FUNCTION, validate=False)
apply_power_transforms = FunctionTransformer(FUNCTION, validate=False)
apply_outlier_handling = FunctionTransformer(FUNCTION, validate=False)
apply_scaling = FunctionTransformer(FUNCTION, validate=False)
apply_balancing = FunctionTransformer(FUNCTION, validate=False)

# Function transformers for categorical pipeline
get_categorical_data = FunctionTransformer(lambda x: x[categorical_cols], validate=False)
apply_cat_imputations = FunctionTransformer(FUNCTION, validate=False) #SimpleImputer(strategy='most_frequent', fill_value='categorical', missing_values=np.nan)
apply_label_encoding = FunctionTransformer(FUNCTION, validate=False)
apply_one_hot_encoding = FunctionTransformer(FUNCTION, validate=False)
apply_ordinal_encoding = FunctionTransformer(FUNCTION, validate=False)

# Function transformers for text pipeline
get_text_data = FunctionTransformer(lambda x: x[text_cols], validate=False)
apply_vectorizer = FunctionTransformer(FUNCTION, validate=False)

In [None]:
# Individual dtype pipelines: each one selects its columns first and then
# applies the dtype-specific function transformers in order.
numeric_transformer = Pipeline([
    ('selector', get_numeric_data),
    # The numeric imputation transformer is named apply_num_imputations;
    # `apply_imputations` does not exist and raised a NameError.
    ('imputer', apply_num_imputations),
    ('power_transformer', apply_power_transforms),
    ('outliers', apply_outlier_handling)
])

categorical_transformer = Pipeline([
    ('selector', get_categorical_data),
    ('imputer', apply_cat_imputations),
    ('le', apply_label_encoding),
    ('ohe', apply_one_hot_encoding),
    ('ordinal', apply_ordinal_encoding)
])

text_transformer = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', apply_vectorizer),
])

In [None]:
# Preprocessing pipeline with feature union
# No import of FeatureUnion is visible elsewhere in this notebook, so bring it
# into scope here; without it this cell fails with a NameError.
from sklearn.pipeline import FeatureUnion

# Each dtype pipeline runs on the full input and the outputs are concatenated column-wise.
preprocessor_pl = FeatureUnion(transformer_list=[
        ('numeric', numeric_transformer),
        ('categorical', categorical_transformer),
        ('text', text_transformer)
    ])

preprocessor_pl_result = preprocessor_pl.fit_transform(X_train)
type(preprocessor_pl_result)
preprocessor_pl_result.shape

In [None]:
# Full pipeline
# No import of FeatureUnion is visible elsewhere in this notebook, so bring it into scope here.
from sklearn.pipeline import FeatureUnion

preprocessor_pl = Pipeline([
    # The per-dtype pipelines are named *_transformer in this notebook; the
    # *_pipeline names used here before were never defined (NameError).
    ('union', FeatureUnion(transformer_list=[
        ('numeric', numeric_transformer),
        ('categorical', categorical_transformer),
        ('text', text_transformer)
    ])),

    # Optional later stages, kept disabled as in the original template:
#     ('scaler', apply_scaling),
#     ('imbalance', apply_balancing),
#     ('clf', LogisticRegression())
])

preprocessor_pl_result = preprocessor_pl.fit_transform(X_train)
type(preprocessor_pl_result)
preprocessor_pl_result.shape

In [None]:
# Preprocessing pipeline with column transformer
# Each dtype pipeline is applied only to its own column list.
preprocessor_pl = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

preprocessor_pl_result = preprocessor_pl.fit_transform(X_train)
type(preprocessor_pl_result)
preprocessor_pl_result.shape

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline([
    ('preprocessor', preprocessor_pl),
    ('classifier', LogisticRegression())
])

# NOTE(review): the split happens only after the preprocessor was already fitted on X_train above,
# and `X` / `y` are not assigned in any earlier cell of this notebook — confirm the intended order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

# Grid keys follow the <step>__<sub-step>__<parameter> naming of the nested pipeline.
# NOTE(review): the 'imputer' step of the template numeric pipeline is a FunctionTransformer;
# confirm it exposes a `strategy` parameter before searching over it.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

# NOTE(review): no import of GridSearchCV is visible in this notebook, and the search object
# is only constructed and displayed here — it is never fitted.
grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search

In [None]:
from IPython.display import display

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config

set_config(display='diagram')   

# Defining an example pipeline
model = Pipeline([('transformer', FunctionTransformer(lambda x: 2*x)), ('clf', LogisticRegression())])

display(model)

In [None]:
# Display HTML
from IPython.display import Image
from IPython.core.display import HTML 

Image(url= "https://assets.datacamp.com/production/repositories/4983/datasets/238dde66d8af1b7ebd8ffe82de9df60ad6a68d22/preprocessing3.png")

## Discovery for Building Preprocessing Handlers

### Numerical transformers

In [None]:
def num_imputation_handler(X):
    """Placeholder for the numeric imputation handler; not implemented yet (returns None)."""
    pass

In [None]:
def power_transform_handler(X):
    """Placeholder for the power-transform handler; not implemented yet (returns None)."""
    pass

def CODE

from scipy.stats import boxcox

y,fitted_lambda= boxcox(y,lmbda=None)

In sklearn:
    
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='box-cox')
data = pt.fit_transform(data)

In SciPy:
    
from scipy.stats import yeojohnson

y,fitted_lambda = yeojohnson(y,lmbda=None)

In Sklearn:
    
We can apply the transform by defining a PowerTransformer object and setting the “method” argument to “yeo-johnson”.

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='yeo-johnson')
data = pt.fit_transform(data)

In [None]:
→ The ladder of powers
Data transformations are commonly power transformations, $x' = x^{\theta}$ (where $x'$ is the transformed $x$).

In [None]:
Image(url= "https://miro.medium.com/max/656/1*8jUUiaF9dD9ZiLzH8e_9jA.png")

In [None]:
Image(url= "https://miro.medium.com/max/872/1*Jwpotn5OKYfkzoGQFYKunA.jpeg")

In [None]:
Image(url= "https://miro.medium.com/max/1400/1*RRZ4lakWAhBWRMC9r1r0Ew.jpeg")

In [None]:
def outlier_handler(X):
    """Placeholder for the outlier handler; not implemented yet (returns None)."""
    pass

### Categorical transformers

In [None]:
def cat_imputation_handler(X):
    """Placeholder for the categorical imputation handler; not implemented yet (returns None)."""
    pass

In [None]:
def label_encoding_handler(df):
    """Placeholder for the label-encoding handler; not implemented yet (returns None)."""
    pass

from sklearn import preprocessing

# label encode target variable
# Only the `preprocessing` module is imported, so the class has to be reached
# through it; a bare `LabelEncoder()` raised a NameError.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

In [44]:
def one_hot_encoding_handler(df):
    """Stub for one-hot encoding; not implemented yet, returns None."""
    return None

import pandas as pd


# Toy raw data: one numeric column and two categorical columns.
# The test frame contains only a subset of the categories seen in training.
dict_X_train = dict(
    num_col_1=[1, 2, 3, 4],
    cat_col_1=['Tom', 'nick', 'krish', 'jack'],
    cat_col_2=['A', 'B', 'C', 'C'],
)

dict_X_test = dict(
    num_col_1=[1, 2, 3],
    cat_col_1=['krish', 'jack', 'krish'],
    cat_col_2=['A', 'B', 'B'],
)

# Build both DataFrames from the raw dictionaries
X_train, X_test = (pd.DataFrame(raw) for raw in (dict_X_train, dict_X_test))

# Show the training frame
X_train.head()
# X_test.head()

# X_train_ohe = X_train

# Columns that are to be one-hot encoded
categorical_columns = [name for name in dict_X_train if name.startswith('cat_')]

# for col in categorical_columns:
#     col_ohe = pd.get_dummies(X_train[col], prefix=col, drop_first=True)
#     X_train_ohe = pd.concat((X_train_ohe, col_ohe), axis=1).drop(col, axis=1)

# X_train.head()
# X_train_ohe.head()

In [45]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class My_encoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and return a DataFrame.

    The wrapped ``OneHotEncoder`` is fitted once (typically on the training
    frame) and then re-applied to other frames, so every transformed frame
    gets the same dummy columns even when it lacks some categories.

    Parameters
    ----------
    drop : forwarded to ``OneHotEncoder(drop=...)``.
    sparse : forwarded to ``OneHotEncoder(sparse=...)``; sparse results are
        densified in ``transform`` because the output is a DataFrame.

    Attributes
    ----------
    encoder : the wrapped ``OneHotEncoder``.
    features_to_encode : list of column names given to ``fit``.
    columns : ``pandas.Index`` with the names of the generated dummy columns.
    """

    def __init__(self, drop ='first', sparse=False):
        # Keep the constructor arguments as attributes of the same name so the
        # BaseEstimator machinery (get_params / repr / clone) can read them back.
        self.drop = drop
        self.sparse = sparse
        self.encoder = OneHotEncoder(drop=drop, sparse=sparse)
        self.features_to_encode = []
        self.columns = []

    def fit(self, X_train, features_to_encode):
        """Fit the wrapped encoder on ``X_train[features_to_encode]``.

        Returns ``self`` (not the inner encoder) so calls can be chained as
        with any other transformer.
        """
        self.features_to_encode = list(features_to_encode)
        self.encoder.fit(X_train[self.features_to_encode])
        self.columns = self._encoded_column_names()
        return self

    def _encoded_column_names(self):
        """Build ``<feature>_<category>`` names for the columns the fitted encoder emits."""
        # Derive the names from the fitted encoder itself, so they stay correct for
        # any `drop` setting and for non-string columns.
        dropped = getattr(self.encoder, "drop_idx_", None)
        names = []
        for position, (feature, categories) in enumerate(
                zip(self.features_to_encode, self.encoder.categories_)):
            skipped = None if dropped is None else dropped[position]
            for cat_position, category in enumerate(categories):
                if skipped is not None and cat_position == skipped:
                    continue  # this category is the dropped reference level
                names.append("{}_{}".format(feature, category))
        return pd.Index(names)

    def transform(self, X_test):
        """Return ``X_test`` with the fitted columns replaced by their dummy columns.

        The input frame is not modified. The result has a fresh 0..n-1 index
        (the original index is discarded, as before).
        """
        data = X_test.reset_index(drop=True)
        encoded = self.encoder.transform(data[self.features_to_encode])
        if hasattr(encoded, "toarray"):
            # sparse=True yields a scipy sparse matrix; densify before building the frame
            encoded = encoded.toarray()

        data_encoded = pd.DataFrame(encoded, columns=self.columns)
        data_left = data.drop(self.features_to_encode, axis = 1)

        return pd.concat([data_left, data_encoded],axis = 1)

# categorical_columns = [---list of features to one hot encode--]
enc = My_encoder()
enc.fit(X_train, categorical_columns)

X_train_ohe = enc.transform(X_train)
X_train_ohe.head()

Unnamed: 0,num_col_1,cat_col_1_jack,cat_col_1_krish,cat_col_1_nick,cat_col_2_B,cat_col_2_C
0,1,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,1.0,1.0,0.0
2,3,0.0,1.0,0.0,0.0,1.0
3,4,1.0,0.0,0.0,0.0,1.0


In [46]:
X_test_ohe = enc.transform(X_test)
X_test_ohe.head()

Unnamed: 0,num_col_1,cat_col_1_jack,cat_col_1_krish,cat_col_1_nick,cat_col_2_B,cat_col_2_C
0,1,0.0,1.0,0.0,0.0,0.0
1,2,1.0,0.0,0.0,1.0,0.0
2,3,0.0,1.0,0.0,1.0,0.0


In [12]:
def ordinal_encoding_handler(df):
    """Placeholder for the ordinal-encoding handler; not implemented yet (returns None)."""
    pass

import category_encoders as ce
import pandas as pd

train_df=pd.DataFrame({'Degree':['High school','Masters','Diploma','Bachelors','Bachelors','Masters','Phd','High school','High school']})

# create object of Ordinalencoding
# The mapping keys must match the category spellings in the data exactly:
# the data contains 'Phd', so a 'phd' key left that category without a mapping entry.
encoder= ce.OrdinalEncoder(cols=['Degree'],return_df=True,
                           mapping=[{'col':'Degree',
'mapping':{'None':0,'High school':1,'Diploma':2,'Bachelors':3,'Masters':4,'Phd':5}}])

#Original data
train_df

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
def target_encoding_handler(df):
    """Placeholder for the target-encoding handler; not implemented yet (returns None)."""
    pass

### Text transformers

In [None]:
# Text handlers
def vectorizer_handler(df):
    """Placeholder for the text vectorizer handler; not implemented yet (returns None)."""
    pass

### Model input transformers

In [None]:
# Model input handlers
def scaling_handler(X):
    """Placeholder for the scaling handler; not implemented yet (returns None)."""
    pass

In [None]:
def imbalance_handler(df):
    """Placeholder for the class-imbalance handler; not implemented yet (returns None)."""
    pass

In [None]:
##############
###################
#################

In [None]:
# Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 
from sklearn import set_config                      # to change the display
from sklearn.utils import estimator_html_repr       # to save the diagram into HTML format

# Validation
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Classification models
from sklearn.linear_model import LogisticRegression

# Evaluation / Scoring metrics
from sklearn.metrics import accuracy_score

In [None]:
# from sklearn.datasets import make_classification, load_breast_cancer

# X, y = load_breast_cancer(return_X_y = True, as_frame=True)
# X.head()

In [None]:
from sklearn.datasets import load_breast_cancer

# load data
# load_boston is deprecated and has been removed from recent scikit-learn releases, and its
# continuous MEDV target cannot be used by the stratified split, LogisticRegression and
# accuracy_score cells that follow. The breast-cancer classification set (the one sketched
# in the commented-out cell above) works with all of them.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# inspect data
X.head()

In [None]:
# Split into train & test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
#                                                     stratify=y,
                                                    random_state=11)

In [None]:
X_train.dtypes
X_train.shape
X_train.info()

In [None]:
cat_features = X_train.select_dtypes(include=['object']).columns
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns

# cat_features = []
# num_features = []

# print(cat_features)
# print(num_features)

In [None]:
len(cat_features)
len(num_features)

In [None]:
print(X_train.isnull().sum())

In [None]:
from helpers.preprocessing.outliers import boxplot_numeric_features, IQR_Outliers, CustomSampler_IQR

In [None]:
###
rows_for_plot = 6
cols_for_plot = 5
###

boxplot_numeric_features(X_train,
                         rows_for_plot=rows_for_plot,
                         cols_for_plot=cols_for_plot)

In [None]:
from scipy import stats
import numpy as np

X = X_train
z_score_thresh = 3

print("Shape before IQR outlier removal:", X.shape)

print("Shape after IQR outlier removal:", X_o.shape)

In [None]:
from scipy import stats
import numpy as np

X = X_train
z_score_thresh = 3

print("Shape before Z-score outlier removal:", X.shape)

print("Shape after Z-score outlier removal:", X_o.shape)

In [None]:
# IQR_Outliers(X_train)

In [None]:
# CustomSampler_IQR(X_train, y_train)

In [None]:
# Split into train & test
# NOTE(review): stratify=y only works for a class-label target — confirm y is categorical here
# (the earlier split in this notebook has this argument commented out).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=11)

X_train.head()

In [None]:
# Baseline: evaluate the model on the training dataset as-is
# (this cell removes no outliers; the LOF variant is in the next cell)
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)

# nothing was filtered in between, so this prints the same shapes again
print(X_train.shape, y_train.shape)

### Model for coefficients of features
model = LogisticRegression(random_state=11)
s_scaler = StandardScaler()
###

# Pipeline with Scaler: standardise the features, then fit the classifier
pipeline_scaler = Pipeline([
    ('scaler', s_scaler),
    ('model', model)
])

pipeline_scaler.fit(X_train, y_train)

# Accuracy on the data the pipeline was fitted on
y_train_pred = pipeline_scaler.predict(X_train)
print("Accuracy on Train set:", accuracy_score(y_train, y_train_pred), "\n")

# Accuracy on the held-out split
y_test_pred = pipeline_scaler.predict(X_test)
print("Accuracy on Test set:", accuracy_score(y_test, y_test_pred), "\n")

In [None]:
# evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)

# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

# # select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask], y_train[mask]

# # summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

### Model for coefficients of features
model = LogisticRegression(random_state=11)
s_scaler = StandardScaler()
###

# Pipeline with Scaler
pipeline_scaler = Pipeline([
    ('scaler', s_scaler),
    ('model', model)
])

pipeline_scaler.fit(X_train, y_train)

y_train_pred = pipeline_scaler.predict(X_train)
print("Accuracy on Train set:", accuracy_score(y_train, y_train_pred), "\n")

y_test_pred = pipeline_scaler.predict(X_test)
print("Accuracy on Test set:", accuracy_score(y_test, y_test_pred), "\n")

# # evaluate the model
# yhat = model.predict(X_test)

# # evaluate predictions
# mae = mean_absolute_error(y_test, yhat)
# print('MAE: %.3f' % mae)

In [None]:
# NOTE(review): this import rebinds the name `Pipeline` (previously imported from sklearn.pipeline)
# to the imblearn class for every cell executed afterwards — confirm that is intended.
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler

# Open question: should outliers be removed from the test dataset too? Without the outlier rows, no prediction can be made for them.
# Steps: sample away outliers with CustomSampler_IQR, impute remaining gaps with the median, then classify.
LR_Pipeline = Pipeline([
    ('Outlier_removal', FunctionSampler(func=CustomSampler_IQR, validate = False)),
    ('Imputer', SimpleImputer(strategy = "median")),
    ('LR',  LogisticRegression(C = 0.7, random_state = 42, max_iter = 1000))])

In [None]:
# Define categorical pipeline:
# fill gaps with the literal 'missing', then one-hot encode into a dense array;
# handle_unknown='ignore' lets the encoder accept categories it did not see during fit.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [None]:
# Define numerical pipeline:
# fill gaps with the column median, then rescale each feature with MinMaxScaler.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

In [None]:
# Combine categorical and numerical pipelines:
# each pipeline is applied only to its own column list (cat_features / num_features).
preprocessor = ColumnTransformer([
    ('cat_transformer', cat_pipe, cat_features),
    ('num_transformer', num_pipe, num_features)
])

In [None]:
# Fit a pipeline with transformers and an estimator to the training data
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

pipeline.fit(X_train, y_train)

# Predict training data with the pipeline fitted above
# (`pipe` was never defined and raised a NameError)
y_train_pred = pipeline.predict(X_train)
# print(f"Predictions on training data: {y_train_pred}")
print("Accuracy on Training set:", accuracy_score(y_train, y_train_pred), "\n")

# Predict the held-out data
y_test_pred = pipeline.predict(X_test)
# print(f"Predictions on test data: {y_test_pred}")
print("Accuracy on Test set:", accuracy_score(y_test, y_test_pred), "\n")

In [None]:
# set config to diagram for visualizing the pipelines/composite estimators
set_config(display='diagram')

# Lets visualize the pipeline
pipeline

In [None]:
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [None]:
imputer = SimpleImputer(strategy="median")

# num_features is the list of numerical columns computed from X_train's dtypes
# earlier in this notebook; `numeric_features` is not defined until a later cell.
X_train_num = X_train[num_features]
# Replace missing values with the column medians (returns a NumPy array)
X_train_num = imputer.fit_transform(X_train_num)

In [None]:
ordinal_encoder = OrdinalEncoder()

# cat_features is the list of categorical columns computed from X_train's dtypes
# earlier in this notebook; `categorical_features` is not defined until a later cell.
X_train_cat = X_train[cat_features]
X_train_cat_ord_encoded = ordinal_encoder.fit_transform(X_train_cat)
# Peek at encoded columns 1..9
X_train_cat_ord_encoded[:,1:10]

In [None]:
cat_encoder = OneHotEncoder()

X_train_cat_hot_encoded = cat_encoder.fit_transform(X_train_cat)
X_train_cat_hot_encoded

In [None]:
StandardScaler().fit_transform(X_train_num)

In [None]:
# Custom transformations
from sklearn.base import BaseEstimator, TransformerMixin

ratings_index = -2
reviews_index = -1
class NewVariablesAdder(BaseEstimator, TransformerMixin):
    """Append a ratings-over-reviews column to a 2-D array.

    Nothing is learned in ``fit``. The two source columns are picked with the
    module-level ``ratings_index`` and ``reviews_index``.
    """

    def __init__(self):
        """No settings to configure."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with one extra trailing column: rating divided by number of reviews."""
        ratings = X[:, ratings_index]
        reviews = X[:, reviews_index]
        return np.c_[X, ratings / reviews]

In [None]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
#     ('add_variables', NewVariablesAdder()),
    ('std_scaler', StandardScaler())
])

X_train_num_transformed = num_pipeline.fit_transform(X_train_num)

In [None]:
# Numeric columns go through num_pipeline, categorical columns are one-hot encoded.
# The column lists are the num_features / cat_features computed from X_train's dtypes
# earlier in this notebook; `num_vars` / `cat_vars` were never defined (NameError).
pipeline = ColumnTransformer([
    ('numerical', num_pipeline, num_features),
    ('categorical', OneHotEncoder(), cat_features),

])

# Keep the raw X_train intact: store the transformed matrix under its own name so
# this cell can be re-run and later cells still see the original frame.
X_train_prepared = pipeline.fit_transform(X_train)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

In [None]:
# Pipeline-step fragment: one branch per dtype, outputs concatenated by the FeatureUnion.
# The labels now match what each branch selects (they were swapped: the branch
# selecting np.number was called 'Cat Columns' and the "category" branch 'Numerics').
('features', FeatureUnion ([
     ('Numerics', Pipeline([
          ('Numeric Extractor', TypeSelector(np.number)),
                 ('Impute Zero', SimpleImputer(strategy="constant", fill_value=0))
                                    ])),
('Cat Columns', Pipeline([
      ('Category Extractor', TypeSelector("category")),
          ('Impute Missing', SimpleImputer(strategy="constant", fill_value='missing'))
          ]))        
     ]))

In [None]:
imputer = SimpleImputer(strategy = 'median', fill_value = 0)

In [None]:
numeric_features = ['age', 'fare']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('encoder', OrdinalEncoder())
])

## Duplicates

In [None]:
dict_data_cleaning

In [None]:
# generate count statistics of duplicate entries
from IPython.display import display

print ("## Number of duplicate rows ## \n")
# Count the rows that repeat an earlier row; computed once and reused below
n_duplicated = int(X_y_data.duplicated().sum())
if n_duplicated > 0:
    print("Number of duplicated observations: ", n_duplicated)
    # A bare expression nested inside an if-block is not auto-displayed by the
    # notebook, so show the preview explicitly. keep=False marks every member of a
    # duplicate group, and sorting by all columns puts identical rows side by side.
    display(X_y_data[X_y_data.duplicated(keep=False)].sort_values(by=list(X_y_data.columns)).head())
else:
    print("No duplicated observations found")

In [None]:
#X_y_data.drop_duplicates(inplace=True)

## Missing/Null Values

In [None]:
#dict_data_cleaning

In [None]:
# drop rows with a lot of missing values.
ind_missing = df[df['num_missing'] > 35].index
df_less_missing_rows = df.drop(ind_missing, axis=0)

In [None]:
# hospital_beds_raion has a lot of missing.
# If we want to drop.
cols_to_drop = ['hospital_beds_raion']
df_less_hos_beds_raion = df.drop(cols_to_drop, axis=1)

In [None]:
# replace missing values with the median.
med = df['life_sq'].median()
print(med)
df['life_sq'] = df['life_sq'].fillna(med)

In [None]:
# impute the missing values and create the missing value indicator variables for each numeric column.
# NOTE(review): `df` is not assigned anywhere in the visible notebook — confirm which frame is meant.
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

for col in numeric_cols:
    missing = df[col].isnull()      # boolean mask of the missing entries in this column
    num_missing = np.sum(missing)   # number of missing entries
    
    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
        # keep a '<col>_ismissing' flag so the information that a value was imputed is not lost
        df['{}_ismissing'.format(col)] = missing
        med = df[col].median()
        df[col] = df[col].fillna(med)

In [None]:
# impute the missing values and create the missing value indicator variables for each non-numeric column.
# NOTE(review): `df` is not assigned anywhere in the visible notebook — confirm which frame is meant.
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values

for col in non_numeric_cols:
    missing = df[col].isnull()      # boolean mask of the missing entries in this column
    num_missing = np.sum(missing)   # number of missing entries
    
    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
        # keep a '<col>_ismissing' flag so the information that a value was imputed is not lost
        df['{}_ismissing'.format(col)] = missing
        
        top = df[col].describe()['top'] # impute with the most frequent value.
        df[col] = df[col].fillna(top)

In [None]:
# categorical
df['sub_area'] = df['sub_area'].fillna('_MISSING_')


# numeric
df['life_sq'] = df['life_sq'].fillna(-999)

In [None]:
This article covers 7 ways to handle missing values in the dataset:

In [None]:
Deleting Rows with missing values



In [None]:
Impute missing values for continuous variable

data["Age"] = data["Age"].replace(np.NaN, data["Age"].mean())
data["Age"] = data["Age"].replace(np.NaN, data["Age"].median())

In [None]:
Impute missing values for categorical variable

In [None]:
Other Imputation Methods

data["Age"] = data["Age"].fillna(method='ffill')
data["Age"] = data["Age"].interpolate(method='linear', limit_direction='forward', axis=0)


In [None]:
Using Algorithms that support missing values

In [None]:
Prediction of missing values

from sklearn.linear_model import LinearRegression
import pandas as pd

data = pd.read_csv("train.csv")
data = data[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]]

data["Sex"] = [1 if x=="male" else 0 for x in data["Sex"]]

test_data = data[data["Age"].isnull()]
data.dropna(inplace=True)

y_train = data["Age"]
X_train = data.drop("Age", axis=1)
X_test = test_data.drop("Age", axis=1)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
Imputation using Deep Learning Library — Datawig

import pandas as pd
#pip install datawig
import datawig

data = pd.read_csv("train.csv")

df_train, df_test = datawig.utils.random_split(data)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['Pclass','SibSp','Parch'], # column(s) containing information about the column we want to impute
    output_column= 'Age', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)


## Do All Data Cleaning on Only Train Set and Apply Calculations to Validation/Test Later on

## Handle Data Types

In [None]:
#dict_ml_data_types

## Handle Missing Data

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

Image(url= "https://miro.medium.com/max/700/1*_RA3mCS30Pr0vUxbp25Yxw.png")

In [None]:
dict_ml_missing_data

## Handle Redundant/Irrelevant Features

In [None]:
dict_ml_redundant_features 

## Handle Redundant/Irrelevant Observations

In [None]:
dict_ml_redundant_observations

## Handle Outliers

In [None]:
dict_ml_outliers

## Handle Class Imbalance

In [None]:
dict_ml_class_imbalance

## Handle Category Encoding

In [None]:
dict_ml_category_encoding

## Handle Rescaling: Standardise/Normalise

In [None]:
dict_ml_rescaling

## Handle Other Distribution Transformations

In [None]:
TO LOOK INTO NEXT PREPROCESSING PIPELINES

In [None]:
TARGET ENCODER LOOK INTO THIS

In [None]:
# SETTINGS FOR ALL PREPROCESSING STEPS TO FEED INTO PIPELINES

imputers
scaler
pca
smote

In [None]:


preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

In [None]:
from sklearn.ensemble import RandomForestRegressor
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',RandomForestRegressor())
           ])