In [None]:
#Show ALL outputs in cell, not only last result
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
relative_filepath = "../../"

In [None]:
#Set relative path mapping for module imports
import sys

# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if relative_filepath not in sys.path:
    sys.path.append(relative_filepath)

In [None]:
# External Dependencies
import numpy as np
import pandas as pd

In [None]:
# Load the three pickled frames written to data/interim/step_3a.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# were produced by this project's own earlier steps.

# Read in pickled combined data
X_y_data = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_data.pkl")

# Read in pickled train data
X_y_train = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_train.pkl")

# Read in pickled test data
X_y_test = pd.read_pickle(relative_filepath + "data/interim/step_3a/X_y_test.pkl")

# Recap data structure (first rows, then the (rows, columns) shape)
X_y_data.head()
X_y_data.shape

In [None]:
import json

# Read dict_ml_missing_data from its JSON file. The context manager closes the
# file handle as soon as parsing is done instead of leaving it open.
with open(relative_filepath + "reports/dicts/dict_ml_missing_data.json") as json_file:
    dict_ml_missing_data = json.load(json_file)

In [None]:
# Values for the config dict: the target name and the three loaded frames
target = "classLabel"

input_dfs = [X_y_data, X_y_train, X_y_test]

## Data Cleaning Checklist

In [None]:
# Reference: https://elitedatascience.com/data-cleaning
#
# Remove Unwanted Observations
#     Duplicate observations
#     Irrelevant observations
#
# Fix Structural Errors
#
# Filter Unwanted Outliers
#
# Handle Missing Data
#     Missing categorical data
#     Missing numeric data

In [1]:
# imports 
from sklearn import set_config                      # to change the display
from sklearn.utils import estimator_html_repr       # to save the diagram into HTML format

# Validation
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Classification models
from sklearn.linear_model import LogisticRegression

# Evaluation / Scoring metrics
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.datasets import make_classification, load_breast_cancer

X, y = load_breast_cancer(return_X_y = True, as_frame=True)
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Hold out 20% of the rows as a test set, stratified on y, with a fixed seed
split_options = {"test_size": 0.2, "stratify": y, "random_state": 11}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [4]:
X_train.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [23]:
# Partition the training columns by dtype family.
# Selecting by family ('number', and object + category) rather than by exact
# dtype keeps columns such as int32 / float32 / pandas "category" from being
# silently left out of both lists.
cat_features = X_train.select_dtypes(include=['object', 'category']).columns
num_features = X_train.select_dtypes(include='number').columns

In [24]:
len(cat_features)
len(num_features)

30

In [25]:
# Define categorical pipeline
# Step 1 fills missing entries with the constant string 'missing';
# step 2 one-hot encodes, ignoring categories that were not seen during fit.
# NOTE(review): the `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4 — confirm the installed
# version before re-running this cell.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [26]:
# Numerical pipeline: median imputation followed by min-max scaling
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
num_pipe = Pipeline(steps=num_steps)

In [27]:
# Route each column group through its own pipeline
# (categorical columns first, then numerical ones)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transformer', cat_pipe, cat_features),
        ('num_transformer', num_pipe, num_features),
    ]
)

In [28]:
# Fit a pipeline with transformers and an estimator to the training data
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

pipeline.fit(X_train, y_train)

# Accuracy on the data the pipeline was fitted on.
# The prediction must come from `pipeline`; the name `pipe` is never defined
# in this notebook and only worked through leftover kernel state.
y_train_pred = pipeline.predict(X_train)
print("Accuracy on Training set:", accuracy_score(y_train, y_train_pred), "\n")

# Accuracy on the held-out test split
y_test_pred = pipeline.predict(X_test)
print("Accuracy on Test set:", accuracy_score(y_test, y_test_pred), "\n")

Accuracy on Training set: 0.978021978021978 

Accuracy on Test set: 0.9298245614035088 



In [29]:
# set config to diagram for visualizing the pipelines/composite estimators
set_config(display='diagram')

# Lets visualize the pipeline
pipeline

In [None]:
# Reference: https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [None]:
# Median-impute the numerical columns on their own (outside any pipeline)
imputer = SimpleImputer(strategy="median")

# num_features is the index of numerical column names derived from X_train in
# an earlier cell. `numeric_features` is only assigned in a later cell, so
# referencing it here fails on a fresh top-to-bottom run.
X_train_num = X_train[num_features]
X_train_num = imputer.fit_transform(X_train_num)

In [None]:
ordinal_encoder = OrdinalEncoder()

# cat_features is the index of categorical column names derived from X_train
# in an earlier cell. `categorical_features` is only assigned in a later cell,
# so referencing it here fails on a fresh top-to-bottom run.
# NOTE(review): if X_train has no object columns this selects zero columns —
# confirm the encoder is meant to run on this dataset.
X_train_cat = X_train[cat_features]
X_train_cat_ord_encoded = ordinal_encoder.fit_transform(X_train_cat)
X_train_cat_ord_encoded[:,1:10]

In [None]:
cat_encoder = OneHotEncoder()

X_train_cat_hot_encoded = cat_encoder.fit_transform(X_train_cat)
X_train_cat_hot_encoded

In [None]:
StandardScaler().fit_transform(X_train_num)

In [None]:
# Custom transformations
from sklearn.base import BaseEstimator, TransformerMixin

# Default column positions, counted from the end of the feature array
ratings_index = -2
reviews_index = -1


class NewVariablesAdder(BaseEstimator, TransformerMixin):
    """Append a ratings-divided-by-reviews column to a 2-D feature array.

    Parameters
    ----------
    ratings_index : int, default=-2
        Position of the ratings column in ``X``.
    reviews_index : int, default=-1
        Position of the reviews column in ``X``.

    Both parameters are stored under their own names so that
    ``get_params`` / ``set_params`` / ``clone`` work on the transformer.
    """

    def __init__(self, ratings_index=ratings_index, reviews_index=reviews_index):
        self.ratings_index = ratings_index
        self.reviews_index = reviews_index

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with one extra trailing column: ratings / reviews.

        ``X`` is converted with ``np.asarray`` first, so a DataFrame can be
        passed as well as an ndarray (positional ``X[:, i]`` indexing does not
        work on a DataFrame).
        NOTE(review): a zero in the reviews column yields inf/nan in the new
        column — confirm whether that can occur upstream.
        """
        X = np.asarray(X)
        # Make a new variable that is rating divided by number of reviews
        ratings_over_reviews = X[:, self.ratings_index] / X[:, self.reviews_index]
        return np.c_[X, ratings_over_reviews]

In [None]:
# Numerical preprocessing: median imputation, then standardisation.
# (The NewVariablesAdder step is currently left out of the step list.)
num_pipeline_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_pipeline_steps)

X_train_num_transformed = num_pipeline.fit_transform(X_train_num)

In [None]:
# Column-wise preprocessing: num_pipeline for the numerical columns, one-hot
# encoding for the categorical ones. The column lists are num_features /
# cat_features from the earlier dtype-selection cell; `num_vars` / `cat_vars`
# are never assigned anywhere in this notebook.
pipeline = ColumnTransformer([
    ('numerical', num_pipeline, num_features),
    ('categorical', OneHotEncoder(), cat_features),
])

# Store the result under a new name: overwriting X_train with the transformed
# array made this cell fail when re-run and changed what every later cell
# reading X_train received.
X_train_transformed = pipeline.fit_transform(X_train)

In [None]:
# Combine the numeric and categorical transformers into one ColumnTransformer.
# NOTE(review): in the visible notebook numeric_transformer, numeric_features,
# categorical_transformer and categorical_features are first assigned in a
# later cell — on a fresh top-to-bottom run this cell raises NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

In [None]:
# 'features' step: numeric columns are imputed with 0, categorical columns with
# the string 'missing'. The step labels now match the dtype each branch
# selects (they were swapped); branch order is unchanged, so the output column
# order is the same.
# NOTE(review): FeatureUnion and TypeSelector are not imported or defined in
# the visible notebook — confirm where they come from.
('features', FeatureUnion([
    ('Numerics', Pipeline([
        ('Numeric Extractor', TypeSelector(np.number)),
        ('Impute Zero', SimpleImputer(strategy="constant", fill_value=0))
    ])),
    ('Cat Columns', Pipeline([
        ('Category Extractor', TypeSelector("category")),
        ('Impute Missing', SimpleImputer(strategy="constant", fill_value='missing'))
    ]))
]))

In [None]:
# Median imputation. `fill_value` is only consulted when strategy="constant",
# so the former `fill_value = 0` had no effect and is dropped to avoid
# suggesting that zeros are filled in.
imputer = SimpleImputer(strategy='median')

In [None]:
# Column groups.
# NOTE(review): these names do not appear among the columns of the X shown
# earlier in this notebook — confirm which dataset this cell is meant for.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: one-hot encoding that ignores unseen categories
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by the classifier
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

# 80/20 split with a fixed seed, then fit and report the test-set score
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
# Numeric columns: mean imputation, then standardisation
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: constant-value imputation, then ordinal encoding
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('encoder', OrdinalEncoder()),
    ]
)

## Duplicates

In [None]:
dict_data_cleaning

In [None]:
# generate count statistics of duplicate entries
from IPython.display import display

print ("## Number of duplicate rows ## \n")
# Count once: rows flagged as repeats of an earlier identical row
n_duplicated = int(X_y_data.duplicated().sum())
if n_duplicated > 0:
    print("Number of duplicated observations: ", n_duplicated)
    # An expression nested inside `if` is not echoed by the notebook (only
    # top-level expressions are), so the preview has to be displayed
    # explicitly. keep=False shows every member of each duplicate group,
    # sorted so identical rows sit next to each other.
    display(
        X_y_data[X_y_data.duplicated(keep=False)]
        .sort_values(by=list(X_y_data.columns))
        .head()
    )
else:
    print("No duplicated observations found")

In [None]:
#X_y_data.drop_duplicates(inplace=True)

## Missing/Null Values

In [None]:
#dict_data_cleaning

In [None]:
# Drop the rows whose 'num_missing' value exceeds 35
too_many_missing = df['num_missing'] > 35
ind_missing = df[too_many_missing].index
df_less_missing_rows = df.drop(index=ind_missing)

In [None]:
# hospital_beds_raion has many missing values; this builds a copy of the
# frame without that column (the original frame is left untouched).
cols_to_drop = ['hospital_beds_raion']
df_less_hos_beds_raion = df.drop(columns=cols_to_drop)

In [None]:
# Fill the gaps in life_sq with the column median (printed for reference)
med = df['life_sq'].median()
print(med)
df['life_sq'] = df['life_sq'].fillna(value=med)

In [None]:
# For every numeric column that has gaps: add a boolean "<col>_ismissing"
# indicator column, then fill the gaps with the column median.
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

for col in numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)

    if not num_missing > 0:
        continue  # column is complete, nothing to impute

    print('imputing missing values for: {}'.format(col))
    df['{}_ismissing'.format(col)] = missing
    med = df[col].median()
    df[col] = df[col].fillna(med)

In [None]:
# For every non-numeric column that has gaps: add a boolean "<col>_ismissing"
# indicator column, then fill the gaps with the most frequent value.
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values

for col in non_numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)

    if not num_missing > 0:
        continue  # column is complete, nothing to impute

    print('imputing missing values for: {}'.format(col))
    df['{}_ismissing'.format(col)] = missing

    # 'top' in describe() is the most frequent value of the column
    top = df[col].describe()['top']
    df[col] = df[col].fillna(top)

In [None]:
# Sentinel-value imputation: mark gaps with an explicit placeholder
# categorical column -> '_MISSING_'
df['sub_area'] = df['sub_area'].fillna(value='_MISSING_')

# numeric column -> -999
df['life_sq'] = df['life_sq'].fillna(value=-999)

In [None]:
# This article covers 7 ways to handle missing values in a dataset:

In [None]:
# Deleting rows with missing values



In [None]:
# Impute missing values for a continuous variable.
# The two statements are alternatives: whichever runs first fills every NaN,
# so running the second one afterwards on the same column changes nothing.
# fillna is used instead of replace(np.NaN, ...) because the np.NaN alias was
# removed in NumPy 2.0.
# NOTE(review): `data` is only read from disk in a later cell of the visible
# notebook — confirm the intended cell order.

# Option 1: mean
data["Age"] = data["Age"].fillna(data["Age"].mean())

# Option 2: median (use instead of option 1)
# data["Age"] = data["Age"].fillna(data["Age"].median())

In [None]:
# Impute missing values for a categorical variable

In [None]:
# Other imputation methods

# Forward fill: carry the last observed value forward.
# Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
data["Age"] = data["Age"].ffill()

# Linear interpolation, filling in the forward direction only.
data["Age"] = data["Age"].interpolate(method='linear', limit_direction='forward', axis=0)


In [None]:
# Using algorithms that support missing values

In [None]:
# Prediction of missing values: fit a linear regression on the rows where Age
# is known and predict Age for the rows where it is missing.
# NOTE(review): this cell rebinds X_train, X_test and y_train, which other
# cells of the notebook also use — confirm that is intended.

from sklearn.linear_model import LinearRegression
import pandas as pd

data = pd.read_csv("train.csv")
# .copy() makes the column subset an independent frame, so the assignment and
# row drop below do not act on a slice of the full table.
data = data[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]].copy()

# Encode Sex as 1 for "male" and 0 for anything else
data["Sex"] = (data["Sex"] == "male").astype(int)

# Rows to predict: Age is missing. Take them before incomplete rows are dropped.
test_data = data[data["Age"].isnull()]
data = data.dropna()

y_train = data["Age"]
X_train = data.drop("Age", axis=1)
X_test = test_data.drop("Age", axis=1)

model = LinearRegression()
model.fit(X_train, y_train)

# Predicted Age for the rows in test_data, in the same row order
y_pred = model.predict(X_test)

In [None]:
# Imputation using Deep Learning Library — Datawig
# NOTE(review): this rebinds `imputer`, a name other cells use for a
# scikit-learn SimpleImputer; datawig.SimpleImputer is a different class.

import pandas as pd
#pip install datawig
import datawig

data = pd.read_csv("train.csv")

# Split the rows into a training part and a test part
df_train, df_test = datawig.utils.random_split(data)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['Pclass','SibSp','Parch'], # column(s) containing information about the column we want to impute
    output_column= 'Age', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)


## Do All Data Cleaning on Only Train Set and Apply Calculations to Validation/Test Later on

## Handle Data Types

In [None]:
#dict_ml_data_types

## Handle Missing Data

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 

Image(url= "https://miro.medium.com/max/700/1*_RA3mCS30Pr0vUxbp25Yxw.png")

In [None]:
dict_ml_missing_data

## Handle Redundant/Irrelevant Features

In [None]:
dict_ml_redundant_features 

## Handle Redundant/Irrelevant Observations

In [None]:
dict_ml_redundant_observations

## Handle Outliers

In [None]:
dict_ml_outliers

## Handle Class Imbalance

In [None]:
dict_ml_class_imbalance

## Handle Category Encoding

In [None]:
dict_ml_category_encoding

## Handle Rescaling: Standardise/Normalise

In [None]:
dict_ml_rescaling

## Handle Other Distribution Transformations

In [None]:
# TODO: look into preprocessing pipelines next

In [None]:
# TODO: look into target encoding (TargetEncoder)

In [None]:
# SETTINGS FOR ALL PREPROCESSING STEPS TO FEED INTO PIPELINES
#
#   - imputers
#   - scaler
#   - pca
#   - smote

In [None]:


# Route numeric and categorical column groups through their own transformers
column_transformers = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Full pipeline: preprocessing followed by a random-forest regressor
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor()),
    ]
)