# Jupyter Notebook for Missing Data Imputation using Supervised Learning

## 1. Import necessary packages

In [1]:
import lasagne
import deepdish
import theano
import numpy as np
from scipy.stats import mode, itemfreq
from scipy import delete
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVC as SVM
import numpy as np
import os
import math
import random
from scipy import delete
from sklearn.model_selection import train_test_split
from missing_data_imputation import Imputer
from processing import impute, perturb_data
from params import bc_params
from params import feats_train_folder, labels_train_folder, perturb_folder
from params import feats_test_folder, labels_test_folder
from params import rand_num_seed
import os
import sys
import argparse
import cPickle as pkl
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from imputation_sm import Imputation_with_supervised_learning
from params import RESULTS_PATH

## 2. Read the dataset

Set `header = True` if the first row of the CSV file contains column names; otherwise set `header = False`.

In [14]:
# Dataset name; the cells below use it as the prefix of the files they write.
dataname = 'breast_cancer'
# Passed to preprocess(): True makes it skip the first line of the CSV.
header = False

In [15]:
def preprocess(header, filename='breast_cancer.csv'):
    """Load a comma-separated dataset as an array of raw (untyped) values.

    Parameters
    ----------
    header : bool
        True when the first line of the file holds column names and must be
        skipped; False when the data starts on the first line.
    filename : str, optional
        Path of the CSV file to read. Defaults to 'breast_cancer.csv', the
        file this function always read before the parameter existed.

    Returns
    -------
    numpy.ndarray
        The parsed table with ``dtype=object``, one row per data line.
    """
    # skip_header=0 is genfromtxt's default, so a single call covers both cases.
    rows_to_skip = 1 if header else 0
    return np.genfromtxt(filename, delimiter=',', dtype=object,
                         skip_header=rows_to_skip)

In [16]:
# Load the raw table once; the later cells split, perturb and impute it.
input_data = preprocess(header)
# Echo the parsed array as a quick visual check of the loaded values.
print input_data

[['no-recurrence-events' '30-39' 'premeno' '30-34' '0-2' 'no' '3' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'right_up' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'right'
  'left_up' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '2' 'right'
  'right_low' 'no']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '50-59' 'premeno' '25-29' '0-2' 'no' '2' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '60-69' 'ge40' '20-24' '0-2' 'no' '1' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '50-54' '0-2' 'no' '2' 'left'
  'left_low' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'left_up' 'no']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '3' 'left'
  'central' 'no']

## 3. Prepare training data

The dataset is split into roughly 2/3 for training and 1/3 for testing. The training split is then perturbed with several ratios of additional missing data, and every configured imputation method is applied at each ratio. The labels and the perturbed, imputed and binarized training arrays are saved to disk for use in the later steps.


In [17]:
def train(input_data, dataname):
    """Split the dataset and build the perturbed/imputed training artifacts.

    The rows are divided into a training and a test part. The last column is
    turned into binary labels (1 where it equals 'yes', 0 otherwise) and
    removed from the features. The training features are then perturbed with
    each missing-data ratio, imputed with every method listed in
    ``bc_params['imp_methods']``, binarized, and written to disk.

    Parameters
    ----------
    input_data : numpy.ndarray
        Raw table (e.g. the result of ``preprocess``); the last column is
        used as the label column.
    dataname : str
        Dataset name used as the prefix of every file written.

    Returns
    -------
    tuple
        ``(train_data, labels_train, test_data, labels_test)`` where the data
        arrays no longer contain the label column and the label arrays are
        integer column vectors.

    Notes
    -----
    Files written: the binary train/test labels, the training CSV, one
    perturbed CSV per ratio, and one imputed CSV plus one binarized feature
    array per (ratio, imputation method) pair.
    """
    # Bind the project-level helper explicitly. A later notebook cell defines
    # its own `impute` with a different signature; resolving the name here
    # keeps this function working regardless of which cells ran before it.
    from processing import impute as run_imputation

    # Seed NumPy's and Python's global generators before any random step.
    np.random.seed(rand_num_seed)
    random.seed(rand_num_seed)

    # split data to (2/3) training and (1/3) test
    train_data, test_data = train_test_split(input_data, test_size=0.33)

    # binarize labels: 1 where the last column is 'yes', as column vectors
    labels_train = (train_data[:, -1] == 'yes').astype(int).reshape((-1, 1))
    labels_test = (test_data[:, -1] == 'yes').astype(int).reshape((-1, 1))

    # save train and test labels in binary representation
    labels_train.dump(os.path.join(
        labels_train_folder, '{}_labels_bin.np'.format(dataname)))
    labels_test.dump(os.path.join(
        labels_test_folder, '{}_labels_bin_test.np'.format(dataname)))

    # remove labels column
    train_data = delete(train_data, -1, 1)
    test_data = delete(test_data, -1, 1)

    # save training data; the file name follows `dataname` instead of being
    # fixed to the breast cancer dataset
    np.savetxt('data/{}_train.csv'.format(dataname), train_data,
               delimiter=",", fmt="%s")

    print('Preparing train data for {}'.format(dataname))

    # enumerate parameters
    monotone = True
    ratios = np.arange(0, .5, .1)

    for ratio in ratios:
        # `ratio` is a fraction, so scale it before labelling it as a percentage
        print('\nPerturbing {:.0f}% of data'.format(ratio * 100))
        if ratio > 0:
            pert_data, _ = perturb_data(
                train_data, bc_params['cat_cols'], ratio, monotone,
                bc_params['miss_data_symbol'], bc_params['mnar_values'])
        else:
            # ratio 0 is the baseline: no additional missing values
            pert_data = train_data

        pert_path = os.path.join(
            perturb_folder,
            '{}_train_pert_mono_{}_ratio_{}.csv'.format(dataname, monotone,
                                                        ratio))
        # save perturbed data to disk as csv
        print('\tSaving perturbed data to {}'.format(pert_path))
        np.savetxt(pert_path, pert_data, delimiter=",", fmt="%s")

        # impute data given imp_methods in params.py
        for imp_method in bc_params['imp_methods']:
            print('\tImputing with {}'.format(imp_method))
            imp = Imputer()
            imputed = run_imputation(pert_data, imp, imp_method, bc_params)

            # save data as csv
            imputed_path = "data/imputed/{}_{}_mono_{}_ratio_{}.csv".format(
                dataname, imp_method, monotone, ratio)
            print('\tSaving imputed data to {}'.format(imputed_path))
            np.savetxt(imputed_path, imputed, delimiter=",", fmt="%s")

            # binarize data and convert to float
            data_scaled_bin = imp.binarize_data(
                imputed, bc_params['cat_cols'],
                bc_params['miss_data_symbol']).astype(float)

            # add labels as last column
            data_scaled_bin = np.hstack((data_scaled_bin, labels_train))

            # save to disk
            feats_name = "{}_{}_bin_scaled_mono_{}_ratio_{}.np".format(
                dataname, imp_method, monotone, ratio)
            feats_path = os.path.join(feats_train_folder, feats_name)
            print('\tSaving imputed scaled and binarized data to {}'.format(
                feats_path))
            data_scaled_bin.dump(feats_path)

    return train_data, labels_train, test_data, labels_test

In [18]:
# Run the preparation step; the returned test split and labels are passed to test() below.
train_data, labels_train, test_data, labels_test = train(input_data, dataname)

Preparing train data for breast_cancer

Perturbing 0.0% of data
	Saving perturbed data to data/perturbed/breast_cancer_train_pert_mono_True_ratio_0.0.csv
	Imputing with RandomReplace
	Saving imputed data to data/imputed/breast_cancer_RandomReplace_mono_True_ratio_0.0.csv
	Saving imputed scaled and binarized data to data/train/features/breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0.np
	Imputing with Summary
	Saving imputed data to data/imputed/breast_cancer_Summary_mono_True_ratio_0.0.csv
	Saving imputed scaled and binarized data to data/train/features/breast_cancer_Summary_bin_scaled_mono_True_ratio_0.0.np
	Imputing with RandomForest
	Saving imputed data to data/imputed/breast_cancer_RandomForest_mono_True_ratio_0.0.csv
	Saving imputed scaled and binarized data to data/train/features/breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.0.np
	Imputing with LogisticRegression
	Saving imputed data to data/imputed/breast_cancer_LogisticRegression_mono_True_ratio_0.0.csv
	Sav

## 4. Prepare testing data

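One third of the dataset is held out for testing. For every configured imputation method, the test split is imputed, scaled and binarized for all categorical columns, and the labels are appended as the last column before the array is saved.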


In [19]:
def test(test_data, labels_test, dataname):
    """Impute, binarize and save the test split for every imputation method.

    Parameters
    ----------
    test_data : numpy.ndarray
        Test features without the label column (as returned by ``train``).
    labels_test : numpy.ndarray
        Binary labels as a column vector with one row per test sample.
    dataname : str
        Dataset name used as the prefix of the files written.

    Returns
    -------
    None
        One array per method in ``bc_params['imp_methods']`` is dumped to
        ``feats_test_folder``, with the labels stored as the last column.
    """
    # Bind the project-level helper explicitly. A later notebook cell defines
    # its own `impute` with a different signature; resolving the name here
    # keeps this function working regardless of which cells ran before it.
    from processing import impute as run_imputation

    print('Preparing test data for {}'.format(dataname))
    # a single Imputer instance is shared by all methods
    imp = Imputer()
    for imp_method in bc_params['imp_methods']:
        print('Imputing with {}'.format(imp_method))
        imputed = run_imputation(test_data, imp, imp_method, bc_params)

        # scale and binarize, adding one col for missing value in all cat vars.
        # Work on a copy so binarization cannot touch the imputed array
        # (NOTE(review): it may alias test_data for some methods - confirm).
        data_bin = imp.binarize_data(np.copy(imputed),
                                     bc_params['cat_cols'],
                                     bc_params['miss_data_symbol'])

        # convert to float
        data_bin = data_bin.astype(float)

        # add labels as last column
        data_bin = np.hstack((data_bin, labels_test))

        path = os.path.join(feats_test_folder,
                            '{}_{}_bin_scaled_test.np'.format(dataname,
                                                              imp_method))
        print("\tSaving imputed data to {}".format(path))
        data_bin.dump(path)

In [20]:
# Impute, binarize and save the held-out split for every configured method.
test(test_data, labels_test, dataname)

Preparing test data for breast_cancer
Imputing with RandomReplace
	Saving imputed data to data/test/features/breast_cancer_RandomReplace_bin_scaled_test.np
Imputing with Summary
	Saving imputed data to data/test/features/breast_cancer_Summary_bin_scaled_test.np
Imputing with RandomForest
	Saving imputed data to data/test/features/breast_cancer_RandomForest_bin_scaled_test.np
Imputing with LogisticRegression
	Saving imputed data to data/test/features/breast_cancer_LogisticRegression_bin_scaled_test.np
Imputing with SVD
	Saving imputed data to data/test/features/breast_cancer_SVD_bin_scaled_test.np
Imputing with SVM
	Saving imputed data to data/test/features/breast_cancer_SVM_bin_scaled_test.np
Imputing with Identity
	Saving imputed data to data/test/features/breast_cancer_Identity_bin_scaled_test.np


## 5. Impute

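The `impute` function fills in the missing values of the dataset. By default it applies random replacement and mode summarization; with `cat_values=True` it additionally runs one-hot encoding, random forest, SVM, logistic regression and factor analysis on the categorical columns.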


In [21]:
def impute(data_X, cat_values=False, trained_model=None):
    """Fill in missing values of ``data_X`` with several strategies.

    Both modes run random replacement and mode summarization, printing each
    result. With ``cat_values`` set, column 9 is dropped first and one-hot
    encoding, random forest, SVM, logistic regression and factor analysis are
    run as well; their results are only printed.

    NOTE: this definition shares its name with ``impute`` imported from
    ``processing`` at the top of the notebook and replaces it in the global
    namespace once this cell has run.

    Parameters
    ----------
    data_X : numpy.ndarray
        Table to impute.
    cat_values : bool, optional
        False (default): '0' marks a missing value and the mode-imputed table
        is also written to 'output.csv'. True: '?' marks a missing value and
        the categorical columns come from ``bc_params['cat_cols']``.
    trained_model : object, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        The mode-imputed table, in both modes. (The categorical mode used to
        fall off the end of the function and return None.)
    """
    imp = Imputer()
    x = data_X

    if cat_values:
        # Column 9 is removed before imputing.
        # NOTE(review): presumably the label column of the 10-column table - confirm.
        x = delete(x, 9, 1)
        cat_cols = bc_params['cat_cols']
        missing_symbol = '?'
    else:
        missing_symbol = '0'
    missing_data_cond = lambda value: value == missing_symbol

    # replace missing values with random existing values
    print('imputing with random replacement')
    data_replace = imp.replace(x, missing_data_cond)
    print(data_replace)

    # replace missing values with feature summary
    print('imputing with feature summarization (mode)')
    summ_func = lambda column: mode(column)[0]
    data_mode = imp.summarize(x, summ_func, missing_data_cond)
    print(data_mode)

    if not cat_values:
        np.savetxt('output.csv', data_mode, delimiter=",", fmt="%s")
        return data_mode

    # replace categorical features with one hot row
    print('imputing with one-hot')
    data_onehot = imp.binarize_data(x, cat_cols)
    print(data_onehot)

    # replace missing data with predictions from each supervised model in turn
    predictors = [
        ('imputing with predicted values from random forest',
         RandomForestClassifier(n_estimators=100, criterion='gini')),
        ('imputing with predicted values usng SVM',
         SVM(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001,
             C=1.0, multi_class='ovr', fit_intercept=True,
             intercept_scaling=1, class_weight=None, verbose=0,
             random_state=None, max_iter=1000)),
        ('imputing with predicted values usng logistic regression',
         LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                            fit_intercept=True, intercept_scaling=1)),
    ]
    for message, clf in predictors:
        print(message)
        print(imp.predict(x, cat_cols, missing_data_cond, clf))

    # replace missing data with values obtained after factor analysis
    print('imputing with factor analysis')
    data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)
    print(data_facanal)

    return data_mode

In [22]:
# cat_values=True selects the categorical branch, where '?' marks a missing value.
impute(input_data, True)

imputing with random replacement
[['no-recurrence-events' '30-39' 'premeno' '30-34' '0-2' 'no' '3' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'right_up']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'right'
  'left_up']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '2' 'right'
  'right_low']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '50-59' 'premeno' '25-29' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '60-69' 'ge40' '20-24' '0-2' 'no' '1' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '50-54' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'left_up']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '3' 'left'
  'central']
 ['no-recurrence-even

[['no-recurrence-events' '30-39' 'premeno' '30-34' '0-2' 'no' '3' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'right_up']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'right'
  'left_up']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '2' 'right'
  'right_low']
 ['no-recurrence-events' '60-69' 'ge40' '15-19' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '50-59' 'premeno' '25-29' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '60-69' 'ge40' '20-24' '0-2' 'no' '1' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '50-54' '0-2' 'no' '2' 'left'
  'left_low']
 ['no-recurrence-events' '40-49' 'premeno' '20-24' '0-2' 'no' '2' 'right'
  'left_up']
 ['no-recurrence-events' '40-49' 'premeno' '0-4' '0-2' 'no' '3' 'left'
  'central']
 ['no-recurrence-events' '50-59' 'ge40' '25-29' '0-2' 

## 6. Predict

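The `predict` function trains Decision Tree and Random Forest classifiers, each with several hyperparameter settings, on every imputed training set marked for inclusion in `include_bc.csv`, and reports the error rate of each model on the matching test set.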


In [23]:
def predict(dataname, trained_model=None, prediction=None):
    """Fit tree-based classifiers on each imputed train set and score them.

    'include_bc.csv' is read as rows of ``(include, train_path, test_path)``.
    For every row whose ``include`` field is '1', the train and test arrays
    are loaded, each classifier is fitted on the train features (all columns
    but the last) and its error rate on the test set is recorded. The results
    are pickled to ``RESULTS_PATH`` and printed.

    Parameters
    ----------
    dataname : str
        Dataset name, used in the name of the results file.
    trained_model : object, optional
        Currently unused; kept for interface compatibility.
    prediction : object, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    None
    """

    def dumpclean(obj):
        """Print a possibly nested dict/list one entry per line."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if hasattr(v, '__iter__'):
                    print(k)
                    dumpclean(v)
                else:
                    print('%s : %s' % (k, v))
        elif isinstance(obj, list):
            for v in obj:
                if hasattr(v, '__iter__'):
                    dumpclean(v)
                else:
                    print(v)
        else:
            print(obj)

    def make_classifiers():
        """Return a fresh {label: unfitted classifier} mapping."""
        depths = (5, 10, 20, 25, 50, 100, 500, 1000, 2000, 2500)
        forests = ((10, "sqrt"), (20, "log2"), (25, "sqrt"), (50, "log2"),
                   (100, "sqrt"), (500, "log2"), (1000, "sqrt"),
                   (1500, "log2"), (2000, "sqrt"), (2500, "log2"))
        clfs = {}
        for depth in depths:
            clfs['DTC(max_depth={})'.format(depth)] = \
                DecisionTreeClassifier(max_depth=depth)
        for n_trees, max_feats in forests:
            label = 'RFC(n_estimators={}, max_features="{}")'.format(
                n_trees, max_feats)
            clfs[label] = RandomForestClassifier(n_estimators=n_trees,
                                                 max_features=max_feats)
        return clfs

    # store predictions in a dictionary
    model_preds = {}

    # ndmin=2 keeps a single-row file iterable as one (include, train, test) row
    filepaths = np.loadtxt('include_bc.csv', dtype=object, delimiter=",",
                           ndmin=2)

    # Kept from the original: lifts NumPy's print truncation (global setting).
    np.set_printoptions(threshold=sys.maxsize)

    for (include, train_path, test_path) in filepaths:
        if include != '1':
            continue

        # drop the 3-character '.np' suffix to get a readable name
        imputation_name = os.path.basename(train_path)[:-3]
        print("\nExecuting prediction on "
              "test set\n{}".format(imputation_name))

        # Load train and test set. These files are written with ndarray.dump,
        # i.e. pickled, so recent NumPy needs allow_pickle=True to read them.
        # Pickle can execute code: only load files produced by this notebook.
        train_data = np.load(
            os.path.join(feats_train_folder, train_path),
            allow_pickle=True).astype(np.float32)
        test_data = np.load(
            os.path.join(feats_test_folder, test_path),
            allow_pickle=True).astype(np.float32)

        # Fit Tree Classifiers; the last column of each array is the label
        for model_name, clf in make_classifiers().items():
            clf.fit(train_data[:, :-1], train_data[:, -1].astype(int))
            y_test_hat = clf.predict(test_data[:, :-1])
            # error rate = fraction of misclassified test samples
            obj_val = np.mean(y_test_hat != test_data[:, -1])

            model_preds[model_name+imputation_name] = obj_val
            print("{} on {} error rate on test set: {}".format(
                model_name, imputation_name, obj_val))

    # dump dictionary; the context manager closes the file even on error
    results_path = os.path.join(RESULTS_PATH,
                                'trees_{}_results.np'.format(dataname))
    with open(results_path, 'wb') as results_file:
        pkl.dump(model_preds, results_file)

    # print dictionary
    dumpclean(model_preds)

In [24]:
# Fit every classifier on each included train file and print the test error rates.
predict(dataname)


Executing prediction on test set
breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0
RFC(n_estimators=1500, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.290322580645
RFC(n_estimators=50, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.268817204301
RFC(n_estimators=500, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.258064516129
DTC(max_depth=20) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.301075268817
DTC(max_depth=1000) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.311827956989
DTC(max_depth=50) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.290322580645
RFC(n_estimators=20, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.0 error rate o

RFC(n_estimators=1500, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.301075268817
RFC(n_estimators=50, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.344086021505
RFC(n_estimators=500, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.301075268817
DTC(max_depth=20) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.41935483871
DTC(max_depth=1000) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.408602150538
DTC(max_depth=50) on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.430107526882
RFC(n_estimators=20, max_features="log2") on breast_cancer_RandomReplace_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.322580645161
DTC(max_depth=10) on breast_cancer_RandomReplace_bin_scaled_mono_Tr

RFC(n_estimators=500, max_features="log2") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.258064516129
DTC(max_depth=20) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.322580645161
DTC(max_depth=1000) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.322580645161
DTC(max_depth=50) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.354838709677
RFC(n_estimators=20, max_features="log2") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.290322580645
DTC(max_depth=10) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.354838709677
DTC(max_depth=2500) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.333333333333
RFC(n_estimators=25, max_features="sqrt") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.247311827957
RFC(n_estimators=25

RFC(n_estimators=2500, max_features="log2") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.301075268817
DTC(max_depth=5) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.301075268817
DTC(max_depth=100) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.41935483871
DTC(max_depth=25) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.41935483871
RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.333333333333
DTC(max_depth=500) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.41935483871
DTC(max_depth=2000) on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.440860215054
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_Summary_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0.279569892473
RFC(n_estimators=100

RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.301075268817
DTC(max_depth=500) on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.333333333333
DTC(max_depth=2000) on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.344086021505
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.268817204301
RFC(n_estimators=100, max_features="sqrt") on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.258064516129
RFC(n_estimators=2000, max_features="sqrt") on breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.290322580645

Executing prediction on test set
breast_cancer_RandomForest_bin_scaled_mono_True_ratio_0.3
RFC(n_estimators=1500, max_features="log2") on breast_cancer_RandomForest_bin_scaled_mono_True_r

RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.268817204301
DTC(max_depth=500) on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.290322580645
DTC(max_depth=2000) on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.311827956989
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.279569892473
RFC(n_estimators=100, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.301075268817
RFC(n_estimators=2000, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.279569892473

Executing prediction on test set
breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.1
RFC(n_estimators=1500, max_features="log2") on breast_

RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.258064516129
DTC(max_depth=500) on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.397849462366
DTC(max_depth=2000) on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.397849462366
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.258064516129
RFC(n_estimators=100, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.279569892473
RFC(n_estimators=2000, max_features="sqrt") on breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.268817204301

Executing prediction on test set
breast_cancer_LogisticRegression_bin_scaled_mono_True_ratio_0.4
RFC(n_estimators=1500, max_features="log2") on breast_

RFC(n_estimators=2000, max_features="sqrt") on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.1 error rate on test set: 0.279569892473

Executing prediction on test set
breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2
RFC(n_estimators=1500, max_features="log2") on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.290322580645
RFC(n_estimators=50, max_features="log2") on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.311827956989
RFC(n_estimators=500, max_features="log2") on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.290322580645
DTC(max_depth=20) on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.365591397849
DTC(max_depth=1000) on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.365591397849
DTC(max_depth=50) on breast_cancer_SVM_bin_scaled_mono_True_ratio_0.2 error rate on test set: 0.354838709677
RFC(n_estimators=20, max_features="log2") on breast

RFC(n_estimators=2500, max_features="log2") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.279569892473
DTC(max_depth=5) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.301075268817
DTC(max_depth=100) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.311827956989
DTC(max_depth=25) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.279569892473
RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.258064516129
DTC(max_depth=500) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.311827956989
DTC(max_depth=2000) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.311827956989
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.0 error rate on test set: 0.268817204301
RFC(n_est

RFC(n_estimators=1000, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.279569892473
DTC(max_depth=500) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.41935483871
DTC(max_depth=2000) on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.41935483871
RFC(n_estimators=10, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.268817204301
RFC(n_estimators=100, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.225806451613
RFC(n_estimators=2000, max_features="sqrt") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.3 error rate on test set: 0.279569892473

Executing prediction on test set
breast_cancer_Identity_bin_scaled_mono_True_ratio_0.4
RFC(n_estimators=1500, max_features="log2") on breast_cancer_Identity_bin_scaled_mono_True_ratio_0.4 error rate on test set: 0

## 7. Evaluate

Loads the complete input dataset and the imputed table, then measures the imputation quality on the input using the RMSE (Root Mean Squared Error).

In [3]:
# Imputation_with_supervised_learning is imported from imputation_sm at the top.
# NOTE(review): evaluate() presumably returns the RMSE described above - confirm in imputation_sm.
obj = Imputation_with_supervised_learning()
obj.evaluate()

0.2669549442140351