# Imputing missing values before building an estimator


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path to the mammography dataset.
mamm_csv = "mamografias.csv"

In [None]:
# Have pandas read "?" entries as NaN so that they count as missing values.
missing_values_format = ["?"]
mamm_data = pd.read_csv(mamm_csv, na_values=missing_values_format)
mamm_data.head()

In [None]:
# Column dtypes and non-null counts.
mamm_data.info()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

le = LabelEncoder()
# Convert the categorical columns into numeric codes.
mamm_data['Severity'] = le.fit_transform(mamm_data['Severity'])

# Encode only the observed 'Shape' values and keep missing entries as NaN.
# Passing the whole column to LabelEncoder would either fail on the NaN
# entries or give them an ordinary class code (depending on the scikit-learn
# version), so the imputers below would never see them and dropna() would
# not remove those rows.
shape_observed = mamm_data['Shape'].notna()
shape_codes = pd.Series(np.nan, index=mamm_data.index, dtype=float)
shape_codes[shape_observed] = le.fit_transform(mamm_data.loc[shape_observed, 'Shape'])
mamm_data['Shape'] = shape_codes

In [None]:
# Look at the frame again after label-encoding.
mamm_data.head()

In [None]:
# Number of NULL values per feature
mamm_data.isnull().sum()

In [None]:
# (rows, columns) of the full dataset, incomplete rows included.
mamm_data.shape

In [None]:
# Complete-case copy: drop every row that has at least one missing value.
mamm_data_without_nan = mamm_data.dropna()

In [None]:
# Sanity check: every per-column count should now be zero.
mamm_data_without_nan.isnull().sum()

In [None]:
# (rows, columns) left after dropping the incomplete rows.
mamm_data_without_nan.shape

In [None]:
# Feature columns: everything except the target 'Severity'.
cols = [col for col in mamm_data.columns if col not in ['Severity']]

In [None]:
# Two views of the data:
#   X_no_nan / y_no_nan -> complete rows only (baseline, no imputation needed)
#   X / y               -> all rows, still containing NaN (input to the imputers)
X_no_nan = mamm_data_without_nan[cols]
y_no_nan= mamm_data_without_nan['Severity']
X = mamm_data[cols]
y = mamm_data['Severity']

In [None]:
# Dtypes and non-null counts of the full feature matrix.
X.info()

In [None]:
X.shape

In [None]:
# Same summary for the complete-case feature matrix.
X_no_nan.info()

In [None]:
X_no_nan.shape

In [None]:
# Standardised copies of both feature matrices. The same scaler object is
# re-fit on each call, so after this cell `scaler` holds the X_no_nan statistics.
# NOTE(review): X still contains NaN here; this relies on StandardScaler
# passing NaN through unchanged -- confirm for the installed scikit-learn.
# NOTE(review): the scaler is fit on all rows before cross-validation, so the
# held-out folds influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_no_nan_scaled = scaler.fit_transform(X_no_nan)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# To use the experimental IterativeImputer, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [None]:
def get_full_score(regressor, X_full, y_full):
    """Cross-validate ``regressor`` on data that needs no imputation.

    Despite the parameter name, the estimator is scored with accuracy, so it
    is expected to be a classifier.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_full: feature matrix passed straight to the estimator.
        y_full: target values aligned with ``X_full``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy over ``N_SPLITS`` folds.
    """
    fold_accuracies = cross_val_score(
        regressor,
        X_full,
        y_full,
        cv=N_SPLITS,
        scoring='accuracy',
    )
    return fold_accuracies.mean(), fold_accuracies.std()

In [None]:
def get_scores_for_imputer(regressor, imputer, X_missing, y_missing):
    """Cross-validate an ``imputer -> regressor`` pipeline.

    The imputer is the first pipeline step, so it is fitted together with the
    estimator by ``cross_val_score`` rather than on the whole dataset up front.

    Args:
        regressor: scikit-learn estimator placed after the imputer.
        imputer: transformer that fills in the missing values.
        X_missing: feature matrix that may contain missing values.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Array with one accuracy value per fold (``N_SPLITS`` folds).
    """
    pipeline = make_pipeline(imputer, regressor)
    return cross_val_score(
        pipeline,
        X_missing,
        y_missing,
        cv=N_SPLITS,
        scoring='accuracy',
    )

In [None]:
def get_impute_zero_score(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values replaced by the constant 0.

    ``add_indicator=True`` asks the imputer to also emit missing-value
    indicator columns alongside the filled features.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    zero_filler = SimpleImputer(
        strategy='constant',
        fill_value=0,
        missing_values=np.nan,
        add_indicator=True,
    )
    fold_scores = get_scores_for_imputer(regressor, zero_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

Sustitución por la media. Este método tan sólo es válido para
variables numéricas.

In [None]:
def get_impute_mean(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values replaced by the column mean.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    mean_filler = SimpleImputer(
        strategy="mean",
        missing_values=np.nan,
        add_indicator=True,
    )
    fold_scores = get_scores_for_imputer(regressor, mean_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

Sustitución por la moda. Este método es válido para variables
categóricas.

In [None]:
def get_impute_mode(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values replaced by the column mode.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    mode_filler = SimpleImputer(
        strategy="most_frequent",
        missing_values=np.nan,
        add_indicator=True,
    )
    fold_scores = get_scores_for_imputer(regressor, mode_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

Median imputation. The median is a more robust estimator than the mean for features with a few very large values that could otherwise dominate the result (a 'long tail').

In [None]:
def get_impute_median(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values replaced by the column median.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    median_filler = SimpleImputer(
        strategy="median",
        missing_values=np.nan,
        add_indicator=True,
    )
    fold_scores = get_scores_for_imputer(regressor, median_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

Algoritmos del tipo k-vecinos más cercanos. Consisten en buscar
los k valores más próximos al que queremos sustituir. Una vez
identificados se puede sustituir por la media (algoritmo k-medias)
o por la moda (algoritmo k-modas).

In [None]:
def get_impute_knn_score(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values filled in by ``KNNImputer``.

    The imputer is created with its default neighbour settings; only the
    missing-value marker and the indicator columns are set explicitly.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    neighbour_filler = KNNImputer(add_indicator=True, missing_values=np.nan)
    fold_scores = get_scores_for_imputer(regressor, neighbour_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

`IterativeImputer` uses round-robin linear regression, modeling each feature with missing values as a function of the other features, in turn. The version implemented assumes Gaussian (output) variables. If your features are obviously non-normal, consider transforming them to look more normal to potentially improve performance.

In [None]:
def get_impute_iterative(regressor, X_missing, y_missing):
    """Score ``regressor`` with missing values filled in by ``IterativeImputer``.

    The imputer is seeded (``random_state=0``) because ``sample_posterior=True``
    makes the imputed values random draws; ``n_nearest_features=5`` caps the
    number of other features used to estimate each feature.

    Args:
        regressor: scikit-learn estimator to evaluate.
        X_missing: feature matrix containing NaN for missing entries.
        y_missing: target values aligned with ``X_missing``.

    Returns:
        Tuple ``(mean, std)`` of the per-fold accuracy.
    """
    round_robin_filler = IterativeImputer(
        missing_values=np.nan,
        add_indicator=True,
        random_state=0,
        n_nearest_features=5,
        sample_posterior=True,
    )
    fold_scores = get_scores_for_imputer(regressor, round_robin_filler, X_missing, y_missing)
    return fold_scores.mean(), fold_scores.std()

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Number of cross-validation folds used by every get_*_score helper.
N_SPLITS = 5
# Classifiers compared across the imputation strategies.
lr = LogisticRegression(C=1e5, random_state = 10)
rfc = RandomForestClassifier(max_depth=3, n_estimators=100, random_state=10)
# NOTE(review): no random_state is set here, unlike the other seeded estimators.
ab = AdaBoostClassifier(n_estimators=100)
gnb = GaussianNB()
svc = SVC(kernel="linear", random_state=10)
svc_linear = LinearSVC(random_state=10)

In [None]:
# Display names of the strategies. The position in this list is the index used
# in every mses_* / stds_* array below (0 = drop rows, ..., 6 = iterative).
x_labels = ['Drop missing',
            'Zero imputation',
            'Mean Imputation',
            'Median Imputation',
            'Mode Imputation',
            'KNN Imputation',
            'Iterative Imputation']

In [None]:
# One slot per strategy and classifier. Despite the "mses" prefix, these
# arrays store the mean cross-validated accuracy; stds_* store its standard
# deviation across folds.
mses_lr = np.zeros(7)
stds_lr = np.zeros(7)
mses_rfc = np.zeros(7)
stds_rfc = np.zeros(7)
mses_ab = np.zeros(7)
stds_ab = np.zeros(7)
mses_gnb = np.zeros(7)
stds_gnb = np.zeros(7)
mses_svc = np.zeros(7)
stds_svc = np.zeros(7)
mses_linear_svc = np.zeros(7)
stds_linear_svc = np.zeros(7)

In [None]:
# Index 0 - baseline: complete rows only, so no imputer is involved.
mses_lr[0], stds_lr[0] = get_full_score(lr, X_no_nan, y_no_nan)
mses_rfc[0], stds_rfc[0] = get_full_score(rfc, X_no_nan, y_no_nan)
mses_ab[0], stds_ab[0] = get_full_score(ab, X_no_nan, y_no_nan)
mses_gnb[0], stds_gnb[0] = get_full_score(gnb, X_no_nan, y_no_nan)
mses_svc[0], stds_svc[0] = get_full_score(svc, X_no_nan, y_no_nan)
mses_linear_svc[0], stds_linear_svc[0] = get_full_score(svc_linear, X_no_nan, y_no_nan)

In [None]:
# Index 1 - constant (zero) imputation on the full data.
mses_lr[1], stds_lr[1] = get_impute_zero_score(lr, X, y)
mses_rfc[1], stds_rfc[1] = get_impute_zero_score(rfc, X, y)
mses_ab[1], stds_ab[1] = get_impute_zero_score(ab, X, y)
mses_gnb[1], stds_gnb[1] = get_impute_zero_score(gnb, X, y)
mses_svc[1], stds_svc[1] = get_impute_zero_score(svc, X, y)
mses_linear_svc[1], stds_linear_svc[1] = get_impute_zero_score(svc_linear, X, y)

In [None]:
# Index 2 - mean imputation.
mses_lr[2], stds_lr[2] = get_impute_mean(lr, X, y)
mses_rfc[2], stds_rfc[2] = get_impute_mean(rfc, X, y)
mses_ab[2], stds_ab[2] = get_impute_mean(ab, X, y)
mses_gnb[2], stds_gnb[2] = get_impute_mean(gnb, X, y)
mses_svc[2], stds_svc[2] = get_impute_mean(svc, X, y)
mses_linear_svc[2], stds_linear_svc[2] = get_impute_mean(svc_linear, X, y)

In [None]:
# Index 3 - median imputation.
mses_lr[3], stds_lr[3] = get_impute_median(lr, X, y)
mses_rfc[3], stds_rfc[3] = get_impute_median(rfc, X, y)
mses_ab[3], stds_ab[3] = get_impute_median(ab, X, y)
mses_gnb[3], stds_gnb[3] = get_impute_median(gnb, X, y)
mses_svc[3], stds_svc[3] = get_impute_median(svc, X, y)
mses_linear_svc[3], stds_linear_svc[3] = get_impute_median(svc_linear, X, y)

In [None]:
# Index 4 - mode (most frequent value) imputation.
mses_lr[4], stds_lr[4] = get_impute_mode(lr, X, y)
mses_rfc[4], stds_rfc[4] = get_impute_mode(rfc, X, y)
mses_ab[4], stds_ab[4] = get_impute_mode(ab, X, y)
mses_gnb[4], stds_gnb[4] = get_impute_mode(gnb, X, y)
mses_svc[4], stds_svc[4] = get_impute_mode(svc, X, y)
mses_linear_svc[4], stds_linear_svc[4] = get_impute_mode(svc_linear, X, y)

In [None]:
# Index 5 - k-nearest-neighbours imputation.
mses_lr[5], stds_lr[5] = get_impute_knn_score(lr, X, y)
mses_rfc[5], stds_rfc[5] = get_impute_knn_score(rfc, X, y)
mses_ab[5], stds_ab[5] = get_impute_knn_score(ab, X, y)
mses_gnb[5], stds_gnb[5] = get_impute_knn_score(gnb, X, y)
mses_svc[5], stds_svc[5] = get_impute_knn_score(svc, X, y)
mses_linear_svc[5], stds_linear_svc[5] = get_impute_knn_score(svc_linear, X, y)

In [None]:
# Index 6 - iterative (round-robin) imputation.
mses_lr[6], stds_lr[6] = get_impute_iterative(lr, X, y)
mses_rfc[6], stds_rfc[6] = get_impute_iterative(rfc, X, y)
mses_ab[6], stds_ab[6] = get_impute_iterative(ab, X, y)
mses_gnb[6], stds_gnb[6] = get_impute_iterative(gnb, X, y)
mses_svc[6], stds_svc[6] = get_impute_iterative(svc, X, y)
mses_linear_svc[6], stds_linear_svc[6] = get_impute_iterative(svc_linear, X, y)

In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']


# plot results lr
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
for j in xval:
    ax1.barh(j, mses_lr[j], xerr=stds_lr[j], color=colors[j], alpha=0.6, align='center')
    # Print the score at the end of its bar.
    ax1.text(mses_lr[j] , j + .25, str(mses_lr[j]), color=colors[j], fontweight='bold')

ax1.set_title('LogisticRegression(C=1e5, random_state = 10)')
ax1.set_xlim(left=np.min(mses_lr) * 0.9, right=np.max(mses_lr) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)

# plot results rfc
ax2 = plt.subplot(122)
for j in xval:
    ax2.barh(j, mses_rfc[j], xerr=stds_rfc[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_rfc[j] , j + .25, str(mses_rfc[j]), color=colors[j], fontweight='bold')

ax2.set_title('RandomForestClassifier(n_estimators=100, random_state=10)')
# NOTE(review): the x-range is taken from the left panel's scores (mses_lr),
# presumably so both panels share a scale; bars outside that range are clipped.
ax2.set_xlim(left=np.min(mses_lr) * 0.9, right=np.max(mses_lr) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
# The right panel shares the left panel's row order, so its tick labels are blank.
ax2.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_lr_rfc.pdf")
plt.show()


In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']


# plot results ab
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
for j in xval:
    ax1.barh(j, mses_ab[j], xerr=stds_ab[j], color=colors[j], alpha=0.6, align='center')
    # Print the score at the end of its bar.
    ax1.text(mses_ab[j] , j + .25, str(mses_ab[j]), color=colors[j], fontweight='bold')

ax1.set_title('AdaBoostClassifier(n_estimators=100)')
ax1.set_xlim(left=np.min(mses_ab) * 0.9, right=np.max(mses_ab) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)

# plot results svc
ax2 = plt.subplot(122)
for j in xval:
    ax2.barh(j, mses_svc[j], xerr=stds_svc[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_svc[j] , j + .25, str(mses_svc[j]), color=colors[j], fontweight='bold')

ax2.set_title('SVC(kernel="linear", random_state=10)')
# NOTE(review): the x-range is taken from the left panel's scores (mses_ab),
# presumably so both panels share a scale; bars outside that range are clipped.
ax2.set_xlim(left=np.min(mses_ab) * 0.9, right=np.max(mses_ab) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
# The right panel shares the left panel's row order, so its tick labels are blank.
ax2.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_ab_svc.pdf")
plt.show()



In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']
# plot results gnb
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
for j in xval:
    ax1.barh(j, mses_gnb[j], xerr=stds_gnb[j], color=colors[j], alpha=0.6, align='center')
    # Print the score at the end of its bar.
    ax1.text(mses_gnb[j] , j + .25, str(mses_gnb[j]), color=colors[j], fontweight='bold')

ax1.set_title('GaussianNB()')
ax1.set_xlim(left=np.min(mses_gnb) * 0.9, right=np.max(mses_gnb) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)

# plot results svc_linear
ax2 = plt.subplot(122)
for j in xval:
    ax2.barh(j, mses_linear_svc[j], xerr=stds_linear_svc[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_linear_svc[j] , j + .25, str(mses_linear_svc[j]), color=colors[j], fontweight='bold')

ax2.set_title('LinearSVC(random_state=10)')
# NOTE(review): the x-range is taken from the left panel's scores (mses_gnb),
# presumably so both panels share a scale; bars outside that range are clipped.
ax2.set_xlim(left=np.min(mses_gnb) * 0.9, right=np.max(mses_gnb) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
# The right panel shares the left panel's row order, so its tick labels are blank.
ax2.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_gnb_svc_linear.pdf")
plt.show()


# Same as before but comparing standard and minmax scaling

### StandardScaler

In [None]:
# Standardised copies of the full and complete-case feature matrices.
# NOTE(review): both scalers in this section are fit on all rows before
# cross-validation, so the held-out folds influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_no_nan_scaled = scaler.fit_transform(X_no_nan)

In [None]:
# Min-max scaled copies of the same two matrices.
min_max = MinMaxScaler()
X_min_max = min_max.fit_transform(X)
X_no_nan_min_max = min_max.fit_transform(X_no_nan)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
# Bare string literal kept as a scratch list of estimator alternatives; it has
# no effect on the names bound below.
'''
SVM (linear)":SVC(kernel="linear", random_state=10)
SVC(kernel="rbf", random_state=10)
SVC(kernel="sigmoid", random_state=10)
SVC(kernel="poly", random_state=10)
MultinomialNB()
'''

# NOTE(review): `gnb` and `svc` are rebound to different models from here on
# (Gaussian process instead of GaussianNB, poly kernel instead of linear).
# mses_gnb / mses_svc filled earlier still hold the previous models' scores.
gnb = GaussianProcessClassifier()
svc = SVC(kernel="poly", random_state=10)
svc_linear = LinearSVC(random_state=10)

In [None]:
# Result slots for the KNN classifier (allocated again right before they are filled).
mses_knn = np.zeros(7)
stds_knn = np.zeros(7)

In [None]:
# Result slots for the standard-scaled runs (one entry per strategy).
mses_gnb_scaled = np.zeros(7)
stds_gnb_scaled = np.zeros(7)
mses_svc_scaled = np.zeros(7)
stds_svc_scaled = np.zeros(7)
mses_svc_linear_scaled = np.zeros(7)
stds_svc_linear_scaled = np.zeros(7)

In [None]:
# Result slots for the min-max-scaled runs (one entry per strategy).
mses_gnb_minmax = np.zeros(7)
stds_gnb_minmax = np.zeros(7)
mses_svc_minmax = np.zeros(7)
stds_svc_minmax = np.zeros(7)
mses_svc_linear_minmax = np.zeros(7)
stds_svc_linear_minmax = np.zeros(7)

In [None]:
# Index 0 - baseline on complete rows, standard-scaled.
mses_gnb_scaled[0], stds_gnb_scaled[0] = get_full_score(gnb, X_no_nan_scaled, y_no_nan)
mses_svc_scaled[0], stds_svc_scaled[0] = get_full_score(svc, X_no_nan_scaled, y_no_nan)
mses_svc_linear_scaled[0], stds_svc_linear_scaled[0] = get_full_score(svc_linear, X_no_nan_scaled, y_no_nan)

In [None]:
# Index 0 - baseline on complete rows, min-max scaled.
mses_gnb_minmax[0], stds_gnb_minmax[0] = get_full_score(gnb, X_no_nan_min_max, y_no_nan)
mses_svc_minmax[0], stds_svc_minmax[0] = get_full_score(svc, X_no_nan_min_max, y_no_nan)
mses_svc_linear_minmax[0], stds_svc_linear_minmax[0] = get_full_score(svc_linear, X_no_nan_min_max, y_no_nan)

In [None]:
# Index 1 - zero imputation, standard-scaled. The zero is inserted after
# scaling, so it is a zero in the scaled feature space.
mses_gnb_scaled[1], stds_gnb_scaled[1] = get_impute_zero_score(gnb, X_scaled, y)
mses_svc_scaled[1], stds_svc_scaled[1] = get_impute_zero_score(svc, X_scaled, y)
mses_svc_linear_scaled[1], stds_svc_linear_scaled[1] = get_impute_zero_score(svc_linear, X_scaled, y)

In [None]:
# Index 1 - zero imputation, min-max scaled.
mses_gnb_minmax[1], stds_gnb_minmax[1] = get_impute_zero_score(gnb, X_min_max, y)
mses_svc_minmax[1], stds_svc_minmax[1] = get_impute_zero_score(svc, X_min_max, y)
mses_svc_linear_minmax[1], stds_svc_linear_minmax[1] = get_impute_zero_score(svc_linear, X_min_max, y)

In [None]:
# Index 2 - mean imputation, standard-scaled.
mses_gnb_scaled[2], stds_gnb_scaled[2] = get_impute_mean(gnb, X_scaled, y)
mses_svc_scaled[2], stds_svc_scaled[2] = get_impute_mean(svc, X_scaled, y)
mses_svc_linear_scaled[2], stds_svc_linear_scaled[2] = get_impute_mean(svc_linear, X_scaled, y)

In [None]:
# Index 2 - mean imputation, min-max scaled.
mses_gnb_minmax[2], stds_gnb_minmax[2] = get_impute_mean(gnb, X_min_max, y)
mses_svc_minmax[2], stds_svc_minmax[2] = get_impute_mean(svc, X_min_max, y)
mses_svc_linear_minmax[2], stds_svc_linear_minmax[2] = get_impute_mean(svc_linear, X_min_max, y)

In [None]:
# Index 3 - median imputation, standard-scaled.
mses_gnb_scaled[3], stds_gnb_scaled[3] = get_impute_median(gnb, X_scaled, y)
mses_svc_scaled[3], stds_svc_scaled[3] = get_impute_median(svc, X_scaled, y)
mses_svc_linear_scaled[3], stds_svc_linear_scaled[3] = get_impute_median(svc_linear, X_scaled, y)

In [None]:
# Index 3 - median imputation, min-max scaled.
mses_gnb_minmax[3], stds_gnb_minmax[3] = get_impute_median(gnb, X_min_max, y)
mses_svc_minmax[3], stds_svc_minmax[3] = get_impute_median(svc, X_min_max, y)
mses_svc_linear_minmax[3], stds_svc_linear_minmax[3] = get_impute_median(svc_linear, X_min_max, y)

In [None]:
# Index 4 - mode imputation, standard-scaled.
mses_gnb_scaled[4], stds_gnb_scaled[4] = get_impute_mode(gnb, X_scaled, y)
mses_svc_scaled[4], stds_svc_scaled[4] = get_impute_mode(svc, X_scaled, y)
mses_svc_linear_scaled[4], stds_svc_linear_scaled[4] = get_impute_mode(svc_linear, X_scaled, y)

In [None]:
# Index 4 - mode imputation, min-max scaled.
mses_gnb_minmax[4], stds_gnb_minmax[4] = get_impute_mode(gnb, X_min_max, y)
mses_svc_minmax[4], stds_svc_minmax[4] = get_impute_mode(svc, X_min_max, y)
mses_svc_linear_minmax[4], stds_svc_linear_minmax[4] = get_impute_mode(svc_linear, X_min_max, y)

In [None]:
# Index 5 - KNN imputation, standard-scaled.
mses_gnb_scaled[5], stds_gnb_scaled[5] = get_impute_knn_score(gnb, X_scaled, y)
mses_svc_scaled[5], stds_svc_scaled[5] = get_impute_knn_score(svc, X_scaled, y)
mses_svc_linear_scaled[5], stds_svc_linear_scaled[5] = get_impute_knn_score(svc_linear, X_scaled, y)

In [None]:
# Index 5 - KNN imputation, min-max scaled.
mses_gnb_minmax[5], stds_gnb_minmax[5] = get_impute_knn_score(gnb, X_min_max, y)
mses_svc_minmax[5], stds_svc_minmax[5] = get_impute_knn_score(svc, X_min_max, y)
mses_svc_linear_minmax[5], stds_svc_linear_minmax[5] = get_impute_knn_score(svc_linear, X_min_max, y)

In [None]:
# Index 6 - iterative imputation, standard-scaled.
mses_gnb_scaled[6], stds_gnb_scaled[6] = get_impute_iterative(gnb, X_scaled, y)
mses_svc_scaled[6], stds_svc_scaled[6] = get_impute_iterative(svc, X_scaled, y)
mses_svc_linear_scaled[6], stds_svc_linear_scaled[6] = get_impute_iterative(svc_linear, X_scaled, y)

In [None]:
# Index 6 - iterative imputation, min-max scaled. Means and standard
# deviations both go into the *_minmax arrays, so the standard-scaled results
# at index 6 are left untouched.
mses_gnb_minmax[6], stds_gnb_minmax[6] = get_impute_iterative(gnb, X_min_max, y)
mses_svc_minmax[6], stds_svc_minmax[6] = get_impute_iterative(svc, X_min_max, y)
mses_svc_linear_minmax[6], stds_svc_linear_minmax[6] = get_impute_iterative(svc_linear, X_min_max, y)

In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']
# plot results gnb
# NOTE(review): if the cells are run top to bottom, mses_gnb / stds_gnb were
# filled while `gnb` was still GaussianNB(), so this first panel shows
# GaussianNB scores under a GaussianProcessClassifier title -- confirm intent.
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(131)
for j in xval:
    ax1.barh(j, mses_gnb[j], xerr=stds_gnb[j], color=colors[j], alpha=0.6, align='center')
    # Print the score, rounded to 3 decimals, at the end of its bar.
    ax1.text(mses_gnb[j] , j + .25, str(round(mses_gnb[j],3)), color=colors[j], fontweight='bold')

ax1.set_title('GaussianProcessClassifier()')
ax1.set_xlim(left=np.min(mses_gnb) * 0.9, right=np.max(mses_gnb) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)


# plot results gnb scaled
ax2 = plt.subplot(132)
for j in xval:
    ax2.barh(j, mses_gnb_scaled[j], xerr=stds_gnb_scaled[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_gnb_scaled[j] , j + .25, str(round(mses_gnb_scaled[j],3)), color=colors[j], fontweight='bold')

ax2.set_title('(Scaled)')
# All three panels use the first panel's x-range.
ax2.set_xlim(left=np.min(mses_gnb) * 0.9, right=np.max(mses_gnb) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
ax2.set_yticklabels([''] * n_bars)

# plot results gnb minmax
ax3 = plt.subplot(133)
for j in xval:
    ax3.barh(j, mses_gnb_minmax[j], xerr=stds_gnb_minmax[j], color=colors[j], alpha=0.6, align='center')
    ax3.text(mses_gnb_minmax[j] , j + .25, str(round(mses_gnb_minmax[j],3)), color=colors[j], fontweight='bold')

ax3.set_title('(Normalized)')
ax3.set_xlim(left=np.min(mses_gnb) * 0.9, right=np.max(mses_gnb) * 1.1)
ax3.set_yticks(xval)
ax3.set_xlabel('cross_val_score accuracy')
ax3.invert_yaxis()
ax3.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_gpc_scaled_minmax.pdf")
plt.show()

In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']
# plot results svc
# NOTE(review): if the cells are run top to bottom, mses_svc / stds_svc were
# filled while `svc` was still SVC(kernel="linear"), so this first panel shows
# linear-kernel scores under a poly-kernel title -- confirm intent.
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(131)
for j in xval:
    ax1.barh(j, mses_svc[j], xerr=stds_svc[j], color=colors[j], alpha=0.6, align='center')
    # Print the score, rounded to 3 decimals, at the end of its bar.
    ax1.text(mses_svc[j] , j + .25, str(round(mses_svc[j],3)), color=colors[j], fontweight='bold')

ax1.set_title('SVC(kernel="poly", random_state=10)')
ax1.set_xlim(left=np.min(mses_svc) * 0.9, right=np.max(mses_svc) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)


# plot results svc scaled
ax2 = plt.subplot(132)
for j in xval:
    ax2.barh(j, mses_svc_scaled[j], xerr=stds_svc_scaled[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_svc_scaled[j] , j + .25, str(round(mses_svc_scaled[j],3)), color=colors[j], fontweight='bold')

ax2.set_title('(Scaled)')
# All three panels use the first panel's x-range.
ax2.set_xlim(left=np.min(mses_svc) * 0.9, right=np.max(mses_svc) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
ax2.set_yticklabels([''] * n_bars)

# plot results svc minmax
ax3 = plt.subplot(133)
for j in xval:
    ax3.barh(j, mses_svc_minmax[j], xerr=stds_svc_minmax[j], color=colors[j], alpha=0.6, align='center')
    ax3.text(mses_svc_minmax[j] , j + .25, str(round(mses_svc_minmax[j],3)), color=colors[j], fontweight='bold')

ax3.set_title('(Normalized)')
ax3.set_xlim(left=np.min(mses_svc) * 0.9, right=np.max(mses_svc) * 1.1)
ax3.set_yticks(xval)
ax3.set_xlabel('cross_val_score accuracy')
ax3.invert_yaxis()
ax3.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_svc_poly_scaled_minmax.pdf")
plt.show()

In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']
# plot results svc_linear
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(131)
for j in xval:
    ax1.barh(j, mses_linear_svc[j], xerr=stds_linear_svc[j], color=colors[j], alpha=0.6, align='center')
    # Print the score, rounded to 3 decimals, at the end of its bar.
    ax1.text(mses_linear_svc[j] , j + .25, str(round(mses_linear_svc[j],3)), color=colors[j], fontweight='bold')

ax1.set_title('LinearSVC(random_state=10)')
ax1.set_xlim(left=np.min(mses_linear_svc) * 0.9, right=np.max(mses_linear_svc) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)


# plot results svc_linear scaled
ax2 = plt.subplot(132)
for j in xval:
    ax2.barh(j, mses_svc_linear_scaled[j], xerr=stds_svc_linear_scaled[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_svc_linear_scaled[j] , j + .25, str(round(mses_svc_linear_scaled[j],3)), color=colors[j], fontweight='bold')

ax2.set_title('(Scaled)')
# All three panels use the first panel's (LinearSVC) x-range, not the
# range of the unrelated SVC scores.
ax2.set_xlim(left=np.min(mses_linear_svc) * 0.9, right=np.max(mses_linear_svc) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
ax2.set_yticklabels([''] * n_bars)

# plot results svc_linear minmax
ax3 = plt.subplot(133)
for j in xval:
    ax3.barh(j, mses_svc_linear_minmax[j], xerr=stds_svc_linear_minmax[j], color=colors[j], alpha=0.6, align='center')
    ax3.text(mses_svc_linear_minmax[j] , j + .25, str(round(mses_svc_linear_minmax[j],3)), color=colors[j], fontweight='bold')

ax3.set_title('(Normalized)')
ax3.set_xlim(left=np.min(mses_linear_svc) * 0.9, right=np.max(mses_linear_svc) * 1.1)
ax3.set_yticks(xval)
ax3.set_xlabel('cross_val_score accuracy')
ax3.invert_yaxis()
ax3.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_svc_linear_scaled_minmax.pdf")
plt.show()

In [None]:
# k-nearest-neighbours classifier evaluated on raw, standard-scaled and
# min-max-scaled features.
knn = KNeighborsClassifier(n_neighbors = 6)

In [None]:
# One slot per imputation strategy for each of the three feature variants.
mses_knn = np.zeros(7)
stds_knn = np.zeros(7)
mses_knn_scaled = np.zeros(7)
stds_knn_scaled = np.zeros(7)
mses_knn_minmax = np.zeros(7)
stds_knn_minmax = np.zeros(7)

In [None]:
# Index 0 - baseline on complete rows (no imputation).
mses_knn[0], stds_knn[0] = get_full_score(knn, X_no_nan, y_no_nan)
mses_knn_scaled[0], stds_knn_scaled[0] = get_full_score(knn, X_no_nan_scaled, y_no_nan)
mses_knn_minmax[0], stds_knn_minmax[0] = get_full_score(knn, X_no_nan_min_max, y_no_nan)

In [None]:
# Index 1 - zero imputation.
mses_knn[1], stds_knn[1] = get_impute_zero_score(knn, X, y)
mses_knn_scaled[1], stds_knn_scaled[1] = get_impute_zero_score(knn, X_scaled, y)
mses_knn_minmax[1], stds_knn_minmax[1] = get_impute_zero_score(knn, X_min_max, y)

In [None]:
# Index 2 - mean imputation.
mses_knn[2], stds_knn[2] = get_impute_mean(knn, X, y)
mses_knn_scaled[2], stds_knn_scaled[2] = get_impute_mean(knn, X_scaled, y)
mses_knn_minmax[2], stds_knn_minmax[2] = get_impute_mean(knn, X_min_max, y)

In [None]:
# Index 3 - median imputation.
mses_knn[3], stds_knn[3] = get_impute_median(knn, X, y)
mses_knn_scaled[3], stds_knn_scaled[3] = get_impute_median(knn, X_scaled, y)
mses_knn_minmax[3], stds_knn_minmax[3] = get_impute_median(knn, X_min_max, y)

In [None]:
# Index 4 - mode imputation.
mses_knn[4], stds_knn[4] = get_impute_mode(knn, X, y)
mses_knn_scaled[4], stds_knn_scaled[4] = get_impute_mode(knn, X_scaled, y)
mses_knn_minmax[4], stds_knn_minmax[4] = get_impute_mode(knn, X_min_max, y)

In [None]:
# Index 5 - KNN imputation.
mses_knn[5], stds_knn[5] = get_impute_knn_score(knn, X, y)
mses_knn_scaled[5], stds_knn_scaled[5] = get_impute_knn_score(knn, X_scaled, y)
mses_knn_minmax[5], stds_knn_minmax[5] = get_impute_knn_score(knn, X_min_max, y)

In [None]:
# Index 6 - iterative imputation.
mses_knn[6], stds_knn[6] = get_impute_iterative(knn, X, y)
mses_knn_scaled[6], stds_knn_scaled[6] = get_impute_iterative(knn, X_scaled, y)
mses_knn_minmax[6], stds_knn_minmax[6] = get_impute_iterative(knn, X_min_max, y)

In [None]:
import matplotlib.pyplot as plt


# One horizontal bar per imputation strategy; x_labels and every score array
# have one entry per strategy.
n_bars = len(x_labels)
xval = np.arange(n_bars)

colors = ['r', 'g', 'b', 'orange', 'black', 'skyblue', 'darkslategray']
# plot results knn
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(131)
for j in xval:
    ax1.barh(j, mses_knn[j], xerr=stds_knn[j], color=colors[j], alpha=0.6, align='center')
    # Print the score, rounded to 3 decimals, at the end of its bar.
    ax1.text(mses_knn[j] , j + .25, str(round(mses_knn[j],3)), color=colors[j], fontweight='bold')

ax1.set_title('KNeighborsClassifier(n_neighbors = 6)')
ax1.set_xlim(left=np.min(mses_knn) * 0.9, right=np.max(mses_knn) * 1.1)
ax1.set_yticks(xval)
ax1.set_xlabel('cross_val_score accuracy')
ax1.invert_yaxis()
ax1.set_yticklabels(x_labels)


# plot results knn scaled
ax2 = plt.subplot(132)
for j in xval:
    ax2.barh(j, mses_knn_scaled[j], xerr=stds_knn_scaled[j], color=colors[j], alpha=0.6, align='center')
    ax2.text(mses_knn_scaled[j] , j + .25, str(round(mses_knn_scaled[j],3)), color=colors[j], fontweight='bold')

ax2.set_title('(Scaled)')
# All three panels use the first panel's x-range.
ax2.set_xlim(left=np.min(mses_knn) * 0.9, right=np.max(mses_knn) * 1.1)
ax2.set_yticks(xval)
ax2.set_xlabel('cross_val_score accuracy')
ax2.invert_yaxis()
ax2.set_yticklabels([''] * n_bars)

# plot results knn minmax
ax3 = plt.subplot(133)
for j in xval:
    ax3.barh(j, mses_knn_minmax[j], xerr=stds_knn_minmax[j], color=colors[j], alpha=0.6, align='center')
    ax3.text(mses_knn_minmax[j] , j + .25, str(round(mses_knn_minmax[j],3)), color=colors[j], fontweight='bold')

ax3.set_title('(Normalized)')
ax3.set_xlim(left=np.min(mses_knn) * 0.9, right=np.max(mses_knn) * 1.1)
ax3.set_yticks(xval)
ax3.set_xlabel('cross_val_score accuracy')
ax3.invert_yaxis()
ax3.set_yticklabels([''] * n_bars)
plt.savefig("figures_python/imputation_techniques/imputation_techniques_knn_scaled_minmax.pdf")
plt.show()

The numeric data is standard-scaled after mean-imputation, while the categorical data is one-hot encoded after imputing missing values with a new category.

Numeric Features:

* Age: float
* BI-RADS: float
* Margin: float
* Density: float

Categorical Features:
* Shape: categories encoded as strings {'L', 'R', 'I', 'N', 'O'}
* Severity (target): categories encoded as strings {'maligno', 'benigno'}
