In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn import datasets
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import KNNImputer, SimpleImputer
from pca_imputer import PCAImputer

In [4]:
X, y = datasets.fetch_california_housing(return_X_y=True, as_frame=True)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
np.random.seed(42)
nans = np.random.binomial(1, 0.05, size = X.shape,) == 1
X_nan = X.where(~nans, np.nan)
X_nan

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [15]:
imputer_simple = SimpleImputer()
imputer_knn = KNNImputer(n_neighbors=2)
imputer_pca = PCAImputer(n_components=5)

In [16]:
X_simple_filled = pd.DataFrame(imputer_simple.fit_transform(X_nan), columns=X_nan.columns, index=X_nan.index)

In [17]:
X_knn_filled = pd.DataFrame(imputer_knn.fit_transform(X_nan), columns=X_nan.columns, index=X_nan.index)

In [18]:
X_pca_filled = pd.DataFrame(imputer_pca.fit_transform(X_nan), columns=X_nan.columns, index=X_nan.index)

In [19]:
mean_squared_error(X, X_pca_filled, multioutput='raw_values') / mean_squared_error(X, X_knn_filled, multioutput='raw_values')

array([1.26367012, 0.92957916, 0.34456485, 1.04554945, 0.7280415 ,
       0.98195819, 2.24764567, 0.53495875])

In [20]:
mean_squared_error(X, X_pca_filled, multioutput='uniform_average') / mean_squared_error(X, X_knn_filled, multioutput='uniform_average')

0.7280721871142055

In [22]:
r2_score(X, X_pca_filled, multioutput='uniform_average'), r2_score(X, X_knn_filled, multioutput='uniform_average')


(0.9730128979897437, 0.9724185565956475)

In [24]:
%%timeit
X_knn_filled = pd.DataFrame(imputer_knn.fit_transform(X_nan), columns=X_nan.columns, index=X_nan.index)

5.76 s ± 70.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit
X_pca_filled = pd.DataFrame(imputer_pca.fit_transform(X_nan), columns=X_nan.columns, index=X_nan.index)

171 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
