# A Common processing pipeline

1. Impute missing values (e.g. using median)
2. Transform features (e.g. using a polynomial)
3. Fit a linear regression

In [19]:
import numpy as np
from numpy import nan

X = np.array([[nan,0,3],[3,7,9],[3,5,2],[4,nan,6],[8,8,1]])
y = np.array([14,16,-1,8,-5])

In [2]:
# impute the missing values using the mean
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
Ximp = imp.fit_transform(X)
Ximp

array([[4.5, 0. , 3. ],
       [3. , 7. , 9. ],
       [3. , 5. , 2. ],
       [4. , 5. , 6. ],
       [8. , 8. , 1. ]])

In [3]:
# Transform the feature space using a polynomial, degree = 3
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
Ximp_trans = poly.fit_transform(Ximp)

In [22]:
# fit a linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(Ximp_trans,y)
yfit=model.predict(Ximp_trans)

[[  1.      4.5     0.      3.     20.25    0.     13.5     0.      0.
    9.     91.125   0.     60.75    0.      0.     40.5     0.      0.
    0.     27.   ]
 [  1.      3.      7.      9.      9.     21.     27.     49.     63.
   81.     27.     63.     81.    147.    189.    243.    343.    441.
  567.    729.   ]
 [  1.      3.      5.      2.      9.     15.      6.     25.     10.
    4.     27.     45.     18.     75.     30.     12.    125.     50.
   20.      8.   ]
 [  1.      4.      5.      6.     16.     20.     24.     25.     30.
   36.     64.     80.     96.    100.    120.    144.    125.    150.
  180.    216.   ]
 [  1.      8.      8.      1.     64.     64.      8.     64.      8.
    1.    512.    512.     64.    512.     64.      8.    512.     64.
    8.      1.   ]]


In [9]:
# create some test data
Xtest = np.array([[nan,1,4],[4,8,10],[4,6,3],[5,nan,7],[9,9,2]])
ytest = np.array([15,17,0,9,-4])

In [10]:
# impute the missing values
Xtest_imp = imp.fit_transform(Xtest)
Xtest_imp

array([[ 5.5,  1. ,  4. ],
       [ 4. ,  8. , 10. ],
       [ 4. ,  6. ,  3. ],
       [ 5. ,  6. ,  7. ],
       [ 9. ,  9. ,  2. ]])

In [11]:
# Transform the feature space
Xtest_imp_trans = poly.fit_transform(Xtest_imp)

In [12]:
preds = model.predict(Xtest_imp_trans)
print(preds)

[ 20.20852346  16.72892509  -5.3003913    8.94238088 -11.11390686]


In [13]:
model.score(Xtest_imp_trans, ytest)

0.6859213020169925

In [15]:
from sklearn.pipeline import make_pipeline
model = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=3),LinearRegression())

In [24]:
print(y)
model.fit(X,y)
preds=model.predict(Xtest)
print(preds)

[14 16 -1  8 -5]


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
model.score(Xtest,ytest)

0.6141482204733815