# A Common processing pipeline

1. Impute missing values (e.g. using median)
2. Transform features (e.g. using a polynomial)
3. Fit a linear regression

In [1]:
import numpy as np
from numpy import nan

X = np.array([[nan,0,3],[3,7,9],[3,5,2],[4,nan,6],[8,8,1]])
y = np.array([14,16,-1,8,-5])

In [2]:
# impute the missing values using the mean
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
Ximp = imp.fit_transform(X)
Ximp

array([[4.5, 0. , 3. ],
       [3. , 7. , 9. ],
       [3. , 5. , 2. ],
       [4. , 5. , 6. ],
       [8. , 8. , 1. ]])

In [3]:
# Transform the feature space using a polynomial, degree = 3
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
Ximp_trans = poly.fit_transform(Ximp)

In [4]:
# fit a linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(Ximp_trans,y)
yfit=model.predict(Ximp_trans)

In [5]:
# create some test data
Xtest = np.array([[nan,1,4],[4,8,10],[4,6,3],[5,nan,7],[9,9,2]])
ytest = np.array([15,17,0,9,-4])

In [6]:
# impute the missing values
Xtest_imp = imp.fit_transform(Xtest)
Xtest_imp

array([[ 5.5,  1. ,  4. ],
       [ 4. ,  8. , 10. ],
       [ 4. ,  6. ,  3. ],
       [ 5. ,  6. ,  7. ],
       [ 9. ,  9. ,  2. ]])

In [7]:
# Transform the feature space
Xtest_imp_trans = poly.fit_transform(Xtest_imp)

In [8]:
preds = model.predict(Xtest_imp_trans)
print(preds)

[ 20.20852346  16.72892509  -5.3003913    8.94238088 -11.11390686]


In [9]:
model.score(Xtest_imp_trans, ytest)

0.6859213020169925

In [10]:
from sklearn.pipeline import make_pipeline
model = make_pipeline(SimpleImputer(strategy='mean'), PolynomialFeatures(degree=3),LinearRegression())

In [11]:
print(y)
model.fit(X,y)
preds=model.predict(Xtest)
print(preds)

[14 16 -1  8 -5]
[ 15.05233793  16.72892509  -5.3003913   16.16457818 -11.11390686]


In [12]:
model.score(Xtest,ytest)

0.6141482204733815