# Machine learning pipeline in Python

- [Original article](https://medium.com/@yanhann10/a-brief-view-of-machine-learning-pipeline-in-python-5f50b941fca8)

In [21]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
# train_test_split lives in model_selection; sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in favour of
# sklearn.impute.SimpleImputer; fall back to the legacy class on older installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

In [22]:
# Load the iris dataset and hold out 30% of the samples as a test set.
iris = load_iris()
X = iris.data
y = iris.target
# Seed the split so the train/test partition (and every score computed from it)
# is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)

In [23]:
# Pipeline with a single classifier (decision tree).
# SimpleImputer treats NaN as missing by default and fills each column with its
# most frequent value, which is what the removed
# Imputer(missing_values='NaN', strategy='most_frequent', axis=0) did.
steps = [('imputation', SimpleImputer(strategy='most_frequent')),
         ('clf', DecisionTreeClassifier())]

pipeline = Pipeline(steps)
# Pipeline.fit returns the pipeline itself, so `clf` is the fitted pipeline.
clf = pipeline.fit(X_train, y_train)

In [28]:
# Build one imputation + classifier pipeline per candidate classifier.
classifiers = [
    KNeighborsClassifier(5),
    RandomForestClassifier()]

# Keep every pipeline (keyed by classifier class name) instead of discarding all
# but the last one. The loop variable is `classifier` rather than `clf` so the
# fitted pipeline stored in `clf` by the previous cell is not overwritten.
pipelines = {}
for classifier in classifiers:
    steps = [('imputation', SimpleImputer(strategy='most_frequent')),
             ('clf', classifier)]
    # `pipeline` ends up bound to the last pipeline built (RandomForestClassifier),
    # exactly as before, so later cells that read it keep working.
    pipeline = Pipeline(steps)
    pipelines[type(classifier).__name__] = pipeline

In [29]:
# 10-fold cross-validation of `pipeline` on the training split.
# KFold only honours random_state when shuffle=True; without it the seed is
# ignored and recent scikit-learn releases raise ValueError.
kfold = KFold(n_splits=10, shuffle=True, random_state=1234)
results = cross_val_score(pipeline, X_train, y_train, cv=kfold)

# Mean of the per-fold scores.
print(results.mean())

0.9318181818181819
