## Import Dataset

In [None]:
import pandas as pd
import numpy as np
# Read the Iris data from the CSV file in the current working directory.
df = pd.read_csv("Iris.csv")
# Final expression of the cell: the notebook displays the first five rows.
df.head(n=5)

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: the first four columns of the frame, selected by position.
# NOTE(review): if Iris.csv begins with an identifier column, this slice takes
# it as a feature and leaves out the fourth measurement — confirm the header.
x = df.iloc[:, :4]
# Target vector: the Species column.
y = df["Species"]
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=1)

## Building a Simple Pipeline

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# Two-step pipeline: min-max scaling followed by a 3-nearest-neighbours
# classifier. fit() returns the pipeline itself, so it can be chained.
pipe1 = Pipeline(
    steps=[
        ('minmax', MinMaxScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=3)),
    ]
).fit(x_train, y_train)

# Accuracy of the fitted pipeline on the held-out test split.
score = pipe1.score(x_test, y_test)
print('kNN pipeline test accuracy:', score)

## Building Multiple Pipelines

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
# Rebuild the feature matrix (first four columns by position) and the target.
# NOTE(review): a leading identifier column in Iris.csv would be picked up as
# a feature by this positional slice — confirm the header.
x = df.iloc[:, :4]
y = df["Species"]
# Same 80/20 split and seed as before; the later cells read X_train / X_test
# (capital X), so those names are kept as-is.
(X_train, X_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=1)

# Three pipelines that share the same preprocessing (standardise, then project
# onto two principal components) and differ only in the final classifier.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression()),
])

pipe_nb = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', GaussianNB()),
])

pipe_knn = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', KNeighborsClassifier()),
])

# Ordered collection of the pipelines so they can be iterated uniformly.
pipelines = [pipe_lr, pipe_nb, pipe_knn]

# Human-readable classifier names keyed by each pipeline's position in
# `pipelines` (0, 1, 2).
pipe_dict = dict(enumerate(['Logistic Regression', 'Naive Bayes', 'KNearest']))

# Train every pipeline on the training split.
for pipeline in pipelines:
    pipeline.fit(X_train, y_train)

# Report each fitted pipeline's accuracy on the held-out test split.
for position, fitted in enumerate(pipelines):
    label = pipe_dict[position]
    accuracy = fitted.score(X_test, y_test)
    print(label, 'Pipeline test accuracy:', accuracy)

# Identify the most accurate model on test data.
# Each fitted pipeline is scored exactly once; the scores are then reused for
# the comparison instead of calling score() again for the winner.
test_scores = [model.score(X_test, y_test) for model in pipelines]

# max() returns the first index holding the highest score, so ties resolve to
# the earliest pipeline. Selecting by index also guarantees that best_pipe is
# always a real pipeline object, even if every accuracy is 0.0.
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]

# NOTE: the winner is chosen on the test split itself, so best_acc is an
# optimistic estimate of how the model will do on unseen data.
print()
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

## Saving the best model for later use

In [None]:
# Save pipeline to file
# `sklearn.externals.joblib` was removed from scikit-learn; joblib is now a
# standalone package that scikit-learn itself depends on. The legacy import is
# kept only as a fallback for very old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# compress=1 applies light compression to the file written to disk.
joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
print('Saved %s pipeline to file' % pipe_dict[best_clf])

## Load the model whenever you want

In [None]:
# Load the saved pipeline from disk and re-evaluate it.
# joblib is imported here so this cell does not rely on the save cell having
# been executed in the same session; the legacy path is a fallback for very
# old scikit-learn environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that come from a trusted source.
loaded_model = joblib.load("best_pipeline.pkl")

# X_test / y_test must already be defined by the train-test split cell.
result = loaded_model.score(X_test, y_test)
print(result)