# Pipeline

In [None]:
import os
import io
import sys
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
% matplotlib inline
py.init_notebook_mode()

sys.path.append('../..')

random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/uciml/pima-indians-diabetes-database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
# Read the Pima Indians diabetes CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../datasets/pima-indians-diabetes.csv')

In [None]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head(n=5)

## Pipeline definieren

In [None]:
# from imblearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors  import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.combine import SMOTETomek
from utils.transformer import ItemSelector

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Prefer its replacement `SimpleImputer` (which always works
# column-wise, i.e. the old `axis=0`) and only fall back to the legacy class
# on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
    zero_imputer = SimpleImputer(missing_values=0, strategy='mean')
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
    zero_imputer = Imputer(missing_values=0, strategy='mean', axis=0)

# Columns in which a value of 0 is treated as "missing" and replaced by the
# column mean.
features_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Full model pipeline:
#   1. 'union'    - concatenates two feature branches:
#        'imputed'     : the columns above, with zeros mean-imputed
#        'not_imputed' : 'Pregnancies', passed through unchanged
#   2. 'scaler'   - rescales every feature to the range [0, 1]
#   3. 'classify' - k-nearest-neighbours classifier with k=5
# ItemSelector is a project-local transformer; presumably it picks the named
# column(s) from the input DataFrame (confirm in utils/transformer.py).
pipe = Pipeline([
    ('union',  FeatureUnion([
        ('imputed', Pipeline([
                ('selector', ItemSelector(features_to_impute)),
                ('impute_nan', zero_imputer),
            ])),
        # NOTE(review): a plain string key may yield 1-D output depending on
        # how ItemSelector indexes; FeatureUnion needs 2-D blocks - verify.
        ('not_imputed', ItemSelector('Pregnancies'))
        ])),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('classify', KNeighborsClassifier(n_neighbors=5, metric='minkowski'))
])

## Pipeline testen

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining feature columns.
df_y = df['Outcome']
df_X = df.drop(columns='Outcome')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Score the pipeline with 10-fold cross-validation (one score per fold,
# using the estimator's default scorer).
scores = cross_val_score(estimator=pipe, X=df_X, y=df_y, cv=10)

# Report the mean score together with a +/- two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Collect the out-of-fold prediction for every sample (10 folds).
y_pred = cross_val_predict(estimator=pipe, X=df_X, y=df_y, cv=10)

# Per-class precision / recall / F1 summary of those predictions.
report = classification_report(
    y_true=df_y,
    y_pred=y_pred,
    target_names=['Outcome = 0', 'Outcome = 1'],
)
print(report)

In [None]:
from utils.plots import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Recompute the out-of-fold predictions so this cell can run on its own.
y_pred = cross_val_predict(estimator=pipe, X=df_X, y=df_y, cv=10)

# Limit numpy's printed float precision to two decimals.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=df_y, y_pred=y_pred)

# Draw the confusion matrix via the project helper (normalize=True is passed
# through to it) on a large figure.
plt.figure(figsize=(15, 10))
plot_confusion_matrix(
    cnf_matrix,
    classes=['Outcome = 0', 'Outcome = 1'],
    normalize=True,
    title='Confusion Matrix',
)

plt.show()

In [None]:
from utils.plots import plot_learning_curve

# Plot a learning curve for the pipeline using 5-fold CV at five training-set
# fractions evenly spaced between 10% and 100%. The trailing semicolon
# suppresses the notebook's echo of the return value.
plot_learning_curve(
    pipe,
    'Lernkurve',
    df_X,
    df_y,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, num=5),
);