In [None]:
import os
import io
import sys
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
% matplotlib inline
py.init_notebook_mode()

sys.path.append('../..')

random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/uciml/pima-indians-diabetes-database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
# Read the Pima Indians Diabetes CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../datasets/pima-indians-diabetes.csv')

In [None]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Target vector: the 'Outcome' column.
df_y = df.loc[:, 'Outcome']
# Feature matrix: every column except the target.
df_X = df.drop(labels=['Outcome'], axis='columns')

## Pipeline definieren

In [None]:
# Alternative imports kept for reference (imblearn's Pipeline would be the
# one to use if a resampling step such as SMOTETomek were added):
# from imblearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors  import KNeighborsClassifier
from imblearn.combine import SMOTETomek
from utils.transformer import ItemSelector

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement and
# always imputes column-wise (the old `axis=0` behaviour). Fall back to the
# legacy class only when running on an old scikit-learn.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

# Columns in which a value of 0 is treated as "missing" and replaced by the
# column mean. 'Pregnancies' is deliberately not in this list — presumably
# because 0 is a valid value there.
features_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Full model pipeline:
#   1. 'union'    - concatenates the mean-imputed columns with the untouched
#                   'Pregnancies' column (ItemSelector is project-local;
#                   presumably it selects the named column(s) from the frame).
#   2. 'scaler'   - rescales every feature to the range [0, 1].
#   3. 'classify' - k-nearest-neighbours classifier; its hyper-parameters are
#                   placeholders that a parameter search can override.
pipe = Pipeline([
    ('union',  FeatureUnion([
        ('imputed', Pipeline([
                ('selector', ItemSelector(features_to_impute)),
                # Both SimpleImputer and the legacy Imputer default to
                # column-wise imputation, so no `axis` argument is needed.
                ('impute_nan', SimpleImputer(missing_values=0, strategy='mean')),
            ])),
        ('not_imputed', ItemSelector('Pregnancies'))
        ])),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('classify', KNeighborsClassifier(n_neighbors=5, metric='minkowski'))
])

## Suche nach den besten Parametern

### Grid definieren

In [None]:
# Search space for the 'classify' step of the pipeline: a k-NN classifier
# evaluated over several neighbourhood sizes and distance metrics.
# Note: with KNeighborsClassifier's default p=2, 'minkowski' is the Euclidean
# distance, so the 'minkowski' and 'euclidean' entries evaluate the same model.
param_grid = [{
    'classify': [KNeighborsClassifier(metric='minkowski', n_neighbors=5)],
    'classify__n_neighbors': [2, 5, 10, 15, 20],
    'classify__metric': ['minkowski', 'euclidean', 'manhattan'],
}]

[Liste der Distanzmetriken](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.DistanceMetric.html)

### Grid Search ausführen

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation and using all available CPU cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
# The trailing semicolon suppresses the fitted estimator's repr in the output.
grid.fit(df_X, df_y);

[Liste der Scoring-Parameter](http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

### Auswertung

In [None]:
# Mean cross-validated accuracy of the best parameter combination found.
grid.best_score_ 

In [None]:
# Parameter combination that achieved the highest mean cross-validated score.
grid.best_params_ 