# Workflow

In [None]:
import os
import io
import sys
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
% matplotlib inline
py.init_notebook_mode()

random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/uciml/pima-indians-diabetes-database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
# Read the Pima Indians diabetes CSV; the path is relative to the directory the
# notebook is executed from.
dataset_path = '../../datasets/pima-indians-diabetes.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the freshly loaded frame (shown as the cell output).
df.head()

# Vorverarbeitung (Pre-Processing)

## Behandlung von fehlenden Werten

### Null-Werte ersetzen

In [None]:
# Columns in which a literal 0 is treated as "value missing" and converted to
# NaN so the later missing-value handling can pick it up.
column_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# One vectorised assignment instead of a per-column `replace(..., inplace=True)`:
# the in-place call acted on a column selected out of the frame, which pandas
# does not guarantee to write back to `df` (it is a no-op under copy-on-write).
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
df[column_list] = df[column_list].replace(to_replace=0, value=np.nan)

### Anzahl der fehlenden Werte ermitteln

In [None]:
# Share of missing entries per column, expressed in percent of all rows.
missing_pct = df.isnull().sum() / df.shape[0] * 100

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
sns.barplot(x=df.columns, y=missing_pct);
ax.set(xlabel='Merkmale', ylabel='Anteil fehlender Werte in Prozent');

# Write each bar's value just above its top edge, centred horizontally.
for bar in ax.patches:
    bbox = bar.get_bbox()
    x_center = (bbox.x0 + bbox.x1) / 2
    bar_top = bbox.y1
    ax.annotate('{:3.0f}%'.format(bar_top), (x_center, bar_top), ha='center', va='bottom')

### Fehlende Werte ersetzen

In [None]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22. Its
# replacement, `sklearn.impute.SimpleImputer`, always imputes column-wise (so
# there is no `axis` argument) and takes `np.nan` as the missing-value marker
# instead of the string 'NaN'.
from sklearn.impute import SimpleImputer

# Keep the pre-imputation frame around; its column labels are reused below.
df_tmp = df.copy()

# Replace every NaN with the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imp.fit_transform(df_tmp.values)

# fit_transform was given a bare array and returns one, so the frame is rebuilt
# with the original column labels.
df = pd.DataFrame(imputed_data, columns=df_tmp.columns)

In [None]:
# Count the remaining NaNs per column and print them under a heading line.
missing_counts = df.isnull().sum()
print('Anzahl fehlender Werte pro Merkmal:', missing_counts, sep='\n')

## Skalieren der Werte

In [None]:
# Distribution of the values before scaling. The plot is left disabled here;
# uncomment the line below to compare it with the box plot drawn after scaling.
# df.plot(kind='box', figsize=(12, 8));

### Skalierung anwenden

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Boolean mask selecting every column except the target 'Outcome'.
is_feature = df.columns != 'Outcome'

# Rescale each feature column to the range [0, 1].
# NOTE(review): the scaler is fit on all rows of `df`, before the train/test
# split performed later in the notebook - confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_data = scaler.fit_transform(df.loc[:, is_feature].values)

# Write the scaled values back into the feature columns; 'Outcome' is untouched.
df.loc[:, is_feature] = normalized_data

### Verteilung der Werte nach der Skalierung

In [None]:
# Box plot of every column; the trailing semicolon suppresses the Axes repr.
df.plot.box(figsize=(12, 8));

# Aufteilung in Trainings- und Testdaten (Sampling)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
df_y = df['Outcome']
df_X = df.drop('Outcome', axis=1)

# Hold out 33% of the rows for testing; the shared seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.33,
    random_state=random_state,
)

# Trainieren & Evaluieren des Modells

## Einfacher Ansatz

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Train a 5-nearest-neighbours classifier on the training split; fit() hands
# back the estimator, so construction and fitting can be chained.
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy on them.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('Accuracy: {:.2f}'.format(test_accuracy))

Accuracy: Anteil der Testbeispiele, die richtig klassifiziert wurden.

In [None]:
from sklearn.metrics import classification_report

# Display names for the two values of the target column.
class_labels = ['Outcome = 0', 'Outcome = 1']

# Text report comparing the held-out labels with the predictions.
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

## Kreuzvalidierungsverfahren (Cross-validation)

[Kreuzvalidierungsverfahren](https://de.wikipedia.org/wiki/Kreuzvalidierungsverfahren)

### Accuracy

[sklearn.model_selection.cross_val_score](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

clf = KNeighborsClassifier(n_neighbors=5)

# 10-fold cross-validation over the full feature/target data.
scores = cross_val_score(clf, df_X, df_y, cv=10)

# Summarise the per-fold scores as mean and two standard deviations.
mean_score = scores.mean()
two_sigma = scores.std() * 2

print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, two_sigma))

### Precision, Recall & F1-Score

[sklearn.metrics.classification_report](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) & [sklearn.model_selection.cross_val_predict](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Predictions for all rows, obtained through 10-fold cross-validation.
clf = KNeighborsClassifier(n_neighbors=5)
y_pred = cross_val_predict(clf, df_X, df_y, cv=10)

# Text report comparing the true labels with the cross-validated predictions.
class_labels = ['Outcome = 0', 'Outcome = 1']
report = classification_report(df_y, y_pred, target_names=class_labels)
print(report)

### Wahrheitsmatrix (Confusion Matrix)

In [None]:
# Put the directory two levels up on the import path so the project-local
# `utils` package can be imported. Guarded so that re-running the cell does not
# append the same entry again.
project_root = '../..'
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.plots import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
# Imported here as well so the cell does not rely on an earlier cell having
# brought `cross_val_predict` into the namespace.
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier

# Predictions for all rows, obtained through 10-fold cross-validation.
clf = KNeighborsClassifier(n_neighbors=5)
y_pred = cross_val_predict(clf, df_X, df_y, cv=10)

# Confusion matrix of the true labels against the predictions.
cnf_matrix = confusion_matrix(df_y, y_pred)
# Limit printed numpy floats to two decimals.
np.set_printoptions(precision=2)

# NOTE(review): plot_confusion_matrix is a project helper; it presumably draws
# onto the figure created here - confirm against utils/plots.
plt.figure(figsize=(15, 10))
plot_confusion_matrix(cnf_matrix, classes=['Outcome = 0', 'Outcome = 1'], normalize=True,
                      title='Confusion Matrix')

plt.show()

### Lernkurve

In [None]:
from utils.plots import plot_learning_curve
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)

# Five evenly spaced values from 0.1 to 1.0, passed to the helper as `train_sizes`.
size_grid = np.linspace(.1, 1.0, 5)

# NOTE(review): plot_learning_curve is a project helper; the trailing semicolon
# hides whatever it returns from the cell output.
plot_learning_curve(clf, 'Lernkurve', df_X, df_y, cv=5, train_sizes=size_grid);