# Overview

In [None]:
import os
import io
import sys
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
% matplotlib inline
py.init_notebook_mode()

# sns.set(style="whitegrid")

random_state = 42

## Datensatz laden

Quelle: [https://www.kaggle.com/uciml/pima-indians-diabetes-database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
# Read the diabetes CSV into the DataFrame that the following cells work on.
df = pd.read_csv(filepath_or_buffer='../../datasets/pima-indians-diabetes.csv')

# Übersicht

In [None]:
# Preview the first five rows of the data set.
df.head(5)

### Informationen zum Datensatz

* **Pregnancies:** Number of times pregnant
* **Glucose:** Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* **BloodPressure:** Diastolic blood pressure (mm Hg)
* **SkinThickness:** Triceps skin fold thickness (mm)
* **Insulin:** 2-Hour serum insulin (mu U/ml)
* **BMI:** Body mass index (weight in kg/(height in m)^2)
* **DiabetesPedigreeFunction:** Diabetes pedigree function
* **Age:** Age (years)
* **Outcome:** Class variable (0 or 1)


## Beschreibende Statistik zum Datensatz

In [None]:
# Descriptive summary statistics of the DataFrame's columns.
df.describe()

# Visualisierung

## Verteilung der Klassen

[seaborn.countplot](https://seaborn.pydata.org/generated/seaborn.countplot.html)

In [None]:
# Bar chart of the number of rows per value of the 'Outcome' column.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='Outcome', ax=ax);

## Verteilung der Werte pro Merkmal

### Histogramm

[pandas.DataFrame.hist](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.hist.html)

In [None]:
# Histograms of the DataFrame's columns on a 12x8 inch figure; the trailing
# semicolon suppresses the textual repr of the returned axes.
df.hist(figsize=(12, 8));

### Boxplot

[pandas.DataFrame.boxplot](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.boxplot.html)

In [None]:
# Box plot of the DataFrame's columns on one common axis.
df.boxplot(figsize=(12, 8));
# Alternative views, left commented out:
#   - the same plot without the target column 'Outcome'
# df.drop('Outcome', axis=1).boxplot(figsize=(12, 8));
#   - a single feature only
# df.boxplot(column='Insulin', figsize=(12, 8));

## Beziehung zwischen den einzelnen Merkmalen

[seaborn.pairplot](https://seaborn.pydata.org/generated/seaborn.pairplot.html)

In [None]:
# Scatter-plot matrix of all column pairs, coloured by the 'Outcome' class.
sns.pairplot(data=df, hue='Outcome');

## Darstellung mittels paralleler Koordinaten

[plotly.graph_objs.Parcoords](https://plot.ly/python/parallel-coordinates-plot/)

In [None]:
# One parallel-coordinates axis per feature; the target column 'Outcome' is
# left out of the axes and used for the line colour only.
feature_columns = df.columns.drop('Outcome')
dimensions = [dict(label=name, values=df[name]) for name in feature_columns]

# Two-stop colour scale: red at the low end, green at the high end.
class_colorscale = [[0, 'rgb(228,26,28)'], [1, 'rgb(77,175,74)']]

data = [
    go.Parcoords(
        line=dict(color=df['Outcome'], colorscale=class_colorscale),
        dimensions=dimensions,
    )
]

# Same light-grey background for the plot area and the surrounding paper.
layout = go.Layout(plot_bgcolor='#E5E5E5', paper_bgcolor='#E5E5E5')

fig = dict(data=data, layout=layout)
py.iplot(fig)

## Korrelation zwischen den Merkmalen

[seaborn.heatmap](https://seaborn.pydata.org/generated/seaborn.heatmap.html)

In [None]:
# Pairwise correlation of all columns, drawn as a heat map with the
# coefficient written into every cell.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, ax=ax);

## Fehlende Werte

[seaborn.barplot](https://seaborn.pydata.org/generated/seaborn.barplot.html)

In [None]:
# Work on a copy so the loaded data in `df` stays untouched.
df_tmp = df.copy()

# Columns in which a value of 0 is treated as a missing measurement.
column_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Replace the zeros with NaN in one vectorised assignment.
# The previous per-column `df_tmp[column].replace(..., inplace=True)` modified
# an intermediate Series, which is not written back to `df_tmp` once pandas
# Copy-on-Write is active, and it used the `np.NaN` alias that NumPy 2 removed.
df_tmp[column_list] = df_tmp[column_list].replace(to_replace=0, value=np.nan)

In [None]:
# Share of missing values per column, in percent of all rows.
missing_pct = (df_tmp.isnull().sum() / len(df_tmp)) * 100

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
sns.barplot(x=df_tmp.columns, y=missing_pct, ax=ax);
ax.set(xlabel='Merkmale', ylabel='Anteil fehlender Werte in Prozent');

# Write the rounded percentage centred above every bar.
for bar in ax.patches:
    corners = bar.get_bbox().get_points()  # [[x0, y0], [x1, y1]]
    bar_center = corners[:, 0].mean()
    bar_top = corners[1, 1]
    ax.annotate('{:3.0f}%'.format(bar_top), (bar_center, bar_top), ha='center', va='bottom')

## Analyse der Merkmale mit Hilfe der PCA

In [None]:
from sklearn import decomposition, preprocessing

# Standardise the features (every column except the target 'Outcome')
# before they are passed to the PCA.
features = df.drop('Outcome', axis=1)
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(features)

In [None]:
# Fit a PCA that keeps as many components as there are feature columns.
n_features = X_scaled.shape[1]
pca_matrix = decomposition.PCA(n_components=n_features)
pca_matrix.fit(X_scaled)

# Table of the fitted components: one row per component (PC-1, PC-2, ...),
# one column per original feature.
component_labels = ['PC-{}'.format(i) for i in range(1, n_features + 1)]
feature_names = df.drop('Outcome', axis=1).columns
pd.DataFrame(pca_matrix.components_, index=component_labels, columns=feature_names)

In [None]:
# Axis labels PC-1 ... PC-n, shared by the bar and the line trace.
pc_labels = ['PC-%s' % i for i in range(1, pca_matrix.n_components_ + 1)]

# Explained variance per component in percent. Values are scaled first and
# rounded only once for display; the running total is accumulated from the
# unrounded values so that rounding errors do not add up along the curve.
explained_pct = pca_matrix.explained_variance_ratio_ * 100

# Bars: variance explained by each individual component.
trace1 = go.Bar(
    x=pc_labels,
    y=np.round(explained_pct, decimals=1),
    showlegend=False
)

# Line: cumulative explained variance.
trace2 = go.Scatter(
    x=pc_labels,
    y=np.round(np.cumsum(explained_pct), decimals=1),
    name='Kumulierte erklärte Varianz'
)

layout = go.Layout(
    title='Erklärte Varianz durch die Hauptkomponenten',
    yaxis=dict(
        title='Erklärte Varianz in Prozent'
    ),
)

fig = dict(data=[trace1, trace2], layout=layout)
py.iplot(fig)

## Analyse der Merkmale mit Hilfe der LDA

In [None]:
from sklearn import discriminant_analysis, preprocessing

# Standardise the features (every column except the target 'Outcome')
# before they are passed to the LDA.
features = df.drop('Outcome', axis=1)
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(features)

In [None]:
# LDA can produce at most min(n_features, n_classes - 1) discriminant axes.
# Requesting one component per feature (n_components=X_scaled.shape[1]) exceeds
# that limit for the 0/1 target and is rejected with a ValueError by current
# scikit-learn releases, so n_components is left at its default, which uses
# the allowed maximum.
lda_matrix = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen')
lda_matrix.fit(X_scaled, df['Outcome'])

# Coefficients of the fitted discriminant function, one column per feature.
pd.DataFrame(lda_matrix.coef_, columns=df.drop('Outcome', axis=1).columns, index=['Outcome'])