In [None]:
import sys
import os
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from IPython.display import display
%matplotlib inline

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
# Set up Plotly's offline notebook mode before any py.iplot call below.
py.init_notebook_mode(connected=True)

In [None]:
import warnings
# Install a global "ignore" filter so no warnings are shown from here on.
# NOTE(review): this also hides deprecation notices from the plotting and
# modelling libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Fixed seed for NumPy's global random generator so that steps relying on it
# give the same result on every run.
SEED = 7
np.random.seed(SEED)

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/diabetes.csv')
# Keep the column index: later cells select features from it by position.
df_name = df.columns
df_name

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the columns of the frame.
df.describe()

In [None]:
# Pairwise plots of the columns, coloured by the "Outcome" column.
g = sns.pairplot(df, hue = "Outcome", palette = 'husl')

In [None]:
def plotHist(df, nameOfFeature, binsize=.5):
    """Display an interactive Plotly histogram of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to plot.
    nameOfFeature : str
        Name of the column; it is also used in the figure title and as the
        x-axis title.
    binsize : float, optional
        Width of each histogram bin. Defaults to 0.5.

    Returns
    -------
    None
        The figure is shown through ``py.iplot``; nothing is returned.
    """
    values = df[nameOfFeature]

    # Shared look for both axis titles.
    # NOTE(review): `titlefont` is the legacy attribute spelling; confirm it
    # is still accepted by the installed Plotly version.
    axis_title_font = dict(
        family='Courier New, monospace',
        size=18,
        color='#7f7f7f'
    )

    trace = go.Histogram(
        x=values,
        autobinx=False,
        # Explicit bins that start/end one unit beyond the observed extremes.
        xbins=dict(
            start=values.min() - 1,
            end=values.max() + 1,
            size=binsize
        )
    )

    layout = go.Layout(
        title='The distribution of ' + nameOfFeature,
        xaxis=dict(
            title=nameOfFeature,
            titlefont=dict(axis_title_font)
        ),
        yaxis=dict(
            title='Number of labels',
            titlefont=dict(axis_title_font)
        )
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)

In [None]:
# Histogram of the 'Pregnancies' column.
plotHist(df,'Pregnancies')

In [None]:
from scipy.stats import skew
from scipy.stats import kurtosis
def plotBarCat(df,feature,target):
    """Overlay the histograms of ``feature`` for the two classes of ``target``
    and print summary statistics of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    feature : str
        Column whose distribution is plotted and summarised.
    target : str
        Column used to split the rows; rows where it equals 0 and rows where
        it equals 1 are drawn as two overlaid, semi-transparent histograms.

    Returns
    -------
    None
        The figure is shown through ``py.iplot`` and the statistics are
        printed.
    """

    def DescribeFloatSkewKurt(frame, column):
        """Print mean, variance, skewness and kurtosis of ``frame[column]``.

        Skewness measures the lack of symmetry of a distribution. Kurtosis
        measures whether the data are heavy-tailed or light-tailed relative
        to a normal distribution: high kurtosis suggests heavy tails or
        outliers, low kurtosis suggests light tails.
        """
        print('-*-'*25)
        print("{0} mean : ".format(column), np.mean(frame[column]))
        print("{0} var  : ".format(column), np.var(frame[column]))
        print("{0} skew : ".format(column), skew(frame[column]))
        print("{0} kurt : ".format(column), kurtosis(frame[column]))
        print('-*-'*25)

    # Feature values split by class of the target column.
    x0 = df.loc[df[target] == 0, feature]
    x1 = df.loc[df[target] == 1, feature]

    # Name the traces so the legend says which class each histogram shows.
    trace1 = go.Histogram(
        x=x0,
        name='{0} = 0'.format(target),
        opacity=0.75
    )
    trace2 = go.Histogram(
        x=x1,
        name='{0} = 1'.format(target),
        opacity=0.75
    )

    layout = go.Layout(
        barmode='overlay',
        title=feature,
        yaxis=dict(title='Count')
    )
    fig = go.Figure(data=[trace1, trace2], layout=layout)

    py.iplot(fig, filename='overlaid histogram')

    # Summarise the plotted feature (not the target, whose statistics would be
    # identical for every feature).
    DescribeFloatSkewKurt(df, feature)

In [None]:
# Per-class histogram and summary statistics for each of the first nine
# columns of the frame, in column order.
for feature in df_name[:9]:
    plotBarCat(df, feature, 'Outcome')

In [None]:
def PlotPie(df, nameOfFeature):
    """Display a Plotly pie chart of how often each value of a column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    nameOfFeature : str
        Name of the column whose value frequencies are charted.

    Returns
    -------
    None
        The chart is shown through ``py.iplot``.
    """
    # Take labels and counts from the same value_counts() result so that each
    # slice is guaranteed to carry the label of the value it counts.
    counts = df[nameOfFeature].value_counts()
    labels = [str(category) for category in counts.index]
    values = counts.tolist()

    trace = go.Pie(labels=labels, values=values)

    py.iplot([trace])

In [None]:
# Pie chart of the value frequencies of the 'Outcome' column.
PlotPie(df, 'Outcome')

In [None]:
def OutLiersBox(df, nameOfFeature):
    """Show four Plotly box plots of one column side by side.

    Every box is built from the same column and differs only in its
    ``boxpoints`` setting and colours: all points, whiskers only, suspected
    outliers, and whiskers plus outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    nameOfFeature : str
        Name of the column to plot; also used in the figure title.
    """
    # One entry per box; each dict is passed to go.Box as keyword arguments.
    box_specs = [
        dict(
            name="All Points",
            jitter=0.3,
            pointpos=-1.8,
            boxpoints='all',
            marker=dict(color='rgb(7,40,89)'),
            line=dict(color='rgb(7,40,89)'),
        ),
        dict(
            name="Only Whiskers",
            boxpoints=False,
            marker=dict(color='rgb(9,56,125)'),
            line=dict(color='rgb(9,56,125)'),
        ),
        dict(
            name="Suspected Outliers",
            boxpoints="suspectedoutliers",
            marker=dict(
                color='rgb(8,81,156)',
                outliercolor='rgba(219, 64, 82, 0.6)',
                line=dict(
                    outliercolor='rgba(219, 64, 82, 0.6)',
                    outlierwidth=2,
                ),
            ),
            line=dict(color='rgb(8,81,156)'),
        ),
        dict(
            name="Whiskers and Outliers",
            boxpoints='outliers',
            marker=dict(color='rgb(107,174,214)'),
            line=dict(color='rgb(107,174,214)'),
        ),
    ]

    boxes = [go.Box(y=df[nameOfFeature], **spec) for spec in box_specs]
    figure = go.Figure(
        data=boxes,
        layout=go.Layout(title="{} Outliers".format(nameOfFeature)),
    )
    py.iplot(figure, filename="Outliers")
    

In [None]:
# Outlier box plots for each of the first eight columns of the frame, in
# column order.
for feature in df_name[:8]:
    OutLiersBox(df, feature)

# Outlier Investigation on Feature Pairs

In [None]:
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager

In [None]:
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import  IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
def OutlierDetection(df, feature1, feature2, outliers_fraction = .1):
    """Fit four outlier detectors on a pair of columns and plot their regions.

    A One-Class SVM, a robust-covariance envelope, an Isolation Forest and a
    Local Outlier Factor model are each fitted on ``df[[feature1, feature2]]``.
    For every detector the function prints how often each predicted label
    occurs, stores the predictions in a new column of a copy of ``df`` and
    draws the decision surface with the data points in a 2x2 grid of panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, never modified.
    feature1, feature2 : str
        Names of the two columns used as x and y coordinates.
    outliers_fraction : float, optional
        Fraction of samples assumed to be outliers. It is used as the
        contamination setting of the detectors and as the percentile that
        defines the plotted decision threshold.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra column per detector, named
        ``feature1 + "_" + feature2 + <detector name>``, holding that
        detector's predictions.
    """

    def _padded_axis(values, n_points=50, pad=0.10):
        # Evenly spaced coordinates covering the data plus a margin of `pad`
        # times the data range on both sides, so that the extreme
        # observations lie inside the evaluated grid.
        lo, hi = values.min(), values.max()
        margin = pad * (hi - lo)
        return np.linspace(lo - margin, hi + margin, n_points)

    new_df = df.copy()
    rng = np.random.RandomState(42)

    n_samples = new_df.shape[0]

    classifiers = {
        "One-Class SVM" : svm.OneClassSVM(nu = 0.95*outliers_fraction+0.05,
                                         kernel = "rbf", gamma = 0.1),
        "Robust Covariance" : EllipticEnvelope(contamination = outliers_fraction),
        "Isolation Forest" : IsolationForest(max_samples = n_samples,
                                            contamination = outliers_fraction,
                                            random_state = rng),
        "Local Outlier Factor" : LocalOutlierFactor(
                                                    n_neighbors = 35,
                                                    contamination = outliers_fraction)
    }

    xx, yy = np.meshgrid(_padded_axis(new_df[feature1]),
                         _padded_axis(new_df[feature2]))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Placeholder labels: the last `n_outliers` rows are marked as outliers.
    # NOTE(review): nothing in this function establishes that the rows are
    # ordered that way, so the "errors" count in the panel titles is only a
    # comparison against this placeholder - confirm it is meaningful here.
    n_outliers = int(outliers_fraction*n_samples)
    ground_truth = np.ones(n_samples, dtype = int)
    if n_outliers > 0:
        # Guard needed: with n_outliers == 0 the slice [-0:] is the whole array.
        ground_truth[-n_outliers:] = -1

    # Seed NumPy's global generator before fitting.
    # NOTE(review): this overrides any seed set earlier in the session.
    np.random.seed(42)

    X = new_df[[feature1, feature2]].values

    plt.figure(figsize=(9,7))
    for panel, (clf_name, clf) in enumerate(classifiers.items(), start = 1):
        is_lof = clf_name == "Local Outlier Factor"

        if is_lof:
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)

        # Score below which `outliers_fraction` of the training scores fall.
        threshold = stats.scoreatpercentile(scores_pred, 100*outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        unique, counts = np.unique(y_pred, return_counts = True)
        print(clf_name, dict(zip(unique, counts)))

        new_df[feature1+"_"+feature2+clf_name] = y_pred

        if is_lof:
            # NOTE(review): `_decision_function` is a private scikit-learn
            # method; verify that it exists in the installed version.
            Z = clf._decision_function(grid_points)
        else:
            Z = clf.decision_function(grid_points)
        Z = Z.reshape(xx.shape)

        subplot = plt.subplot(2, 2, panel)
        # Shaded bands below the threshold, the threshold contour in red and
        # the region above the threshold in orange.
        subplot.contourf(xx, yy, Z, levels = np.linspace(Z.min(), threshold, 7),
                        cmap = plt.cm.Blues_r)
        subplot.contour(xx, yy, Z, levels = [threshold],
                        linewidths = 2, colors = 'red')
        subplot.contourf(xx, yy, Z, levels = [threshold, Z.max()],
                        colors = 'orange')
        subplot.scatter(new_df[feature1], new_df[feature2], c = "white",
                        s = 20, edgecolor = 'k')

        subplot.axis('tight')
        subplot.set_xlabel("%s"%(feature1))
        subplot.set_ylabel(feature2)
        subplot.set_title("%d %s (errors: %d)"%(panel, clf_name, n_errors))

    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)

    plt.show()
    return new_df
            
            

In [None]:
# Run the detectors on the (Pregnancies, BloodPressure) pair with an outlier
# fraction of 0.1; `tt` is the returned copy of df with the added prediction
# columns.
tt = OutlierDetection(df, "Pregnancies", "BloodPressure",.1)