In [None]:
import sys
import os
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from IPython.display import display
%matplotlib inline

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fix the global NumPy seed once, so that later cells drawing from the global
# RNG give the same results on every Restart & Run All.
SEED = 7
np.random.seed(SEED)

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('./data/diabetes.csv')
# Keep the column Index: later cells address columns by position (df_name[i]).
df_name = df.columns
df_name

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
g = sns.pairplot(df, hue = "Outcome", palette = 'husl')

In [None]:
def plotHist(df, nameOfFeature, binsize=.5):
    """Show an interactive Plotly histogram of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to plot.
    nameOfFeature : str
        Name of the column; also used in the figure and x-axis titles.
    binsize : float, optional
        Width of every histogram bin. Defaults to 0.5, the value that used
        to be hard-coded.

    Returns
    -------
    None
        The figure is rendered inline through ``py.iplot``.
    """
    values = df[nameOfFeature]

    trace = go.Histogram(
        x=values,
        autobinx=False,
        # Explicit bins that start/end one unit outside the observed range.
        xbins=dict(
            start=values.min() - 1,
            end=values.max() + 1,
            size=binsize
        )
    )

    def _axis_title_font():
        # Fresh dict per axis so the two axes never share a mutable object.
        return dict(family='Courier New, monospace', size=18, color='#7f7f7f')

    layout = go.Layout(
        title='The distribution of ' + nameOfFeature,
        xaxis=dict(title=nameOfFeature, titlefont=_axis_title_font()),
        yaxis=dict(title='Number of labels', titlefont=_axis_title_font())
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig)

In [None]:
plotHist(df,'Pregnancies')

In [None]:
from scipy.stats import skew
from scipy.stats import kurtosis
def plotBarCat(df,feature,target):
    """Overlay the histograms of ``feature`` for the two classes of ``target``
    and print summary statistics of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    feature : str
        Column whose distribution is plotted and summarised.
    target : str
        Column used to split the rows; rows with value 0 and rows with
        value 1 are drawn as two overlaid histograms.

    Returns
    -------
    None
        The figure is rendered inline and the statistics are printed.
    """
    x0 = df[df[target]==0][feature]  # feature values where target == 0
    x1 = df[df[target]==1][feature]  # feature values where target == 1

    # Name the traces so the legend says which class each histogram shows.
    trace1 = go.Histogram(
        x=x0,
        name='{0} = 0'.format(target),
        opacity=0.75
    )
    trace2 = go.Histogram(
        x=x1,
        name='{0} = 1'.format(target),
        opacity=0.75
    )

    data = [trace1, trace2]
    layout = go.Layout(barmode='overlay',
                       title=feature,
                       yaxis=dict(title='Count'))
    fig = go.Figure(data=data, layout=layout)

    py.iplot(fig, filename='overlaid histogram')

    def DescribeFloatSkewKurt(df, column):
        """Print mean, variance, skewness and kurtosis of ``df[column]``.

        Skewness measures the lack of symmetry of a distribution around its
        centre. Kurtosis measures whether the data are heavy-tailed or
        light-tailed relative to a normal distribution: high kurtosis
        suggests heavy tails or outliers, low kurtosis suggests light tails.
        """
        print('-*-'*25)
        print("{0} mean : ".format(column), np.mean(df[column]))
        print("{0} var  : ".format(column), np.var(df[column]))
        print("{0} skew : ".format(column), skew(df[column]))
        print("{0} kurt : ".format(column), kurtosis(df[column]))
        print('-*-'*25)

    # Summarise the plotted feature. Passing `target` here printed the same
    # target statistics for every feature.
    DescribeFloatSkewKurt(df, feature)

In [None]:
plotBarCat(df,df_name[0],'Outcome')

In [None]:
plotBarCat(df,df_name[1],'Outcome')

In [None]:
plotBarCat(df,df_name[2],'Outcome')

In [None]:
plotBarCat(df,df_name[3],'Outcome')

In [None]:
plotBarCat(df,df_name[4],'Outcome')

In [None]:
plotBarCat(df,df_name[5],'Outcome')

In [None]:
plotBarCat(df,df_name[6],'Outcome')

In [None]:
plotBarCat(df,df_name[7],'Outcome')

In [None]:
plotBarCat(df,df_name[8],'Outcome')

In [None]:
def PlotPie(df, nameOfFeature):
    """Show a pie chart of how often each distinct value of a column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    nameOfFeature : str
        Column whose value frequencies are plotted.

    Returns
    -------
    None
        The chart is rendered inline through ``py.iplot``.
    """
    # Take labels and counts from the same value_counts() result so each
    # slice is paired with its own count. Mixing unique() (appearance order)
    # with value_counts()[i] (lookup by integer label) could swap the labels
    # and failed for categories that are not 0..n-1.
    counts = df[nameOfFeature].value_counts()
    labels = [str(category) for category in counts.index]
    values = counts.tolist()

    trace = go.Pie(labels=labels, values=values)

    py.iplot([trace])

In [None]:
PlotPie(df, 'Outcome')

In [None]:
def OutLiersBox(df, nameOfFeature):
    """Draw four box plots of one column side by side, each with a different
    Plotly ``boxpoints`` mode (all points, none, suspected outliers, outliers).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    nameOfFeature : str
        Column to plot; also used in the figure title.

    Returns
    -------
    None
        The figure is rendered inline through ``py.iplot``.
    """
    column = df[nameOfFeature]

    navy = 'rgb(7,40,89)'
    dark_blue = 'rgb(9,56,125)'
    mid_blue = 'rgb(8,81,156)'
    light_blue = 'rgb(107,174,214)'
    outlier_red = 'rgba(219, 64, 82, 0.6)'

    # Every raw observation drawn next to the box.
    all_points = go.Box(y=column, name="All Points",
                        jitter=0.3, pointpos=-1.8, boxpoints='all',
                        marker=dict(color=navy),
                        line=dict(color=navy))

    # Box and whiskers only, no individual points.
    whiskers_only = go.Box(y=column, name="Only Whiskers",
                           boxpoints=False,
                           marker=dict(color=dark_blue),
                           line=dict(color=dark_blue))

    # Outlier points, with suspected outliers given their own highlight colour.
    suspected = go.Box(y=column, name="Suspected Outliers",
                       boxpoints="suspectedoutliers",
                       marker=dict(color=mid_blue,
                                   outliercolor=outlier_red,
                                   line=dict(outliercolor=outlier_red,
                                             outlierwidth=2)),
                       line=dict(color=mid_blue))

    # Whiskers plus the points that fall outside them.
    whiskers_and_outliers = go.Box(y=column, name="Whiskers and Outliers",
                                   boxpoints='outliers',
                                   marker=dict(color=light_blue),
                                   line=dict(color=light_blue))

    figure = go.Figure(
        data=[all_points, whiskers_only, suspected, whiskers_and_outliers],
        layout=go.Layout(title="{} Outliers".format(nameOfFeature)),
    )
    py.iplot(figure, filename="Outliers")
    

In [None]:
OutLiersBox(df, df_name[0])

In [None]:
OutLiersBox(df, df_name[1])

In [None]:
OutLiersBox(df, df_name[2])

In [None]:
OutLiersBox(df, df_name[3])

In [None]:
OutLiersBox(df, df_name[4])

In [None]:
OutLiersBox(df, df_name[5])

In [None]:
OutLiersBox(df, df_name[6])

In [None]:
OutLiersBox(df, df_name[7])

# Outliers Investigation Pairs

In [None]:
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager

In [None]:
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import  IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
def OutlierDetection(df, feature1, feature2, outliers_fraction = .1):
    """Fit four outlier detectors on a pair of features and plot their
    decision regions in a 2x2 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    feature1, feature2 : str
        Columns used as the x and y coordinates.
    outliers_fraction : float, optional
        Expected share of outliers, passed to the detectors as
        ``contamination`` (and used to derive ``nu`` for the One-Class SVM).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra column per detector, named
        ``feature1 + "_" + feature2 + <detector name>``, holding that
        detector's prediction for every row (1 = inlier, -1 = outlier).

    Notes
    -----
    The "errors" figure in each subplot title compares the predictions with a
    placeholder ground truth in which the *last* ``n_outliers`` rows are
    labelled as outliers. It is not a measure against real labels.
    """
    new_df = df.copy()
    rng = np.random.RandomState(42)

    n_samples = new_df.shape[0]

    classifiers = {
        "One-Class SVM" : svm.OneClassSVM(nu = 0.95*outliers_fraction+0.05,
                                         kernel = "rbf", gamma = 0.1),
        "Robust Covariance" : EllipticEnvelope(contamination = outliers_fraction),
        "Isolation Forest" : IsolationForest(max_samples = n_samples,
                                            contamination = outliers_fraction,
                                            random_state = rng),
        "Local Outlier Factor" : LocalOutlierFactor(
                                                    n_neighbors = 35,
                                                    contamination = outliers_fraction)
    }

    def _padded_axis(values, n_points = 50, pad_ratio = .1):
        # Grid coordinates spanning the data range, widened by 10% of the
        # range on both sides. The previous `max - 10% * max` upper bound
        # shrank the grid and cut off the largest observations.
        lowest, highest = values.min(), values.max()
        pad = (highest - lowest) * pad_ratio
        return np.linspace(lowest - pad, highest + pad, n_points)

    xx, yy = np.meshgrid(_padded_axis(new_df[feature1]),
                         _padded_axis(new_df[feature2]))

    n_outliers = int(outliers_fraction*n_samples)
    ground_truth = np.ones(n_samples, dtype = int)
    if n_outliers > 0:
        # Guard: with n_outliers == 0 the slice [-0:] would select every row.
        ground_truth[-n_outliers:] = -1

    # Reset the global NumPy RNG, as the original cell did, so estimators
    # without their own random_state behave the same on every call.
    np.random.seed(42)

    X = new_df[[feature1, feature2]].values.tolist()
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    plt.figure(figsize=(9,7))
    for plot_idx, (clf_name, clf) in enumerate(classifiers.items(), start = 1):
        is_lof = clf_name == "Local Outlier Factor"
        if is_lof:
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)

        # Score below which `outliers_fraction` of the training points fall.
        threshold = stats.scoreatpercentile(scores_pred, 100*outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        unique, counts = np.unique(y_pred, return_counts = True)
        print(clf_name, dict(zip(unique, counts)))

        new_df[feature1+"_"+feature2+clf_name] = y_pred

        if is_lof:
            # NOTE(review): relies on a private scikit-learn method; confirm it
            # exists in the installed version.
            Z = clf._decision_function(grid_points)
        else:
            Z = clf.decision_function(grid_points)

        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(2, 2, plot_idx)
        # Blue shading: region scored below the threshold (outlier side).
        subplot.contourf(xx, yy, Z, levels = np.linspace(Z.min(), threshold, 7),
                        cmap = plt.cm.Blues_r)
        # Red line: the learned decision boundary at the threshold.
        subplot.contour(xx, yy, Z, levels = [threshold],
                        linewidths =2, colors = 'red')
        # Orange fill: region scored above the threshold (inlier side).
        subplot.contourf(xx, yy, Z, levels = [threshold, Z.max()],
                        colors = 'orange')
        plt.scatter(new_df[feature1], new_df[feature2], c = "white",
                    s = 20, edgecolor = 'k')

        subplot.axis('tight')
        subplot.set_xlabel("%s"%(feature1))

        plt.ylabel(feature2)
        plt.title("%d %s (errors: %d)"%(plot_idx, clf_name, n_errors))

    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)

    plt.show()
    return new_df
            
            

In [None]:
tt = OutlierDetection(df, "Pregnancies", "BloodPressure",.1)

In [None]:
from pandas import set_option
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier


In [None]:
# Columns 0-7 are used as the feature matrix and column 8 as the label.
X = df[df_name[0:8]]
Y = df[df_name[8]]
# Hold out 25% of the rows, stratified on 'Outcome', with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0, stratify = df['Outcome'])

# Spot Check Algorithms

In [None]:
def GetBasedModel():
    """Return the baseline classifiers as a list of ``(short_name, estimator)``
    pairs, each estimator built with its default settings (the SVM
    additionally has ``probability=True``).
    """
    return [
        ("LR", LogisticRegression()),
        ("LDA", LinearDiscriminantAnalysis()),
        ("KNN", KNeighborsClassifier()),
        ("CART", DecisionTreeClassifier()),
        ("NB", GaussianNB()),
        ("SVM", SVC(probability = True)),
        ("AB", AdaBoostClassifier()),
        ("GBM", GradientBoostingClassifier()),
        ("RF", RandomForestClassifier()),
        ("ET", ExtraTreesClassifier()),
    ]

In [None]:
def BasedLine2(X_train, y_train, models):
    """Cross-validate every model with 10-fold stratified CV and print its
    mean accuracy and standard deviation.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels passed to ``cross_val_score``.
    models : iterable of (str, estimator)
        Named estimators (or pipelines) to evaluate.

    Returns
    -------
    names : list of str
        Model names, in evaluation order.
    results : list of numpy.ndarray
        Per-fold accuracy scores for each model, aligned with ``names``.
    """
    num_folds = 10
    scoring = 'accuracy'

    # random_state only has an effect when shuffle=True; without it the seed
    # is ignored and recent scikit-learn versions reject the combination.
    # The splitter does not depend on the model, so it is built once and every
    # model is scored on the same folds.
    kfold = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = SEED)

    results = []
    names = []

    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s : %f (%f)" %(name, cv_results.mean(),cv_results.std())
        print(msg)

    return names, results

In [None]:
class PlotBoxR(object):
    """Plot one Plotly box per model from its cross-validation scores."""

    def __Trace(self, nameOfFeature, value):
        """Build a single teal box trace labelled ``nameOfFeature`` from ``value``."""
        teal_marker = dict(color = 'rgb(0,128,128)')
        return go.Box(y = value, name = nameOfFeature, marker = teal_marker)

    def PlotResult(self, names, results):
        """Render one box per entry of ``names``, using the scores at the same
        position in ``results``.
        """
        traces = [self.__Trace(names[k], results[k]) for k in range(len(names))]
        py.iplot(traces)
                    

In [None]:
models = GetBasedModel()

In [None]:
names, results = BasedLine2(X_train, y_train, models)

In [None]:
PlotBoxR().PlotResult(names, results)

In [None]:
def ScoreDataFrame(names, results):
    """Summarise cross-validation results as a two-column DataFrame.

    Parameters
    ----------
    names : list of str
        Model names.
    results : iterable
        One array of fold scores per model; each must provide ``.mean()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model`` and ``Score``, where ``Score`` is the mean fold
        score rounded to 4 decimals.
    """
    def _rounded(value, places):
        # Round by formatting to a fixed number of decimals and parsing back.
        return float(format(value, "." + str(places) + "f"))

    mean_scores = [_rounded(fold_scores.mean(), 4) for fold_scores in results]
    return pd.DataFrame({'Model': names, 'Score': mean_scores})

In [None]:
basedLineScore = ScoreDataFrame(names, results)
basedLineScore

# Feature Engineering

## Data Preprocessing - Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def GetScaledModel(nameOfScaler):
    """Build one ``scaler -> classifier`` pipeline per baseline classifier.

    Parameters
    ----------
    nameOfScaler : str
        ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
        ``MinMaxScaler``.

    Returns
    -------
    list of (str, sklearn.pipeline.Pipeline)
        Pairs whose name is ``nameOfScaler`` followed by the classifier's
        short name.

    Raises
    ------
    ValueError
        If ``nameOfScaler`` is not one of the supported names (this used to
        surface as an ``UnboundLocalError``).
    """
    scaler_classes = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    if nameOfScaler not in scaler_classes:
        raise ValueError("nameOfScaler must be 'standard' or 'minmax', got {!r}".format(nameOfScaler))
    make_scaler = scaler_classes[nameOfScaler]

    # (name suffix, pipeline step name, estimator class)
    # NOTE(review): 'GMB' and 'Rf' are kept as they were in case other cells
    # refer to these names; 'GBM'/'RF' would match GetBasedModel.
    specs = [
        ('LR', 'LR', LogisticRegression),
        ('LDA', 'LDA', LinearDiscriminantAnalysis),
        ('KNN', 'KNN', KNeighborsClassifier),
        ('CART', 'CART', DecisionTreeClassifier),
        ('NB', 'NB', GaussianNB),  # step was mislabelled 'LR'
        ('SVM', 'SVM', SVC),
        ('AB', 'AB', AdaBoostClassifier),
        ('GMB', 'GMB', GradientBoostingClassifier),
        ('RF', 'Rf', RandomForestClassifier),
        ('ET', 'ET', ExtraTreesClassifier),
    ]

    pipelines = []
    for suffix, step_name, estimator_class in specs:
        # Each pipeline gets its own scaler instance so that fitting one
        # pipeline never changes the state held by another.
        pipeline = Pipeline([('Scaler', make_scaler()), (step_name, estimator_class())])
        pipelines.append((nameOfScaler + suffix, pipeline))

    return pipelines

## StandardScaler - Model Scores

In [None]:
models = GetScaledModel('standard')

In [None]:
names, results = BasedLine2(X_train, y_train, models)

In [None]:
PlotBoxR().PlotResult(names, results)

In [None]:
scaledScoreStandard = ScoreDataFrame(names, results)

#### Comparing BasedModels Score with StandardScaled Models

In [None]:
compareModels = pd.concat([basedLineScore, scaledScoreStandard], axis = 1)
compareModels

## MinMaxScaler - Model Score

In [None]:
models = GetScaledModel('minmax')

In [None]:
names, results = BasedLine2(X_train, y_train, models)

In [None]:
PlotBoxR().PlotResult(names, results)

In [None]:
scaledScoreMinMax = ScoreDataFrame(names, results)

#### Comparing BasedModels Score with MinMaxScaled Models

In [None]:
compareModels = pd.concat([basedLineScore, scaledScoreMinMax], axis = 1)
compareModels

## Comparison of all the above models i.e., basedLine, StandardScaler, MinMaxScaler

In [None]:
compareModels = pd.concat([basedLineScore, scaledScoreStandard, scaledScoreMinMax], axis = 1)

In [None]:
compareModels

# Remove Outliers/Anomalies

In [None]:
df_t = df.copy()

In [None]:
df_t_name = df_t.columns

In [None]:
def TurkyOutliers(df_out, nameOfFeature, drop = False):
    """Report, and optionally remove, the rows whose value in one column lies
    outside Tukey's fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Frame to inspect; it is never modified in place.
    nameOfFeature : str
        Column on which the fences are computed.
    drop : bool, optional
        If True, return a new frame without the outlier rows and with a fresh
        0..n-1 index. If False (default), return ``df_out`` itself.

    Returns
    -------
    pandas.DataFrame
        The filtered copy when ``drop`` is True, otherwise ``df_out``.
    """
    valueOfFeature = df_out[nameOfFeature]

    Q1 = np.percentile(valueOfFeature, 25.)
    Q3 = np.percentile(valueOfFeature, 75.)

    # 1.5 times the inter-quartile range on either side of the quartiles.
    step = (Q3-Q1)*1.5

    # Compute the mask once and reuse it for both the report and the drop.
    is_inlier = (valueOfFeature >= Q1-step) & (valueOfFeature <= Q3+step)
    feature_outliers = valueOfFeature[~is_inlier].values

    print("Number of outliers (inc duplicates): {} and outliers: {}".format(len(feature_outliers), feature_outliers))

    if drop:
        # Filter with the boolean mask. The old code fed index *labels* into
        # df_out.index[...] as if they were positions, which is only correct
        # for a default 0..n-1 index.
        good_data = df_out.loc[is_inlier].reset_index(drop = True)
        print("New dataset with removed outliers has {} samples with {} features each ".format(*good_data.shape))
        return good_data
    else:
        print("Nothing happens, df.shape = ",df_out.shape)
        return df_out

## Outliers Detected vs After Cleaning Outliers

### Feature 0

In [None]:
feature_number = 0
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_t, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 1

In [None]:
feature_number = 1
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 2

In [None]:
feature_number = 2
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 3

In [None]:
feature_number = 3
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 4

In [None]:
feature_number = 4
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 5

In [None]:
feature_number = 5
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 6

In [None]:
feature_number = 6
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 7

In [None]:
feature_number = 7
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

### Feature 8

In [None]:
feature_number = 8
OutLiersBox(df, df_name[feature_number])

In [None]:
df_clean = TurkyOutliers(df_clean, df_name[feature_number], True)
OutLiersBox(df_clean, df_name[feature_number])

## Cleaning Report

In [None]:
# Row counts before and after the cumulative outlier removal, and the share of rows lost.
print('df shape: {}, new df shape: {}, we lost {} rows, {}% of our data'.format(df.shape[0],df_clean.shape[0],
                                                              df.shape[0]-df_clean.shape[0],
                                                        (df.shape[0]-df_clean.shape[0])/df.shape[0]*100))

## Outlier Detection Plots

In [None]:
tt = OutlierDetection(df, 'Pregnancies', 'BloodPressure', .1)

## Outlier Detection Plots after data cleaning

In [None]:
tt_t = OutlierDetection(df_clean, 'Pregnancies', 'BloodPressure', .1)

## Comparing accuracy of models after cleaning

In [None]:
df_clean_name = df_clean.columns

In [None]:
X_c = df_clean[df_clean_name[0:8]]

In [None]:
Y_c = df_clean[df_clean_name[8]]

In [None]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, Y_c, test_size = 0.25,
                                                            random_state = 0, stratify = df_clean['Outcome'])

In [None]:
models = GetScaledModel('minmax')

In [None]:
names, results = BasedLine2(X_train_c, y_train_c, models)

In [None]:
PlotBoxR().PlotResult(names, results)

In [None]:
scaledScoreMinMax_c = ScoreDataFrame(names, results)
compareModels = pd.concat([basedLineScore, scaledScoreStandard, 
                           scaledScoreMinMax,scaledScoreMinMax_c], axis = 1)

In [None]:
compareModels

## Correlation

### Highly correlated features carry redundant information and can contribute to over-fitting

In [None]:
def HeatMap(df, x = True):
    """Plot the pairwise correlation matrix of ``df`` as an annotated heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``.corr()`` matrix is drawn.
    x : bool, optional
        Whether to write the correlation value inside each cell (passed to
        seaborn as ``annot``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    correlations = df.corr()

    cmap = sns.diverging_palette(220, 10, as_cmap = True)
    fig, ax = plt.subplots(figsize = (10, 10))
    # Draw on the Axes created above and keep `fig` bound to the Figure.
    sns.heatmap(correlations, cmap = cmap, vmax = 1.0, center = 0,
                fmt = '.2f', square = True, linewidths = .5, annot = x,
                cbar_kws = {"shrink" : .75}, ax = ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 10)
    # The y axis must reuse its own labels; it previously took the x labels.
    ax.set_yticklabels(ax.get_yticklabels(), rotation = 0, fontsize = 10)
    plt.tight_layout()
    plt.show()
    
HeatMap(df, x = True)

## Visualizing Feature Importance

In [None]:
#Using EXTRA TREE CLASSIFIER

clf = ExtraTreesClassifier(n_estimators = 250, random_state = SEED)

In [None]:
clf.fit(X_train_c, y_train_c)

In [None]:
feature_importance = clf.feature_importances_

In [None]:
feature_importance = 100.0 * (feature_importance/feature_importance.max())

In [None]:
sorted_idx = np.argsort(feature_importance)

In [None]:
pos = np.arange(sorted_idx.shape[0]) + .5

In [None]:
# Horizontal bar chart of the rescaled importances, ordered by sorted_idx (np.argsort, ascending).
# NOTE(review): subplot(1,2,2) selects the right-hand cell of a 1x2 grid and nothing
# is drawn in the left-hand cell - confirm that layout is intended.
plt.subplot(1,2,2)
plt.barh(pos, feature_importance[sorted_idx], align = 'center')
# NOTE(review): tick labels come from df.columns while clf was fit on X_train_c;
# they line up only if both share the same leading column order - confirm.
plt.yticks(pos, df.columns[sorted_idx])
plt.xlabel("Relative Importance")
plt.ylabel("Variable Importance")
plt.show()

## Calculating Accuracy with the most important features (Top 4)

In [None]:
df_feature_imp = df_clean[['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'Outcome']]

In [None]:
df_feature_imp_name = df_feature_imp.columns

In [None]:
X = df_feature_imp[df_feature_imp_name[0 : df_feature_imp.shape[1]-1]]
Y = df_feature_imp[df_feature_imp_name[df_feature_imp.shape[1]-1]]

In [None]:
X_train_im, X_test_im, y_train_im, y_test_im = train_test_split(X, Y, test_size = 0.1,
                                                                random_state = 0, stratify = df_feature_imp['Outcome'])

In [None]:
models = GetScaledModel('minmax')

In [None]:
names, results = BasedLine2(X_train_im, y_train_im, models)

In [None]:
PlotBoxR().PlotResult(names, results)

In [None]:
scaledScoreMinMax_im = ScoreDataFrame(names, results)

In [None]:
compareModels = pd.concat([basedLineScore, scaledScoreStandard, scaledScoreMinMax, scaledScoreMinMax_c,
                          scaledScoreMinMax_im], axis = 1)

In [None]:
compareModels

## We still could improve predictions...

## Algorithm Tuning

In [None]:
df_unscaled = df_clean[['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'Outcome']]
df_imp_scaled_name = df_unscaled.columns

In [None]:
# NOTE(review): the scaler is fit on every row of df_unscaled, which includes the
# 'Outcome' column, before the train/test split in a later cell. Test rows therefore
# influence the scaling; consider fitting on the training features only.
df_imp_scaled = MinMaxScaler().fit_transform(df_unscaled)

In [None]:
X = df_imp_scaled[:, 0:4]
Y = df_imp_scaled[:,4]

In [None]:
X_train_sc, X_test_sc, y_train_sc, y_test_sc = train_test_split(X, Y, test_size = 0.1,
                                                               random_state = 0, stratify = df_imp_scaled[:,4])

### RandomSearch & Grid Search Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

### RandomSearch Class

In [None]:
class RandomSearch(object):
    """Randomised hyper-parameter search (100 candidates, 10-fold CV) for one
    model on a fixed training set.
    """

    def __init__(self, X_train, y_train, model,hyperparameters):
        """Store the training data, the estimator and its parameter distributions."""
        self.X_train, self.y_train = X_train, y_train
        self.model = model
        self.hyperparameters = hyperparameters

    def RandomSearch(self):
        """Run the search, print the best score and parameters, and return
        ``(fitted_search, best_params)``.
        """
        search = RandomizedSearchCV(
            self.model,
            self.hyperparameters,
            random_state = 1,
            n_iter = 100,
            cv = 10,
            verbose = 0,
            n_jobs = -1,
        )
        fitted = search.fit(self.X_train, self.y_train)
        print("Best : %f using %s" % (fitted.best_score_, fitted.best_params_))

        return fitted, fitted.best_params_

    def BestModelPredict(self, X_test):
        """Run a new search and predict ``X_test`` with the fitted search object."""
        fitted = self.RandomSearch()[0]
        return fitted.predict(X_test)

### GridSearch Class

In [None]:
class GridSearch(object):
    """Exhaustive hyper-parameter grid search (10-fold CV) for one model on a
    fixed training set.
    """

    def __init__(self, X_train, y_train, model, hyperparameters):
        """Store the training data, the estimator and its parameter grid."""
        self.X_train, self.y_train = X_train, y_train
        self.model = model
        self.hyperparameters = hyperparameters

    def GridSearch(self):
        """Run the search, print the best score and parameters, and return
        ``(fitted_search, best_params)``.
        """
        search = GridSearchCV(
            self.model,
            self.hyperparameters,
            cv = 10,
            verbose = 0,
            n_jobs = -1,
        )
        fitted = search.fit(self.X_train, self.y_train)
        print("Best : %f using %s" % (fitted.best_score_, fitted.best_params_))

        return fitted, fitted.best_params_

    def BestModelPredict(self, X_test):
        """Run a new search and predict ``X_test`` with the fitted search object."""
        fitted = self.GridSearch()[0]
        return fitted.predict(X_test)

## Logistic Regression

In [None]:
# The search space below includes penalty='l1', which only some solvers accept.
# Pin 'liblinear' (supports both l1 and l2, and was the library default in older
# scikit-learn releases) so every sampled candidate can be fitted.
model = LogisticRegression(solver = 'liblinear')

In [None]:
penalty = ['l1', 'l2']

In [None]:
C = uniform(loc = 0, scale = 4)

In [None]:
hyperparameters = dict(C=C, penalty = penalty)

In [None]:
LR_RandSearch = RandomSearch(X_train_sc, y_train_sc, model, hyperparameters)

In [None]:
Prediction_LR = LR_RandSearch.BestModelPredict(X_test_sc)

In [None]:
def floatingDecimals(f_val, dec = 3):
    """Return ``f_val`` rounded to ``dec`` decimal places as a float.

    The rounding is done by formatting the value with a fixed number of
    decimals and parsing the resulting text back into a float.
    """
    fixed_point_spec = "." + str(dec) + "f"
    return float(format(f_val, fixed_point_spec))

In [None]:
print("Prediction on test set is :", floatingDecimals((y_test_sc == Prediction_LR).mean(), 7))