In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

import pickle
def plot_decision_regions(X, y, classifier,test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'v', '^', 'o')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plot all samples
    X_test, y_test = X[test_idx, :], y[test_idx]
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)
    # highlight test samples
    if test_idx:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0], X_test[:, 1], c='',
                    alpha=1.0, linewidth=1, marker='v',
                    s=55, label='test set')

In [3]:
import pandas as pd
diabetes = pd.read_csv("Diabities2.csv")
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [4]:
diabetes.columns

Index(['Pregnancies', 'Glucose', 'blood pressure', 'skin thickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
ppn = SGDClassifier(loss='perceptron')
lr = SGDClassifier(loss='log')
svm = SGDClassifier(loss='hinge')

In [6]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null float64
blood pressure              768 non-null float64
skin thickness              768 non-null float64
Insulin                     768 non-null float64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(6), int64(3)
memory usage: 54.1 KB


In [7]:
X = diabetes[['Glucose','BMI','Age','DiabetesPedigreeFunction','Insulin', 'blood pressure']]

In [8]:
X

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction,Insulin,blood pressure
0,148.0,33.6,50,0.627,79.799479,72.0
1,85.0,26.6,31,0.351,79.799479,66.0
2,183.0,23.3,32,0.672,79.799479,64.0
3,89.0,28.1,21,0.167,94.000000,66.0
4,137.0,43.1,33,2.288,168.000000,40.0
...,...,...,...,...,...,...
763,101.0,32.9,63,0.171,180.000000,76.0
764,122.0,36.8,27,0.340,79.799479,70.0
765,121.0,26.2,30,0.245,112.000000,72.0
766,126.0,30.1,47,0.349,79.799479,60.0


In [9]:
y = diabetes['Outcome']

In [10]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [12]:
import numpy as np
X_train = np.array(X_train)

In [13]:
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [14]:
from sklearn import svm
machine1 = svm.SVC(kernel = 'linear',probability=True)
machine1.fit(X_train,y_train)
y_pred = machine1.predict(X_test)

In [15]:
# from sklearn.metrics import accuracy_score
# accuracy_score(y_test, y_pred)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test, y_pred))
cm

Accuracy :  0.8246753246753247


array([[94,  9],
       [18, 33]], dtype=int64)

In [16]:

diabetes_predict = "flask_model_latest.pkl"
pickle.dump(machine1,open(diabetes_predict,'wb'))
loaded_model = pickle.load(open("flask_model_latest.pkl", "rb"))
loaded_model.predict(X_test)
loaded_model.score(X_test,y_test)

0.8246753246753247

In [17]:
param_grid = {'C': [0.1, 1,10,20,30, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['rbf']} 

rs = RandomizedSearchCV(machine1, param_distributions= param_grid, cv=10,return_train_score=False, n_iter=25)

rs.fit(X_train, y_train)
print("tuned hyperparameters :(best parameters) ",rs.best_params_)
print("accuracy :",rs.best_score_*100)

# make predictions on test data 
grid_predictions =rs.predict(X_test) 
test_accuracy=accuracy_score(y_test,grid_predictions)*100
print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

tuned hyperparameters :(best parameters)  {'kernel': 'rbf', 'gamma': 0.0001, 'C': 1}
accuracy : 74.2671009771987
Accuracy for our testing dataset with tuning is : 78.57%




In [18]:
model = machine1
model.fit(X_train, y_train)
 
# print prediction results
predictions = model.predict(X_test)
test_accuracy=accuracy_score(y_test,predictions)*100
print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )
print(classification_report(y_test, predictions))

Accuracy for our testing dataset with tuning is : 82.47%
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       103
           1       0.79      0.65      0.71        51

    accuracy                           0.82       154
   macro avg       0.81      0.78      0.79       154
weighted avg       0.82      0.82      0.82       154



In [19]:

from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 20,100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(machine1, param_grid, refit = True, verbose =2)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   0.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=0.1, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ...........

[CV] ....................... C=100, gamma=1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] ....................... C=100, gamma=1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] ....................... C=100, gamma=1, kernel=rbf, total=   0.2s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    9.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=True, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 20, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [20]:

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


In [21]:
grid_predictions = grid.predict(X_test)
 
# print classification report
print(classification_report(y_test, grid_predictions))

test_accuracy=accuracy_score(y_test,grid_predictions)*100
print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       103
           1       0.79      0.53      0.64        51

    accuracy                           0.80       154
   macro avg       0.80      0.73      0.75       154
weighted avg       0.80      0.80      0.79       154

Accuracy for our testing dataset with tuning is : 79.87%


In [22]:
#####Random forest
X = diabetes[['Glucose','BMI','Age','DiabetesPedigreeFunction','Insulin', 'blood pressure']]
# 'Glucose','BMI','DiabetesPedigreeFunction','Age', 'Pregnancies','Insulin'

y = diabetes['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
# Build an model (Random forest classifier)
from sklearn.ensemble import RandomForestClassifier

clf= RandomForestClassifier()
clf.fit(X_train,y_train);
## Evaluating the model
# = clf.score(X_test,y_test)
score = clf.score(X_test, y_test)
print(score)


0.7402597402597403




In [25]:
########## build a model (random forest)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)
rf_Model = RandomForestClassifier(n_jobs=-1, n_estimators=500)
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train, y_train)
rf_Grid.best_params_
print (f'Train Accuracy - : {rf_Grid.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_Grid.score(X_test,y_test):.2f}')


{'n_estimators': [10, 13, 17, 21, 24, 28, 32, 35, 39, 43, 46, 50, 54, 57, 61, 65, 68, 72, 76, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}
Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   35.4s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   44.6s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 3273 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 4893 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 5824 tasks      | elapsed:  7.7min
[Parallel(n_jobs=4)]: Done 6400 out of 6400 | elapsed:  8.5min finished


Train Accuracy - : 0.827
Test Accuracy - : 0.75


In [26]:
from sklearn.model_selection import RandomizedSearchCV
rf_RandomGrid = RandomizedSearchCV(estimator = rf_Model, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = -1)
rf_RandomGrid.fit(X_train, y_train)

rf_Grid.best_params_
rf_RandomGrid.best_params_
print (f'Train Accuracy - : {rf_RandomGrid.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_RandomGrid.score(X_test,y_test):.3f}')

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   32.9s finished


Train Accuracy - : 0.829
Test Accuracy - : 0.760


In [27]:
import sklearn
print(sklearn.__version__)

0.21.3


In [28]:
## Enter the new data
X_test.head(1)

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction,Insulin,blood pressure
97,71.0,20.4,22,0.323,76.0,48.0


In [29]:
Pregnancies = input()
Glucose = input()
BloodPressure = input()
SkinThickness = input()
Insulin = input()
BMI = input()
DiabetesPedigreeFunction = input()
Age = input()

78
45
45
45
45
45
45
45


In [30]:
row_df = pd.DataFrame([pd.Series([Glucose,BloodPressure,Insulin,BMI ,DiabetesPedigreeFunction ,Age])])

In [31]:
row_df

Unnamed: 0,0,1,2,3,4,5
0,45,45,45,45,45,45


In [32]:
prob = loaded_model.predict_proba(row_df)[0][1]
print(f"The probability of you having Diabetes is {prob}")

The probability of you having Diabetes is 0.9999999999999699


In [33]:
loaded_model.predict(row_df)[0]

1