In [1]:
#related libraries

In [2]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
#import and split the data

In [4]:
# Read the raw CSV; `diabetes` stays untouched, `df` is a working copy with
# every row that contains a missing value removed.
diabetes = pd.read_csv("../input/diabetes/diabetes.csv")
df = diabetes.copy().dropna()

# "Outcome" is the label column; all remaining columns are used as features.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [5]:
#set and fit the model

In [6]:
# Baseline: linear-kernel SVC with default settings, fitted on the training split.
svc_model = (
    SVC(kernel="linear")
    .fit(X_train, y_train)
)

In [7]:
svc_model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
#prediction

In [9]:
# Predict labels for the held-out test split, then report accuracy
# (fraction of correct predictions; the closer to 1, the better).
y_pred = svc_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7445887445887446

In [10]:
#model tuning

In [11]:
# We will tune C, the regularization hyperparameter of this model.

In [12]:
# Candidate values for C: the integers 1 through 9.
svc_params = dict(C=np.arange(1, 10))

# Exhaustive search over the candidates with 10-fold cross-validation on the
# training split only; n_jobs=-1 runs the fits in parallel.
# NOTE(review): the recorded run needed ~10 minutes for 90 fits, and the
# features are passed in unscaled -- consider standardising them first.
svc_cv_model = GridSearchCV(
    estimator=svc_model,
    param_grid=svc_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
svc_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 10.4min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [13]:
# Show the hyperparameter combination that scored best in cross-validation.
best_params = svc_cv_model.best_params_
print("SVC model best params:", best_params)

SVC model best params: {'C': 5}


In [14]:
#final model with best params

In [15]:
# Build the final model from the hyperparameters the grid search selected,
# rather than copying the printed value by hand: a hard-coded C silently goes
# stale whenever the data, the parameter grid or the CV setup changes.
svc_final_model = SVC(kernel="linear", **svc_cv_model.best_params_).fit(X_train, y_train)

In [16]:
# Evaluate the tuned model on the same held-out test split as the baseline.
y_pred = svc_final_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


0.7445887445887446

In [17]:
# Accuracy obtained so far:
#   0.774 with Logistic Regression
#   0.775 with Naive Bayes
#   0.731 with KNN
#   0.744 with Linear SVC (this notebook)

Thanks to https://github.com/mvahit/DSMLBC