In [76]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [4]:
db_dataset = pd.read_csv('diabetes.csv')
print(db_dataset.columns.to_list())

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [12]:
print(db_dataset.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [18]:
print(db_dataset.duplicated().sum())

0


In [24]:
x = db_dataset.drop(columns = 'Outcome', axis = 1)
y = db_dataset['Outcome']

print(x, y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [30]:
scaler = StandardScaler()
scaler.fit(x)
standardize_data = scaler.transform(x)
print(standardize_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [32]:
x = standardize_data
y = db_dataset['Outcome']

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2)
print(x.shape, x_train.shape, x_test.shape)
print(y.shape, y_train.shape, y_test.shape)

(768, 8) (614, 8) (154, 8)
(768,) (614,) (154,)


In [48]:
model = SVC(kernel = 'linear')
model.fit(x_train, y_train)

In [58]:
train_prediction = model.predict(x_train)
train_accuracy = accuracy_score(y_train, train_prediction)
train_mse = mean_squared_error(y_train, train_prediction)

print(f"The train model accuracy is {train_accuracy}, mse {train_mse}")

The train model accuracy is 0.7866449511400652, mse 0.21335504885993486


In [62]:
test_prediction = model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_prediction)
test_mse = mean_squared_error(y_test, test_prediction)

print(f"The test model accuracy is {test_accuracy}, mse {test_mse}")

The test model accuracy is 0.7727272727272727, mse 0.22727272727272727


In [72]:
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)
input_data_df = pd.DataFrame([input_data], columns=db_dataset.columns[:-1])
std_data = scaler.transform(input_data_df)
print(std_data)

[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]


In [78]:
prediction = model.predict(std_data)
if prediction[0] == 0:
    print("The person doesn't have Diabetes")
else:
    print("The peron have Diabetes")

The peron have Diabetes
