___
# Diabetes predictions
## Modeling with Support Vector Machine - SVM
___

### Importing dependencies


In [282]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline


### Data collection and analysis of the Diabetes dataset


In [283]:
# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# point at the file on another machine; the original author's path is kept as
# the fallback so existing setups keep working.
DATA_PATH = os.environ.get('DIABETES_CSV', 'C:/Users/crist/Desktop/Python_Stuff/diabetes.csv')
diabetes_df = pd.read_csv(DATA_PATH)

In [284]:
# Preview the first five rows of the loaded frame
diabetes_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1



### * **Pregnancies**: Number of times pregnant

### * Glucose: Plasma glucose concentration over 2 hours in an oral glucose tolerance test
### * BloodPressure: Diastolic blood pressure (mm Hg)
### * SkinThickness: Triceps skin fold thickness (mm)
### * Insulin: 2-Hour serum insulin (mu U/ml)
### * BMI: Body mass index (weight in kg / (height in m)$^2$)
### * DiabetesPedigreeFunction: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
### * Age: Age (years)
### * Outcome: Class variable (0 if non-diabetic, 1 if diabetic)

In [285]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [286]:
# Count the rows in each Outcome class to see how balanced the labels are
diabetes_df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

###  0 --> Non-diabetic
###  1 --> Diabetic

In [287]:
## Separating the data and labels
# y: the 'Outcome' label column; X: every remaining column (the features)
y = diabetes_df['Outcome']
X = diabetes_df.drop(columns='Outcome')


## Data Standardization


In [288]:
# Fit the scaler on the feature matrix X.
# NOTE(review): this fit runs on all rows, before the train/test split further
# down, so test rows contribute to the scaling statistics -- consider fitting
# on the training split only.
scaler = StandardScaler().fit(X)
# Bare expression so the fitted estimator is displayed as the cell output
scaler

StandardScaler()

In [289]:
# Standardize the raw feature columns with the fitted scaler.
# The input is re-derived from diabetes_df instead of from X, because X is
# overwritten below: transforming X itself would standardize already
# standardized values if this cell were run a second time.
std_df = scaler.transform(diabetes_df.drop(columns='Outcome'))
X = std_df

## Train Test Split and training the model

In [290]:
# Hold out 20% of the rows for testing, stratified on the labels, with a fixed
# seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)


In [291]:
# Linear-kernel support vector classifier. probability=True enables
# predict_proba; the probability estimates come from an internal, randomized
# cross-validation, so random_state is fixed to make them reproducible.
model = svm.SVC(kernel='linear', probability=True, random_state=2)
model.fit(X_train, y_train)

SVC(kernel='linear', probability=True)

## Model evaluation
### Accuracy score

In [292]:
# Accuracy score on the training data

X_train_pred = model.predict(X_train)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second
X_train_df_acc = accuracy_score(y_train, X_train_pred)

In [293]:
# Report the training accuracy as a percentage with two decimals
train_accuracy_pct = X_train_df_acc * 100
print(f'Training data accuracy score: {train_accuracy_pct:.2f}%')

Training data accuracy score: 78.66%


In [294]:
# Accuracy score on the test data

X_test_pred = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second
X_test_df_acc = accuracy_score(y_test, X_test_pred)

In [295]:
# Report the test accuracy as a percentage with two decimals
test_accuracy_pct = X_test_df_acc * 100
print(f'Test data accuracy score: {test_accuracy_pct:.2f}%')

Test data accuracy score: 77.27%


## Prediction system

In [296]:
# Input data: one example row of feature values.
# NOTE(review): the eight values are assumed to follow the feature column order
# of the training data (Pregnancies ... Age) -- confirm when changing them.
input_data = (0,90,65,25,55,25.8,0.587,26)

# Change the input data to a NumPy array
input_df_asnp = np.asarray(input_data)

# Reshape to a single-row 2-D array so it is treated as one instance
input_df_reshape = input_df_asnp.reshape(1, -1)

# A scaler fitted on a DataFrame records the column names (feature_names_in_)
# and warns when it is later given a bare array. Re-attach the names when the
# scaler has them; otherwise pass the array through unchanged.
feature_names = getattr(scaler, 'feature_names_in_', None)
if feature_names is not None:
    scaler_input = pd.DataFrame(input_df_reshape, columns=feature_names)
else:
    scaler_input = input_df_reshape

# Standardize the input data with the already-fitted scaler
std_input = scaler.transform(scaler_input)

# Make the prediction (class label and per-class probabilities).
# NOTE(review): scikit-learn documents that SVC.predict_proba can disagree with
# SVC.predict; the printed label below comes from predict.
prediction = model.predict(std_input)
pred_prob = model.predict_proba(std_input)

# Check the prediction: 0 means non-diabetic, anything else means diabetic
if prediction[0] == 0:
    print('The person is not diabetic.')
else:
    print('The person is diabetic.')
print(prediction)

# Probability (in percent) of the class being 0 or 1
print(f'{pred_prob*100}')


The person is not diabetic.
[0]
[[94.19302153  5.80697847]]
