### Project Report: Diabetes Prediction with SVM

In [285]:
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [286]:
# Read the diabetes CSV into a pandas DataFrame.
# The location is kept in one named constant so it is easy to repoint.
DATASET_PATH = '/kaggle/input/diabetes-prediction/Diabetes_prediction.csv'
diabetes_dataset = pd.read_csv(DATASET_PATH)

In [287]:
# Preview the first five rows of the dataset (head() defaults to 5 rows)
diabetes_dataset.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diagnosis
0,2,115.863387,56.410731,24.336736,94.385783,26.45594,0.272682,20.100494,0
1,2,92.490122,70.61552,23.443591,138.652426,23.910167,0.66516,44.912281,0
2,1,88.141469,63.262618,23.404364,149.358082,21.94825,0.676022,48.247873,1
3,2,108.453101,67.793632,20.75158,108.751638,24.209304,0.289636,42.749868,0
4,1,127.849443,94.725685,22.603078,25.269987,32.997477,0.601315,32.797789,0


In [288]:
# Dataset dimensions as a (rows, columns) tuple
diabetes_dataset.shape

(1000, 9)

In [289]:
# Summarize each column: name, non-null count and dtype, plus overall memory usage
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               1000 non-null   int64  
 1   Glucose                   1000 non-null   float64
 2   BloodPressure             1000 non-null   float64
 3   SkinThickness             1000 non-null   float64
 4   Insulin                   1000 non-null   float64
 5   BMI                       1000 non-null   float64
 6   DiabetesPedigreeFunction  1000 non-null   float64
 7   Age                       1000 non-null   float64
 8   Diagnosis                 1000 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 70.4 KB


In [290]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): the recorded output shows a negative minimum for both Insulin and Age,
# which is not physically meaningful -- confirm whether those rows need cleaning.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diagnosis
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.771,99.440607,72.179837,23.278316,84.582679,25.4336,0.449383,43.281798,0.306
std,1.354398,19.47073,13.882017,1.173807,74.872733,3.690223,0.199334,14.465398,0.46106
min,0.0,30.571402,31.401487,19.369987,-165.310033,13.548818,0.100037,-0.979804,0.0
25%,1.0,86.145927,62.795447,22.501591,35.076535,23.022715,0.283376,33.518451,0.0
50%,2.0,99.458362,71.909588,23.275225,84.442232,25.455649,0.448219,43.634273,0.0
75%,3.0,113.264556,82.08266,24.052022,134.267842,27.972184,0.619158,53.098446,1.0
max,8.0,161.238939,110.723715,26.917654,317.701852,36.324598,0.799654,90.573782,1.0


In [291]:
# Count the rows in each class of the binary 'Diagnosis' label:
# 1 = the individual has diabetes, 0 = the individual does not.
diabetes_dataset['Diagnosis'].value_counts()

Diagnosis
0    694
1    306
Name: count, dtype: int64

In [292]:
# Mean of every feature, computed separately for each Diagnosis class.
# NOTE(review): in the recorded output the two class means are almost identical for
# every feature, so the features may carry little signal for a linear classifier.
diabetes_dataset.groupby('Diagnosis').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.772334,99.504846,72.084485,23.278327,82.823035,25.515244,0.441372,43.711192
1,1.767974,99.294917,72.396092,23.278291,88.573505,25.248433,0.46755,42.307942


In [293]:
# Separate the features from the label.
# Y is the 'Diagnosis' column; X is every remaining column.
Y = diabetes_dataset['Diagnosis']
X = diabetes_dataset.drop(columns=['Diagnosis'])

In [294]:
# Show the feature frame (all columns except 'Diagnosis')
print (X )


     Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
0              2  115.863387      56.410731      24.336736   94.385783   
1              2   92.490122      70.615520      23.443591  138.652426   
2              1   88.141469      63.262618      23.404364  149.358082   
3              2  108.453101      67.793632      20.751580  108.751638   
4              1  127.849443      94.725685      22.603078   25.269987   
..           ...         ...            ...            ...         ...   
995            1  103.496355      41.059401      24.930299   43.675978   
996            1   60.714150      64.082842      24.689064  111.562745   
997            0   97.597727      64.122312      22.140339  108.138552   
998            0   66.775137      55.908797      24.885274  219.521484   
999            0   87.647268      69.111593      25.621266  134.335745   

           BMI  DiabetesPedigreeFunction        Age  
0    26.455940                  0.272682  20.100494  
1  

In [295]:
# Show the label series ('Diagnosis')
print(Y)

0      0
1      0
2      1
3      0
4      0
      ..
995    0
996    1
997    1
998    0
999    1
Name: Diagnosis, Length: 1000, dtype: int64


**Data Standardization**

In [296]:
# Create a scaler that standardizes each feature to mean=0 and standard deviation=1
scaler = StandardScaler()

In [297]:
# Learn the mean and standard deviation of each feature in X.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split
# further down, so test-set statistics influence the preprocessing. Consider fitting
# on the training split only and reusing the fitted scaler on the test split.
scaler.fit(X)

In [298]:
# Apply the learned per-feature mean and std to X, giving standardized features
standardized_data = scaler.transform(X)

In [299]:
# Inspect the standardized feature values
print (standardized_data)

[[ 0.16916344  0.84388194 -1.13650602 ...  0.27717891 -0.88689813
  -1.60333665]
 [ 0.16916344 -0.35714958 -0.11274294 ... -0.41303606  1.08302816
   0.11277251]
 [-0.56954153 -0.58060442 -0.64267896 ... -0.94495457  1.13754671
   0.34347898]
 ...
 [-1.30824649 -0.09469611 -0.58071939 ... -0.73217763  1.56639535
   1.08367856]
 [-1.30824649 -1.67851002 -1.17268119 ...  1.73127265 -0.33380688
   0.25225371]
 [-1.30824649 -0.60599888 -0.2211335  ...  1.53594517  0.78523145
  -0.10346468]]


In [300]:
# From this cell onward X refers to the scaled values, no longer the original DataFrame.
X = standardized_data #Store the scaled feature values in X
Y = diabetes_dataset ['Diagnosis'] # target/label we want to predict (same column selected earlier)

In [301]:
# Show the scaled features followed by the labels
print (X)
print(Y)

[[ 0.16916344  0.84388194 -1.13650602 ...  0.27717891 -0.88689813
  -1.60333665]
 [ 0.16916344 -0.35714958 -0.11274294 ... -0.41303606  1.08302816
   0.11277251]
 [-0.56954153 -0.58060442 -0.64267896 ... -0.94495457  1.13754671
   0.34347898]
 ...
 [-1.30824649 -0.09469611 -0.58071939 ... -0.73217763  1.56639535
   1.08367856]
 [-1.30824649 -1.67851002 -1.17268119 ...  1.73127265 -0.33380688
   0.25225371]
 [-1.30824649 -0.60599888 -0.2211335  ...  1.53594517  0.78523145
  -0.10346468]]
0      0
1      0
2      1
3      0
4      0
      ..
995    0
996    1
997    1
998    0
999    1
Name: Diagnosis, Length: 1000, dtype: int64


**Train Test Split**

In [302]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [303]:
# Compare the shapes of the full feature set, the training split and the test split
print (X.shape , X_train.shape , X_test.shape)

(1000, 8) (800, 8) (200, 8)


**Training the Model**

In [304]:
# Create an SVM classifier with a linear kernel, i.e. one that separates the
# classes with a linear decision boundary (a hyperplane in feature space)
classifier = svm.SVC(kernel = 'linear')

**Training the SVM Classifier**

In [305]:
# Train the SVM on the training split (X_train = features, Y_train = labels)
classifier.fit(X_train ,Y_train)

**Model Evaluation**

In [306]:
# Accuracy on the training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the true labels
# go first. Plain accuracy gives the same number either way, but the documented
# order keeps this call consistent with metrics where the order does matter.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [307]:
# Report the accuracy measured on the training split
print('Accuracy score of the training  data : ',training_data_accuracy)

Accuracy score of the training  data :  0.69375


In [308]:
# Accuracy on the held-out test data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the true labels
# go first. Plain accuracy gives the same number either way, but the documented
# order keeps this call consistent with metrics where the order does matter.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [309]:
# Report the accuracy measured on the held-out test split
# (the label previously said "training data" although the value is the test accuracy)
print('Accuracy score of the test data : ',test_data_accuracy)

Accuracy score of the training  data :  0.695


**Making a Predictive System**

In [310]:
# Raw (unscaled) measurements for one individual, in the dataset's feature order
input_data = (
    3,            # Pregnancies
    95.665270,    # Glucose
    54.157100,    # BloodPressure
    23.927648,    # SkinThickness
    130.989859,   # Insulin
    29.235840,    # BMI
    0.461786,     # DiabetesPedigreeFunction
    18.669086,    # Age
)

In [311]:
# Convert the input tuple to a NumPy array so it can be reshaped for the model
input_data_as_numpy_array =np.asarray(input_data)

In [312]:
# Reshape to a single row (1, n_features) because we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

In [313]:
# Standardize the single input row with the scaler fitted earlier.
# The scaler was fitted on a DataFrame, so the row is labelled with the same
# column names before transforming; passing a bare array would make
# scikit-learn warn that X does not have valid feature names.
# getattr keeps this working if the scaler happened to be fitted without names.
feature_names = getattr(scaler, 'feature_names_in_', None)
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)
std_data = scaler.transform(input_frame)
print(std_data)

# Predict the class for the standardized row (0 = not diabetic, 1 = diabetic)
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')
    

[[ 0.9078684  -0.19399511 -1.29892898  0.55346145  0.6201243   1.03087062
   0.06225409 -1.70234012]]
[0]
The person is not diabetic




### Handling Class Imbalance with `class_weight='balanced'`

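In our dataset, about 69% of the individuals are non-diabetic and 31% are diabetic.  
This imbalance made the original SVM mostly predict "non-diabetic". Its test accuracy (69.5%) is exactly the share of non-diabetic cases in the test set (139 of 200), so it is no better than always guessing the majority class, and it **missed many diabetics**.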

To fix this, I trained another SVM with the parameter `class_weight='balanced'`.  
This tells the model:  
- "Pay more attention to the minority class (diabetic)"  
- "Don’t just chase accuracy by always guessing non-diabetic"  

When we compare results:  
- **Accuracy went down** (from about 69% to 53%), but that’s not a bad thing here.  
- **Recall for diabetics (class 1) went up to 61%** → the model now catches more of the actual diabetics.  
- Precision for diabetics is low (only 34%), meaning about two out of three "diabetic" predictions are false alarms.


In [314]:

# Linear SVM with class_weight='balanced', fitted on the training split in one
# chained call (fit returns the estimator itself).
# Note: this rebinds `classifier`, replacing the unweighted model trained earlier.
classifier = svm.SVC(class_weight='balanced', kernel='linear').fit(X_train, Y_train)

# Predicted labels for the held-out test split
y_pred = classifier.predict(X_test)


In [315]:
# Per-class precision, recall and F1 of the class-weighted SVM on the test split
print(classification_report(Y_test, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.49      0.59       139
           1       0.34      0.61      0.44        61

    accuracy                           0.53       200
   macro avg       0.54      0.55      0.51       200
weighted avg       0.62      0.53      0.54       200



In [316]:
# Predict the diagnosis for one individual with the class-weighted SVM.
# Values are raw (unscaled) and follow the dataset's feature order: Pregnancies,
# Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age.
input_data = (3, 95.665270, 54.157100, 23.927648, 130.989859, 29.235840, 0.461786, 18.669086)

# Convert to a NumPy array and reshape to one row, since we predict a single instance
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Standardize with the scaler fitted earlier. The scaler was fitted on a
# DataFrame, so the row is labelled with the same column names before
# transforming; passing a bare array would make scikit-learn warn that X does
# not have valid feature names. getattr keeps this working if the scaler
# happened to be fitted without names.
feature_names = getattr(scaler, 'feature_names_in_', None)
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)
std_data = scaler.transform(input_frame)
print(std_data)

# Predict the class for the standardized row (0 = not diabetic, 1 = diabetic)
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')


[[ 0.9078684  -0.19399511 -1.29892898  0.55346145  0.6201243   1.03087062
   0.06225409 -1.70234012]]
[1]
The person is diabetic


