In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [5]:
# Load the diabetes dataset into a DataFrame.
# The path is relative, so "diabetes.csv" must sit in the notebook's working directory.
df = pd.read_csv("diabetes.csv")


In [7]:
# Peek at the top five records to confirm the columns were parsed as expected
print(df.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [9]:
# Report the dataset's (rows, columns) dimensions, then per-column summary
# statistics (count, mean, std, min, quartiles, max).
# Several measurement columns report a minimum of 0 in this summary.
# NOTE(review): zeros in e.g. Glucose or BloodPressure presumably encode missing
# values rather than real measurements -- confirm against the data source.
print("Shape of the dataset:", df.shape)
print(df.describe())

Shape of the dataset: (768, 9)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min 

In [11]:
# Check the class balance of the target column 'Outcome':
# value_counts() lists how many rows carry each label.
print("Value counts for Outcome:\n", df["Outcome"].value_counts())

Value counts for Outcome:
 Outcome
0    500
1    268
Name: count, dtype: int64


# 0 = non-diabetic
# 1 = diabetic

In [13]:
# Separate the predictors from the label:
#   X -> every column except 'Outcome'
#   y -> the 'Outcome' column itself
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

In [15]:
# Create the scaler used to standardize the features.
# Nothing is computed here: the scaler is fitted and applied in the next cells.
scaler = StandardScaler()

In [17]:
# Fit the scaler on the feature table X so it learns the per-column scaling statistics.
# NOTE(review): X is the full dataset at this point and the train/test split only
# happens in a later cell, so rows that end up in the test set also influence the
# scaling. Consider fitting the scaler on the training split only.
scaler.fit(X)

In [19]:
# Apply the fitted scaler to the features; the standardized values are kept
# under a separate name before X is rebound in the next cell.
standardized_data = scaler.transform(X)

In [22]:
# Replace the raw feature table with its standardized version; from here on
# X is the NumPy array shown in this cell's output.
# NOTE(review): because X is rebound, re-running the earlier scaler.fit(X) cell
# after this one would fit the scaler on already-standardized data.
X = standardized_data
X  # bare expression: echoes the standardized array as the cell output

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [22]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Sanity-check the split: print the (rows, features) shape of each partition.
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (614, 8)
Testing set shape: (154, 8)


In [26]:
# Build a support vector classifier with a linear kernel (all other
# hyperparameters left at their defaults) and fit it on the training split.
classifier = svm.SVC(kernel="linear")
classifier.fit(X_train, y_train)

In [28]:
# Score the classifier on the same rows it was trained on. This is an
# optimistic baseline to compare against the held-out accuracy.
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
train_accuracy = accuracy_score(y_train, X_train_prediction)
print("Training accuracy:", train_accuracy)

Training accuracy: 0.7736156351791531


In [30]:
# Score the classifier on the held-out rows it never saw during fitting.
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
test_accuracy = accuracy_score(y_test, X_test_prediction)
print("Testing accuracy:", test_accuracy)

Testing accuracy: 0.7532467532467533


In [34]:
# A single hand-written record to classify. Values are positional and are
# assumed to follow the dataset's feature column order (annotated per entry).
input_sample = (
    5,     # Pregnancies
    175,   # Glucose
    56,    # BloodPressure
    20,    # SkinThickness
    143,   # Insulin
    19.2,  # BMI
    0.54,  # DiabetesPedigreeFunction
    52,    # Age
)

In [36]:
# Turn the record into a NumPy array and add a leading axis so it forms a
# one-row, 2-D matrix (one sample, one column per feature).
input_array = np.asarray(input_sample)[np.newaxis, :]

In [38]:
# Standardize the input sample with the already-fitted scaler.
# The scaler was fitted on a DataFrame, so it remembers the feature column
# names; handing it a bare ndarray makes scikit-learn warn that the input has
# no valid feature names. Wrapping the row in a DataFrame with those names
# avoids the warning and leaves the scaled values unchanged.
feature_names = getattr(scaler, "feature_names_in_", None)
if feature_names is not None:
    input_features = pd.DataFrame(input_array, columns=feature_names)
else:
    # Scaler was fitted on a plain array (no names recorded): pass the array as-is.
    input_features = input_array
std_data = scaler.transform(input_features)



In [41]:
# Classify the standardized sample. predict() returns an array with one label
# per input row, so a single-row input gives a one-element array.
# Per the notebook's label legend: 0 = non-diabetic, 1 = diabetic.
prediction = classifier.predict(std_data)
print("Prediction for the input sample:", prediction)

Prediction for the input sample: [0]


