In [55]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.metrics import accuracy_score

In [56]:
# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('diabetes.csv')

In [57]:
# printing the first 5 rows of the dataset
diabetes_dataset.head(4)                    

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [58]:
diabetes_dataset.shape

(768, 9)

In [59]:
# getting the statistical measures of the data
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [60]:
# tres important 
diabetes_dataset['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [61]:
diabetes_dataset.groupby('Outcome').agg(['mean','var','sum'])


Unnamed: 0_level_0,Pregnancies,Pregnancies,Pregnancies,Glucose,Glucose,Glucose,BloodPressure,BloodPressure,BloodPressure,SkinThickness,...,Insulin,BMI,BMI,BMI,DiabetesPedigreeFunction,DiabetesPedigreeFunction,DiabetesPedigreeFunction,Age,Age,Age
Unnamed: 0_level_1,mean,var,sum,mean,var,sum,mean,var,sum,mean,...,sum,mean,var,sum,mean,var,sum,mean,var,sum
Outcome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,3.298,9.103403,1649,109.98,683.362325,54990,68.184,326.274693,34092,19.664,...,34396,30.3042,59.13387,15152.1,0.429734,0.089452,214.867,31.19,136.134168,15595
1,4.865672,13.99687,1304,141.257463,1020.139457,37857,70.824627,461.897968,18981,22.164179,...,26890,35.142537,52.750693,9418.2,0.5505,0.138648,147.534,37.067164,120.302588,9934


The groupby() function in pandas is used to group data in a DataFrame according to one or more keys, and then apply some operation to each group. This function is incredibly powerful for data aggregation and analysis because it allows you to split the data into groups based on some criteria, perform operations on each group independently, and then combine the results.

agg(): Allows you to apply multiple aggregation functions simultaneously. 
other

In [62]:
# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome',axis=1)
 #axis = 0 pour supprimer les lignes
# 2 eme methode
x = diabetes_dataset[["Glucose" ,"Pregnancies" , "BloodPressure" , "SkinThickness" , "Insulin" ,"BMI" , "DiabetesPedigreeFunction" ,"Age"  ]]
Y = diabetes_dataset['Outcome']

axis=1 tells pandas that you're referring to columns.
axis=0 tells pandas that you're referring to rows .

Data Standardization

Standardization is a common preprocessing step to ensure that all features have a mean of 0 and a standard deviation of 1, which is particularly important for models like SVM. 
 is essential for ensuring that your machine learning model treats all features equally, leading to more reliable and accurate predictions. It also aids in the convergence of optimization algorithms, reduces the risk of overfitting, and makes the model more robust to different scales of input data.

In [63]:
# x_stand = x - mean(toutes les x ) / variance (toute les x ) 

# x_stand = |(148 - 120.89)| / 31.97 [-1 , 1]
#x_stand = x / x_max   ----> 
# min max scaler : x-x_max/ xmax - xmin 


# a retenir

scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)

the StandardScaler from sklearn standardizes each feature in X to have a mean of 0 and a standard deviation of 1 
The fit() method in Preprocessing (Scalers...) : fit is used to calculate necessary statistics (e.g., mean and standard deviation) from the training data

Key Differences Between fit and transform:
fit(): Computes the necessary parameters (e.g., mean and standard deviation in StandardScaler) from the data but does not change the data.
transform(): Applies the transformation to the data using the parameters computed during fit.

In [64]:
standardized_data

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [65]:
X = standardized_data
Y = diabetes_dataset['Outcome']

In [66]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, shuffle=True)

The dataset is split into training and testing sets using an 80-20 split (test_size=0.2).
X_train and Y_train are used for training the model, while X_test and Y_test are reserved for evaluating its performance.

test_size=0.2 means that 20% of the entire dataset will be set aside as the test set.
The remaining 80% of the data will be used as the training set.

shuffle:
When True (the default), the data is shuffled before splitting. Shuffling ensures that the split is random and that the data order does not influence the split.
If False, the data is split without shuffling, preserving the original order.

In [67]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


Training the Model


In [75]:
classifier = svm.SVC(kernel='linear')

Choosing the right Classifiers Overview : 
Logistic Regression: Simple and interpretable, great for binary classification.

Decision Trees: Easy to interpret, works well with categorical data but prone to overfitting.

Random Forest: An ensemble method that reduces overfitting by combining multiple decision trees.

Support Vector Machines (SVM): Effective for high-dimensional spaces; requires feature scaling.

k-Nearest Neighbors (k-NN): Simple, effective for small datasets; requires feature scaling and is computationally expensive.

Neural Networks: Powerful, especially for complex and unstructured data; requires large datasets and is computationally expensive.

Naive Bayes: Good for text classification and categorical data; makes strong independence assumptions. 

Choosing the Right Kernel for SVM :
  
  Linear Kernel:
Suitable when the data is linearly separable or when the number of features is very large.
Fast to compute and works well with high-dimensional data.

Polynomial Kernel:
Suitable for data that is not linearly separable but can be separated by a polynomial decision boundary.
Use when you suspect that interactions between features are important.

Radial Basis Function (RBF) or Gaussian Kernel:
Default choice when the data is not linearly separable.
Good for complex data structures, where decision boundaries are non-linear.

Sigmoid Kernel:
Often used in neural networks but can be applied in SVMs to mimic a neural network.

In [82]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [77]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [78]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7833876221498371


In [79]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [80]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7662337662337663


Making a Predictive System

In [81]:
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)
print(input_data_as_numpy_array)
# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
print(input_data_reshaped)
# standardize the input data
std_data = scaler.transform(input_data_reshaped)
#print(std_data)

prediction = classifier.predict(std_data)
#print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[  5.    166.     72.     19.    175.     25.8     0.587  51.   ]
[[  5.    166.     72.     19.    175.     25.8     0.587  51.   ]]
The person is diabetic


