## Travail à faire: Prédire avec précision si les patients de l'ensemble de données sont diabètiques ou non

### En faisant la comparaison des modèles suivants:
#### 1) Arbre de décision
#### 2) SVM
#### 3) Régression Logistique
#### 4) Naives Bayes

## 1)Arbre de décision

In [56]:
# importation des bibliothèques
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [57]:
# importation et affichage des données
data = pd.read_csv('diabetes.csv')
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [58]:
# Combien y'a t'il d'exemples dans chaque classe
np.bincount(y)

array([500, 268], dtype=int64)

In [59]:
# L'ensemble des variables du dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [60]:
# Création du jeu d'Apprentissage et de Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [61]:
# Construire un arbre de Décision
modeltree = tree.DecisionTreeClassifier()
modeltree.fit(X_train, y_train)

In [62]:
# Calculer le taux de performance du Modèle
accuracy = modeltree.score(X_test, y_test)
print('Accuracy Arbre Test: ', accuracy)

Accuracy Arbre Test:  0.7077922077922078


In [63]:
# Prédictions sur l'ensemble de test
pred = modeltree.predict(X_test)
# Affichage des métriques du modèle
precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.5714285714285714
Recall: 0.5384615384615384
F1-score: 0.5544554455445544


In [64]:
# Prédiction Test
modeltree.predict([[6, 148, 72, 35, 0, 33.6, 0.627, 30]])

array([1], dtype=int64)

## 2) SVM

In [65]:
# Division des données en jeu d'Entrainement et jeu de Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [66]:
# Création du modèle SVM
classifier = SVC(kernel = 'linear', random_state=0)
classifier.fit(X_train, y_train)

In [67]:
# Prédiction sur le Test set
y_pred = classifier.predict(X_test)
# Matrice de Confusion
confusion_matrix(y_test, y_pred)

array([[87, 10],
       [25, 32]], dtype=int64)

In [69]:
# Rapport de classification
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.90      0.83        97
           1       0.76      0.56      0.65        57

    accuracy                           0.77       154
   macro avg       0.77      0.73      0.74       154
weighted avg       0.77      0.77      0.76       154



In [70]:
# Calcul du taux de performance du Modèle
accur = classifier.score(X_test, y_test)
print('Accuracy SVM: ', accur)

Accuracy SVM:  0.7727272727272727


In [71]:
#Faire de nouvelles prédictions 
classifier.predict([[5, 110, 65, 35, 0, 34, 0.125, 46]])

array([0], dtype=int64)

## 3) Régression Logistique

In [72]:
# Création du jeu d'Apprentissage et de Test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [73]:
# Construction du Modèle
classifier = LogisticRegression(solver='liblinear', random_state=0)
classifier.fit(X_train, y_train)

In [74]:
# Calcul du taux de performance du Modèle
y_pred = classifier.predict(X_test)
acc = classifier.score(X_test, y_test)
print('Accuracy Logistic Regression: ', acc)

Accuracy Logistic Regression:  0.8181818181818182


In [75]:
# Rapport de classification
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       107
           1       0.76      0.60      0.67        47

    accuracy                           0.82       154
   macro avg       0.80      0.76      0.77       154
weighted avg       0.81      0.82      0.81       154



In [76]:
#Faire de nouvelles prédictions 
classifier.predict([[6, 148, 72, 35, 0, 33.6, 0.627, 30]])

array([1], dtype=int64)

## 4) Naives Bayes

In [77]:
# Entrainement et Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [78]:
NB = GaussianNB()
NB.fit(X_train, y_train)

In [79]:
NB.score(X_test, y_test)

0.7077922077922078

In [80]:
predictions = NB.predict(X_test)
list(zip(predictions, y_test))

[(0, 0),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 1),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 1),
 (1, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (0, 1),
 (1, 0),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 0),
 (0, 1),
 (1, 1),
 (0, 1),
 (0, 1),
 (0, 0),
 (0, 1),
 (0, 0),
 (1, 0),
 (1, 0),
 (0, 1),
 (0, 1),
 (0, 0),
 (1, 0),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 1),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 0),
 

In [81]:
from sklearn.metrics import accuracy_score
print('Accuracy Naive Bayes: ', accuracy_score(y_test, predictions))

Accuracy Naive Bayes:  0.7077922077922078


In [82]:
# Rapport de classification
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154



In [83]:
# Prédiction Test
NB.predict([[6, 148, 72, 35, 0, 33.6, 0.627, 30]])

array([0], dtype=int64)