In [None]:
import pandas as pd

# Load the dataset; the first CSV column is used as the row index.
DATA_PATH = 'dataset-diabete-68e2810ab0d7e949117525.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df

: 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df's numeric columns.
df.describe()

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Number of entries equal to 0 in each column.
df.eq(0).sum()

In [None]:
# Print df's column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# First five rows of df.
df.head()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

In [None]:
# (number of rows, number of columns) of df.
df.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt   # <-- la bonne importation

# Scatter the number of pregnancies against age, then against glucose,
# drawing each relationship as its own figure.
for y_column in ('Age', 'Glucose'):
    sns.scatterplot(data=df, x='Pregnancies', y=y_column)
    plt.show()

In [None]:
# Pairwise correlation between the columns of df.
df.corr()

In [None]:
# Annotated heatmap of the column-to-column correlation matrix.
corr_matrix = df.corr()
heatmap_options = dict(annot=True, fmt=".2f", cmap='viridis', linewidths=.5, cbar=True)
sns.heatmap(corr_matrix, **heatmap_options)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Hold-out experiment: predict DiabetesPedigreeFunction from every other
# column with k-nearest-neighbours regression and compare the test-set MSE
# obtained for k = 1..25.
y = df['DiabetesPedigreeFunction']
X = df.drop(columns="DiabetesPedigreeFunction")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

k_range = range(1, 26)
mse_values = []
for k in k_range:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mse_values.append(mean_squared_error(y_test, y_pred))

# Error curve: one point per candidate k.
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(k_range, mse_values, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Mean Squared Error vs. K Value', fontsize=16)
ax.set_xlabel('K (Number of Neighbors)', fontsize=12)
ax.set_ylabel('Mean Squared Error (MSE)', fontsize=12)
ax.grid(True)
plt.show()

# k with the lowest test MSE (the first one if several tie).
optimal_k = k_range[int(np.argmin(mse_values))]
print(f"The optimal value for 'k' is: {optimal_k}")

In [None]:
# Treat 0 as a missing-value marker only in measurement columns where a zero
# reading cannot be a real value. Columns where 0 is genuine (for example
# Pregnancies) are left untouched, so the imputer does not overwrite them.
# NOTE(review): SkinThickness and Insulin are assumed from the usual diabetes
# schema; names absent from df are skipped. Adjust the list to this dataset.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_as_missing = [col for col in zero_as_missing if col in df.columns]
if zero_as_missing:
    df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Display df as it currently stands in the kernel.
df

In [None]:
from sklearn.impute import KNNImputer

# Fill each NaN from the 12 nearest rows (KNNImputer with n_neighbors=12).
imputer = KNNImputer(n_neighbors=12)
imputed_values = imputer.fit_transform(df)

# fit_transform returns a bare ndarray: restore the column names AND the
# original row index so imputed rows can still be matched back to df.
new_df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

new_df

In [None]:
# Every column except DiabetesPedigreeFunction gets its own boxplot.
colums_plot = new_df.columns.drop("DiabetesPedigreeFunction")

for column in colums_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.set_title(f'the {column} plot  ')
    sns.boxplot(x=new_df[column], ax=ax)
    ax.grid(True)
    plt.show()

# outliers
import numpy as np
from scipy import stats

cleaned_df = new_df.copy()

# Log-transform the strongly right-skewed columns (sample skewness > 1).
for column in colums_plot:
    if stats.skew(cleaned_df[column]) > 1:
        min_val = cleaned_df[column].min()
        # When the column holds non-positive values, shift it so every value
        # is at least 1 before taking the log; otherwise take the log directly.
        shift = abs(min_val) + 1 if min_val <= 0 else 0
        cleaned_df[column] = np.log(cleaned_df[column] + shift)

# Drop extreme outliers: keep rows inside [Q1 - 3*IQR, Q3 + 3*IQR].
# The bounds of each column are computed on the rows that survived the
# previous columns' filters.
for column in colums_plot:
    q1, q3 = cleaned_df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    in_range = cleaned_df[column].between(q1 - 3 * iqr, q3 + 3 * iqr)
    cleaned_df = cleaned_df[in_range]

# Detach the result from the intermediate filtered frames so that columns
# added to cleaned_df later do not raise SettingWithCopyWarning.
cleaned_df = cleaned_df.copy()

print(f"Dataset cleaned: {len(new_df)} -> {len(cleaned_df)} rows")

In [None]:
# One boxplot per plotted column, drawn from the outlier-filtered frame.
for column in colums_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.set_title(f'Cleaned {column} plot')
    sns.boxplot(x=cleaned_df[column], ax=ax)
    ax.grid(True)
    plt.show()


In [None]:
from scipy import stats

# z-score every plotted column of cleaned_df; the displayed frame holds one
# row per column.
array = [stats.zscore(cleaned_df[column]) for column in colums_plot]
pd.DataFrame(array)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Later cells append these label columns to cleaned_df. They are outputs, not
# features: excluding them keeps this cell idempotent when it is rerun after
# clustering and stops the labels from leaking into scaled_df.
derived_label_columns = ['Cluster', 'risk_category']
numeric_df = (
    cleaned_df
    .drop(columns=derived_label_columns, errors='ignore')
    .select_dtypes(include=['number'])
)

# Standardize each numeric column, keeping column names and the row index of
# cleaned_df.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(numeric_df)
scaled_df = pd.DataFrame(scaled_array, columns=numeric_df.columns, index=numeric_df.index)
scaled_df.head()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Choose k on the standardized features: KMeans is distance-based, so scores
# computed on unscaled columns would be dominated by the widest-ranged one
# and would not describe the model fitted on scaled_df further down.
# Label columns are dropped defensively in case scaled_df was rebuilt after
# clustering.
# NOTE(review): DiabetesPedigreeFunction is excluded here but the final
# KMeans is fit on all of scaled_df — confirm which feature set is intended.
X_cleaned = scaled_df.drop(
    columns=['DiabetesPedigreeFunction', 'Cluster', 'risk_category'],
    errors='ignore',
)

# Average silhouette score for k = 2..19.
k_range = range(2, 20)
silhouette_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    labels = kmeans.fit_predict(X_cleaned)
    silhouette_scores.append(silhouette_score(X_cleaned, labels))

plt.figure(figsize=(10, 6))
plt.plot(k_range, silhouette_scores, marker='o')
plt.title('Silhouette Score vs. Number of Clusters (k)')
plt.xlabel('k')
plt.ylabel('Average Silhouette Score')
plt.grid(True)
plt.show()

In [None]:
# k whose average silhouette score is highest (the first such k on ties).
best_score = max(silhouette_scores)
optimal_k = k_range[silhouette_scores.index(best_score)]
optimal_k

In [None]:
# Elbow method: record the KMeans inertia obtained for k = 1..10.
k_range = range(1, 11)
inertia_values = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_cleaned)
    inertia_values.append(kmeans.inertia_)

# Inertia against k, with one tick per candidate k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia_values, marker='o')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (WCSS)')
ax.grid(True)
ax.set_xticks(k_range)
plt.show()


In [None]:
# Display the standardized feature table.
scaled_df

In [None]:
import pandas as pd
from sklearn.cluster import KMeans

# Final clustering: KMeans with a fixed number of clusters on the
# standardized features.
optimal_k = 2
X = scaled_df

kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto').fit(X)

# Record each row's cluster id in cleaned_df.
cleaned_df['Cluster'] = kmeans_final.labels_

print(cleaned_df.head())
print(cleaned_df['Cluster'].value_counts())




In [None]:

# Glucose against blood pressure, coloured by assigned cluster.
ax = sns.scatterplot(data=cleaned_df, x='BloodPressure', y='Glucose', hue='Cluster')
ax.set_title('Clustering Visualization')
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project onto two principal components. PCA is variance-driven, so it is
# fit on the standardized features rather than on cleaned_df, which is
# unscaled and also carries the Cluster label column.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_df)
pd.DataFrame(principal_components)


In [None]:
from sklearn.decomposition import PCA

# Two-component PCA of the standardized features (the same matrix used for
# clustering), tagged with each row's cluster id by position.
pca = PCA(n_components=2)
pca_df = pd.DataFrame(
    pca.fit_transform(scaled_df),
    columns=['Principal1', 'Principal2'],
).assign(Cluster=cleaned_df['Cluster'].values)

pca_df



In [None]:
# Clusters drawn in the plane of the two principal components.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='Principal1', y='Principal2', hue='Cluster', alpha=1, ax=ax)
ax.set_title('PCA Visualization of Clusters')
plt.show()

In [None]:
# Size of each cluster.
cluster_sizes = cleaned_df['Cluster'].value_counts()
print(cluster_sizes)

# Per-cluster mean of the columns used to profile the clusters.
profile_columns = ['Glucose', 'BMI', 'DiabetesPedigreeFunction']
cluster_means = cleaned_df.groupby('Cluster')[profile_columns].mean()
print(cluster_means)

# The risk category is a straight copy of the cluster id.
cleaned_df['risk_category'] = cleaned_df['Cluster']



In [None]:

# Attach the risk labels to the PCA coordinates by position before plotting.
# cleaned_df keeps its post-filtering row index while pca_df has a fresh
# 0..n-1 index, so passing the Series straight to `hue` would make seaborn
# align the two on index and colour points with wrong or missing labels.
risk_plot_df = pca_df.assign(risk_category=cleaned_df['risk_category'].to_numpy())
sns.scatterplot(x='Principal1', y='Principal2', data=risk_plot_df, hue='risk_category')
plt.title('Risk Categories')
plt.show()


In [None]:
# Mean position of each cluster along the two principal components.
component_columns = ['Principal1', 'Principal2']
moyens = pca_df.groupby('Cluster')[component_columns].mean()
moyens

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Importations des modèles
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:


# Build the classification split here. The X_train / y_train otherwise left
# in the kernel come from the KNN regression experiment, whose target is the
# continuous DiabetesPedigreeFunction, which these classifiers cannot be
# trained on.
# Features: the standardized matrix (label columns dropped defensively).
# Target: the cluster-derived risk category.
# NOTE(review): the labels were produced by KMeans from these same features,
# so very high accuracies are expected and are not evidence of generalization.
X = scaled_df.drop(columns=['Cluster', 'risk_category'], errors='ignore')
y = cleaned_df['risk_category']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Candidate classifiers, keyed by display name.
models = {
    "Régression Logistique": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (Support Vector Machine)": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost (XGB)": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Per model: the fitted estimator, its test accuracy and the full
# classification report as a dict.
results = {}

for name, model in models.items():
    print(f"training de : {name}...")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        "Modèle": model,
        "Précision (Accuracy)": accuracy,
        "Rapport de Classification": classification_report(y_test, y_pred, output_dict=True)
    }

    print(f"{name} terminé. Précision : {accuracy:.4f}\n")


In [None]:

# Print each model's accuracy, plus recall and F1 for class '1' when the
# classification report contains that class.
for name, res in results.items():
    print(f"\n--- {name} ---")
    print(f"Précision (Accuracy) : {res['Précision (Accuracy)']:.4f}")

    report = res['Rapport de Classification']

    if '1' not in report:
        # No class '1' in the report: show the weighted-average F1 instead.
        print(f"  F1-Score (Moyenne pondérée) : {report['weighted avg']['f1-score']:.4f}")
        continue

    class_one = report['1']
    print(f"  Rappel (Recall) Classe 1 : {class_one['recall']:.4f}")
    print(f"  F1-Score Classe 1 : {class_one['f1-score']:.4f}")
        
