For this task, you are required to build a machine learning model to predict the outcome variable. This will be a binary classification task, as the target variable is binary. You should select at least two models, one of which should be an ensemble model, and compare their performance.

- Train the models: Train the selected models on the training set.
- Model evaluation: Evaluate the trained models on the testing set using appropriate evaluation metrics, such as accuracy, precision, recall, F1-score, and ROC-AUC.
- Model comparison: Compare the performance of the selected models and choose the best-performing model based on the evaluation metrics. You can also perform additional analysis, such as model tuning and cross-validation, to improve the model's performance.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [48]:
from sklearn import tree
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [145]:
import os

In [148]:
# Show the kernel's current working directory. The dataset files in the cells
# below are opened by bare name, so they are resolved relative to this directory.
os.getcwd()

'C:\\Users\\user\\ml-project-supervised-learning'

# Random Forest 

In [140]:
# Fit and score a 100-tree random forest on each dataset variant in turn.
filepaths = ['df_full', 'df_no_insulin', 'df_pregnant']

for filepath in filepaths:
    # Load the dataset for this variant
    df = pd.read_csv(filepath)

    # 'Outcome' is the label; every other column is used as a feature
    y = df['Outcome']
    X = df.drop('Outcome', axis=1)

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=50
    )

    # Build and fit the forest in one expression (fit() hands back the estimator)
    model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Overall accuracy on the held-out rows
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {filepath}: {accuracy:.2f}')

    # Per-class precision / recall / F1 table
    report = classification_report(y_test, y_pred)
    print(f'Classification Report for {filepath}:\n{report}\n')

Accuracy for df_full: 0.75
Classification Report for df_full:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       101
           1       0.65      0.57      0.61        53

    accuracy                           0.75       154
   macro avg       0.72      0.70      0.71       154
weighted avg       0.74      0.75      0.74       154


Accuracy for df_no_insulin: 0.73
Classification Report for df_no_insulin:
              precision    recall  f1-score   support

           0       0.76      0.85      0.80       101
           1       0.63      0.49      0.55        53

    accuracy                           0.73       154
   macro avg       0.70      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154


Accuracy for df_pregnant: 0.79
Classification Report for df_pregnant:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84        86
           1       0.69      

# Gradient Boosting Classifier

In [142]:
# GradientBoostingClassifier is otherwise only imported in a later cell of this
# notebook; importing it here lets the cell run on a fresh kernel (Restart & Run All).
from sklearn.ensemble import GradientBoostingClassifier

# Fit and score a gradient-boosting classifier on each dataset variant.
filepaths = ['df_full', 'df_no_insulin', 'df_pregnant']

for filepath in filepaths:
    # Read the CSV file
    df = pd.read_csv(filepath)

    # Features and target variable
    X = df.drop(columns=['Outcome'])  # Features
    y = df['Outcome']  # Target variable

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=998)

    # Initialize the model
    model = GradientBoostingClassifier(loss='log_loss', random_state=9898)

    # Train the model, then predict on the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy (printed once; the duplicated print was a copy-paste leftover)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {filepath}: {accuracy:.2f}')

    # Generate a classification report (computed once instead of twice)
    report = classification_report(y_test, y_pred)
    print(f'Classification Report for {filepath}:\n{report}\n')

Accuracy for df_full: 0.78
Accuracy for df_full: 0.78
Classification Report for df_full:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       100
           1       0.69      0.69      0.69        54

    accuracy                           0.78       154
   macro avg       0.76      0.76      0.76       154
weighted avg       0.78      0.78      0.78       154


Accuracy for df_no_insulin: 0.73
Accuracy for df_no_insulin: 0.73
Classification Report for df_no_insulin:
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       100
           1       0.61      0.65      0.63        54

    accuracy                           0.73       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.74      0.73      0.74       154


Accuracy for df_pregnant: 0.75
Accuracy for df_pregnant: 0.75
Classification Report for df_pregnant:
              precision    recall  f1-score   s

# Hist Gradient Boosting Classifier

In [143]:
# HistGradientBoostingClassifier is otherwise only imported in a later cell of this
# notebook; importing it here lets the cell run on a fresh kernel (Restart & Run All).
from sklearn.ensemble import HistGradientBoostingClassifier

# Fit and score a histogram-based gradient-boosting classifier on each dataset variant.
filepaths = ['df_full', 'df_no_insulin', 'df_pregnant']

for filepath in filepaths:
    # Read the CSV file
    df = pd.read_csv(filepath)

    # Features and target variable
    X = df.drop(columns=['Outcome'])  # Features
    y = df['Outcome']  # Target variable

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=998)

    # Initialize the model
    model = HistGradientBoostingClassifier(loss='log_loss', random_state=9898)

    # Train the model, then predict on the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate accuracy (printed once; the duplicated print was a copy-paste leftover)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {filepath}: {accuracy:.2f}')

    # Generate a classification report (computed once instead of twice)
    report = classification_report(y_test, y_pred)
    print(f'Classification Report for {filepath}:\n{report}\n')

Accuracy for df_full: 0.76
Accuracy for df_full: 0.76
Classification Report for df_full:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       100
           1       0.65      0.67      0.66        54

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154


Accuracy for df_no_insulin: 0.75
Accuracy for df_no_insulin: 0.75
Classification Report for df_no_insulin:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       100
           1       0.64      0.67      0.65        54

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.76      0.75      0.75       154


Accuracy for df_pregnant: 0.77
Accuracy for df_pregnant: 0.77
Classification Report for df_pregnant:
              precision    recall  f1-score   s

# BaggingClassifier

In [84]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bagging ensemble built around a default KNeighborsClassifier, configured with
# max_samples=0.5 and max_features=0.5.
# NOTE(review): `bagging` is not referenced again in the visible part of the
# notebook, and no random_state is set here — confirm whether this cell is still needed.
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

In [144]:
# Fit and score a bagged ensemble of decision trees on each dataset variant.
filepaths = ['df_full', 'df_no_insulin', 'df_pregnant']

for filepath in filepaths:
    # Read the CSV file
    df = pd.read_csv(filepath)

    # Features and target variable
    X = df.drop(columns=['Outcome'])  # Features
    y = df['Outcome']  # Target variable

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=998)

    # Base learner that the ensemble replicates
    estimator = DecisionTreeClassifier(random_state=42)

    # Create a BaggingClassifier with 100 base estimators (decision trees)
    bagging_classifier = BaggingClassifier(estimator=estimator, n_estimators=100, random_state=42)

    # Train the BaggingClassifier
    bagging_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = bagging_classifier.predict(X_test)

    # Calculate accuracy (printed once; the duplicated print was a copy-paste leftover)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {filepath}: {accuracy:.2f}')

    # Generate a classification report (computed once instead of twice)
    report = classification_report(y_test, y_pred)
    print(f'Classification Report for {filepath}:\n{report}\n')

Accuracy for df_full: 0.80
Accuracy for df_full: 0.80
Classification Report for df_full:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       100
           1       0.74      0.65      0.69        54

    accuracy                           0.80       154
   macro avg       0.78      0.76      0.77       154
weighted avg       0.80      0.80      0.80       154


Accuracy for df_no_insulin: 0.81
Accuracy for df_no_insulin: 0.81
Classification Report for df_no_insulin:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       100
           1       0.75      0.67      0.71        54

    accuracy                           0.81       154
   macro avg       0.79      0.77      0.78       154
weighted avg       0.80      0.81      0.80       154


Accuracy for df_pregnant: 0.80
Accuracy for df_pregnant: 0.80
Classification Report for df_pregnant:
              precision    recall  f1-score   s

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.83      0.86      0.85        51
           1       0.87      0.84      0.86        57

    accuracy                           0.85       108
   macro avg       0.85      0.85      0.85       108
weighted avg       0.85      0.85      0.85       108



In [24]:
from sklearn.metrics import confusion_matrix

# Build the confusion matrix from the held-out labels and the current predictions
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Emit the heading and the matrix on separate lines with a single print call
print("Confusion Matrix:", confusion, sep="\n")


Confusion Matrix:
[[44  7]
 [ 9 48]]


In [18]:
# Display the (rows, columns) shape of whichever DataFrame `df` currently refers to.
# NOTE(review): `df` is reassigned by several cells, so the value shown depends on
# which of them ran last.
df.shape

(536, 15)

class sklearn.ensemble.HistGradientBoostingClassifier(loss='log_loss', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, interaction_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None, class_weight=None)

In [27]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [37]:

# Fit a histogram-based gradient-boosting classifier on the train/test split
# (X_train, X_test, y_train, y_test) left in the kernel by an earlier cell.
model = HistGradientBoostingClassifier(loss='log_loss', random_state=9898)
# Train the model
model.fit(X_train, y_train)

# Predict BEFORE scoring: previously the accuracy was computed from whatever
# y_pred a prior cell had left behind, i.e. a different model's predictions.
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

In [39]:
# Refresh y_pred with the current `model`'s predictions on X_test so the
# metric cells that follow score this model.
y_pred = model.predict(X_test)

In [40]:
# Fraction of held-out rows that the current predictions get right
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 table for the same predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        51
           1       0.86      0.75      0.80        57

    accuracy                           0.81       108
   macro avg       0.81      0.81      0.81       108
weighted avg       0.81      0.81      0.81       108



In [41]:
# Build the confusion matrix from the held-out labels and the current predictions
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Emit the heading and the matrix on separate lines with a single print call
print("Confusion Matrix:", confusion, sep="\n")

Confusion Matrix:
[[44  7]
 [14 43]]


In [43]:
from sklearn.ensemble import GradientBoostingClassifier

In [44]:
# Fit a gradient-boosting classifier on the train/test split
# (X_train, X_test, y_train, y_test) left in the kernel by an earlier cell.
model = GradientBoostingClassifier(loss='log_loss', random_state=9898)
# Train the model
model.fit(X_train, y_train)

# Predict BEFORE scoring: previously the accuracy was computed from whatever
# y_pred a prior cell had left behind, i.e. a different model's predictions.
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        51
           1       0.83      0.79      0.81        57

    accuracy                           0.81       108
   macro avg       0.81      0.81      0.81       108
weighted avg       0.81      0.81      0.81       108



In [45]:
# Build the confusion matrix from the held-out labels and the current predictions
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Emit the heading and the matrix on separate lines with a single print call
print("Confusion Matrix:", confusion, sep="\n")

Confusion Matrix:
[[42  9]
 [12 45]]


In [46]:
# Load the 'df_no_insulin' dataset variant (path is relative to the working directory)
df = pd.read_csv('df_no_insulin')

In [49]:
# 'Outcome' is the label; every other column of `df` is used as a feature
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=998
)

# Build and fit the forest in one expression (fit() hands back the estimator)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Overall accuracy on the held-out rows
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 table
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       100
           1       0.67      0.61      0.64        54

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.76      0.76       154



In [50]:
# Load the 'df_pregnant' dataset variant (path is relative to the working directory)
df = pd.read_csv('df_pregnant')

In [51]:
# 'Outcome' is the label; every other column of `df` is used as a feature
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=998
)

# Build and fit the forest in one expression (fit() hands back the estimator)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Overall accuracy on the held-out rows
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 table
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.85      0.88      0.86        94
           1       0.68      0.61      0.64        38

    accuracy                           0.80       132
   macro avg       0.76      0.74      0.75       132
weighted avg       0.80      0.80      0.80       132

