In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [3]:
# Load the dataset
file_path = './../datafiles/diabetes.csv'
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop('Outcome', axis=1)  # Features
y = data['Outcome']               # Target (0: Non-diabetic, 1: Diabetic)

In [4]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [5]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [6]:
# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
334,1,95,60,18,58,23.9,0.260,22
139,5,105,72,29,325,36.9,0.159,28
485,0,135,68,42,250,42.3,0.365,24
547,4,131,68,21,166,33.1,0.160,28
18,1,103,30,38,83,43.3,0.183,33
...,...,...,...,...,...,...,...,...
71,5,139,64,35,140,28.6,0.411,26
106,1,96,122,0,0,22.4,0.207,27
270,10,101,86,37,0,45.6,1.136,38
435,0,141,0,0,0,42.4,0.205,29


In [8]:
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
668,6,98,58,33,190,34.0,0.430,43
324,2,112,75,32,0,35.7,0.148,21
624,2,108,64,0,0,30.8,0.158,21
690,8,107,80,0,0,24.6,0.856,34
473,7,136,90,0,0,29.9,0.210,50
...,...,...,...,...,...,...,...,...
619,0,119,0,0,0,32.4,0.141,24
198,4,109,64,44,99,34.8,0.905,26
538,0,127,80,37,210,36.3,0.804,23
329,6,105,70,32,68,30.8,0.122,37


In [9]:
y_train

334    0
139    0
485    1
547    0
18     0
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 537, dtype: int64

In [10]:
y_test

668    0
324    0
624    0
690    0
473    0
      ..
619    1
198    1
538    0
329    0
302    0
Name: Outcome, Length: 231, dtype: int64

In [15]:
# Initialize the AdaBoost classifier with DecisionTreeClassifier as the base estimator
ada_classifier = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # Weak learner (stump)
    n_estimators=50,                                     # Number of boosting rounds
    random_state=42                                     # Random state for reproducibility
)

In [16]:
# Train the AdaBoost classifier on the training data
ada_classifier.fit(X_train, y_train)

In [17]:
# Predict the target labels on the test data
y_pred = ada_classifier.predict(X_test)

In [18]:
# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

In [19]:
print(conf_matrix)

[[122  29]
 [ 30  50]]


In [20]:
# Generate classification report
class_report = classification_report(y_test, y_pred)

In [21]:
print(class_report)

              precision    recall  f1-score   support

           0       0.80      0.81      0.81       151
           1       0.63      0.62      0.63        80

    accuracy                           0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.74      0.74      0.74       231



In [22]:
sample_input = [[5, 116, 74, 25, 0, 32.2, 0.201, 30]]
sample_prediction = ada_classifier.predict(sample_input)
print(sample_prediction)  # Output will be either 0 or 1 (Non-diabetic or Diabetic)

[0]


Using SVM as base Model

In [25]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './../datafiles/diabetes.csv'
data = pd.read_csv(file_path)

# Separate features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the SVC model
svc = SVC(probability=True, kernel='sigmoid')  # Using a linear kernel for simplicity

# Initialize the AdaBoost classifier with SVC as the base estimator
ada_classifier_svc = AdaBoostClassifier(
    estimator=svc,  # SVC as the base model
    n_estimators=50,     # Number of boosting rounds
    random_state=42
)

# Train the AdaBoost classifier with SVC on the training data
ada_classifier_svc.fit(X_train, y_train)

# Predict the target labels on the test data
y_pred_svc = ada_classifier_svc.predict(X_test)

# Compute confusion matrix
conf_matrix_svc = confusion_matrix(y_test, y_pred_svc)
print(conf_matrix)

# Generate classification report
class_report_svc = classification_report(y_test, y_pred_svc)



# Display the classification report
print(class_report_svc)

# Test with a sample input for prediction
sample_input = [[5, 116, 74, 25, 0, 32.2, 0.201, 30]]  # Example input
sample_prediction_svc = ada_classifier_svc.predict(sample_input)

# Show the predicted outcome (0: Non-diabetic, 1: Diabetic)
print("Sample input prediction (0: Non-diabetic, 1: Diabetic):", sample_prediction_svc)


[[122  29]
 [ 30  50]]
              precision    recall  f1-score   support

           0       0.65      1.00      0.79       151
           1       0.00      0.00      0.00        80

    accuracy                           0.65       231
   macro avg       0.33      0.50      0.40       231
weighted avg       0.43      0.65      0.52       231

Sample input prediction (0: Non-diabetic, 1: Diabetic): [0]


Using KNN as base model

In [27]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './../datafiles/diabetes.csv'
data = pd.read_csv(file_path)

# Separate features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # Using 5 neighbors for simplicity

# Initialize the AdaBoost classifier with KNN as the base estimator
ada_classifier_knn = AdaBoostClassifier(
    estimator=knn,  # KNN as the base model
    n_estimators=50,     # Number of boosting rounds
    random_state=42
)

# Train the AdaBoost classifier with KNN on the training data
ada_classifier_knn.fit(X_train, y_train)

# Predict the target labels on the test data
y_pred_knn = ada_classifier_knn.predict(X_test)

# Compute confusion matrix
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)

# Generate classification report
class_report_knn = classification_report(y_test, y_pred_knn)

# Plot the confusion matrix for better visualization
plt.figure(figsize=(6,4))
sns.heatmap(conf_matrix_knn, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (AdaBoost with KNN)')
plt.ylabel('Actual Class')
plt.xlabel('Predicted Class')
plt.show()

# Display the classification report
print(class_report_knn)

# Test with a sample input for prediction
sample_input = [[5, 116, 74, 25, 0, 32.2, 0.201, 30]]  # Example input
sample_prediction_knn = ada_classifier_knn.predict(sample_input)

# Show the predicted outcome (0: Non-diabetic, 1: Diabetic)
print("Sample input prediction (0: Non-diabetic, 1: Diabetic):", sample_prediction_knn)


ValueError: KNeighborsClassifier doesn't support sample_weight.

Using Logistic Regression as base model

In [30]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './../datafiles/diabetes.csv'
data = pd.read_csv(file_path)

# Separate features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Logistic Regression model
logistic_regression = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence

# Initialize the AdaBoost classifier with Logistic Regression as the base estimator
ada_classifier_logreg = AdaBoostClassifier(
    estimator=logistic_regression,  # Logistic Regression as base model
    n_estimators=50,                     # Number of boosting rounds
    random_state=42
)

# Train the AdaBoost classifier with Logistic Regression on the training data
ada_classifier_logreg.fit(X_train, y_train)

# Predict the target labels on the test data
y_pred_logreg = ada_classifier_logreg.predict(X_test)

# Compute confusion matrix
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
print(conf_matrix_logreg)

# Generate classification report
class_report_logreg = classification_report(y_test, y_pred_logreg)

# Display the classification report
print(class_report_logreg)

# Test with a sample input for prediction
sample_input = [[5, 116, 74, 25, 0, 32.2, 0.201, 30]]  # Example input
sample_prediction_logreg = ada_classifier_logreg.predict(sample_input)

# Show the predicted outcome (0: Non-diabetic, 1: Diabetic)
print("Sample input prediction (0: Non-diabetic, 1: Diabetic):", sample_prediction_logreg)


[[115  36]
 [ 34  46]]
              precision    recall  f1-score   support

           0       0.77      0.76      0.77       151
           1       0.56      0.57      0.57        80

    accuracy                           0.70       231
   macro avg       0.67      0.67      0.67       231
weighted avg       0.70      0.70      0.70       231

Sample input prediction (0: Non-diabetic, 1: Diabetic): [0]
