# Explore here

In [18]:
# Your code here

- **IMPORTING THE TRAIN AND TEST DATA AND SPLITTING EACH INTO FEATURES AND TARGET**

In [19]:
import os
import pandas as pd
import csv

# Read the processed train and test CSVs (stored as two separate files).
train_df = pd.read_csv("../data/processed/processed_train.csv")
test_df = pd.read_csv("../data/processed/processed_test.csv")

# "Outcome" is the target column; every remaining column is used as a feature.
X_train, y_train = train_df.drop(columns=["Outcome"]), train_df["Outcome"]
X_test, y_test = test_df.drop(columns=["Outcome"]), test_df["Outcome"]

# Show the training features for a visual sanity check.
display(X_train)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2.0,84.0,0.0,0.000,0.0,0.304,21.0
1,9.0,112.0,82.0,0.000,28.2,1.282,50.0
2,1.0,139.0,46.0,83.000,28.7,0.654,22.0
3,0.0,161.0,50.0,0.000,21.9,0.254,65.0
4,6.0,134.0,80.0,318.125,46.2,0.238,46.0
...,...,...,...,...,...,...,...
609,5.0,139.0,64.0,140.000,28.6,0.411,26.0
610,1.0,96.0,122.0,0.000,22.4,0.207,27.0
611,10.0,101.0,86.0,0.000,45.6,1.136,38.0
612,0.0,141.0,0.0,0.000,42.4,0.205,29.0


- **BOOSTING ALGORITHM MODEL**

In [20]:
from xgboost import XGBClassifier

# Baseline booster: XGBoost defaults with only the random seed pinned.
# An earlier hand-set configuration (n_estimators=100, learning_rate=0.01,
# max_depth=3) used to sit here as a disabled string literal; it is recorded
# in this comment instead so that no dead expression is evaluated.
model = XGBClassifier(random_state=42)
# Last expression of the cell: fits on the training split and displays the
# fitted estimator as the cell output.
model.fit(X_train, y_train)

In [21]:
# Predict with the baseline (default-parameter) booster on the test features.
y_pred = model.predict(X_test)
# Bare name as the last expression: shows the prediction array as the cell output.
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [22]:
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report

# Score the baseline predictions against the test labels.
# A single print call with a newline separator emits the same three lines
# as three consecutive print calls would.
print(
    f"MSE: {mean_squared_error(y_test, y_pred)}",
    f"Accuracy score: {accuracy_score(y_test, y_pred)}",
    f"Classification Report:\n {classification_report(y_test, y_pred)}",
    sep="\n",
)

MSE: 0.2857142857142857
Accuracy score: 0.7142857142857143
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.73      0.77        99
           1       0.58      0.69      0.63        55

    accuracy                           0.71       154
   macro avg       0.70      0.71      0.70       154
weighted avg       0.73      0.71      0.72       154



In [23]:
from pickle import dump

# Persist the baseline booster to disk.
file_path = os.path.join("../models", "boosting_algorithm.sav")
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open() passed to dump() is never closed explicitly.
with open(file_path, "wb") as model_file:
    dump(model, model_file)

- **HYPERPARAMETER TUNING (GRID SEARCH)**

In [24]:
from sklearn.model_selection import GridSearchCV

# Search space: three candidate values for each of four hyperparameters
# (3 * 3 * 3 * 3 = 81 combinations).
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.7, 0.8, 0.9],
}

# Un-tuned base estimator that the search clones for every candidate.
optimized_model = XGBClassifier(random_state=42)

# Exhaustive search scored by accuracy with 5-fold cross-validation;
# verbose=2 logs one line per individual fit.
grid_search = GridSearchCV(
    estimator=optimized_model,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

# Last expression of the cell: runs the search on the training split and
# displays the fitted search object.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1

In [25]:
# Report the winning parameter combination found by the grid search ...
print("Best Hyperparameters:", grid_search.best_params_)
# ... and keep a handle on the corresponding fitted estimator for later cells.
best_model = grid_search.best_estimator_

Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.9}


In [26]:
# Predict on the test features with the best estimator selected by the grid search.
optimized_y_pred = best_model.predict(X_test)
# Bare name as the last expression: shows the prediction array as the cell output.
optimized_y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [27]:
# Score the tuned booster's predictions against the test labels.
# One print call with a newline separator produces the same three lines.
print(
    f"Optimized boosting MSE: {mean_squared_error(y_test, optimized_y_pred)}",
    f"Optimized boosting accuracy score: {accuracy_score(y_test, optimized_y_pred)}",
    f"Optimized boosting classification Report:\n {classification_report(y_test, optimized_y_pred)}",
    sep="\n",
)

Optimized boosting MSE: 0.24025974025974026
Optimized boosting accuracy score: 0.7597402597402597
Optimized boosting classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



In [28]:
# Persist the tuned booster to disk.
file_path = os.path.join("../models", "boosting_algorithm_optimized.sav")
# Save `best_model` (the grid-search winner), not `model` (the baseline):
# dumping `model` here would write the un-tuned estimator into the
# "optimized" file. The context manager also ensures the handle is closed.
with open(file_path, "wb") as model_file:
    dump(best_model, model_file)

- **COMPARING TO OTHER MODELS**

- **Decision tree**

In [29]:
import pickle

# Location of the previously saved decision tree model file.
file_path = os.path.join("../models", "decision_tree_classifier_default_42_OPTIMIZED.sav")

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this presumably loads an artifact produced elsewhere in this project — confirm
# the file comes from a trusted source.
with open(file_path, "rb") as model_file:
    decision_tree_optimized_model = pickle.load(model_file)

In [30]:
# Predict on the test features with the loaded decision tree model.
# NOTE(review): assumes the pickled model was trained on the same feature
# columns as X_test — confirm.
decision_tree_y_pred = decision_tree_optimized_model.predict(X_test)
# Bare name as the last expression: shows the prediction array as the cell output.
decision_tree_y_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [31]:
# Score the decision tree's predictions against the test labels.
# One print call with a newline separator produces the same three lines.
print(
    f"Decision tree MSE: {mean_squared_error(y_test, decision_tree_y_pred)}",
    f"Decision tree accuracy score: {accuracy_score(y_test, decision_tree_y_pred)}",
    f"Decision tree classification Report:\n {classification_report(y_test, decision_tree_y_pred)}",
    sep="\n",
)

Decision tree MSE: 0.23376623376623376
Decision tree accuracy score: 0.7662337662337663
Decision tree classification Report:
               precision    recall  f1-score   support

           0       0.84      0.79      0.81        99
           1       0.66      0.73      0.69        55

    accuracy                           0.77       154
   macro avg       0.75      0.76      0.75       154
weighted avg       0.77      0.77      0.77       154



- **Random forest**

In [32]:
# Location of the previously saved random forest model file.
file_path = os.path.join("../models", "random-forest-optimized-42.sav")

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this presumably loads an artifact produced elsewhere in this project — confirm
# the file comes from a trusted source.
with open(file_path, "rb") as model_file:
    random_forest_optimized_model = pickle.load(model_file)

In [33]:
# Predict on the test features with the loaded random forest model.
# NOTE(review): assumes the pickled model was trained on the same feature
# columns as X_test — confirm.
random_forest_y_pred = random_forest_optimized_model.predict(X_test)
# Bare name as the last expression: shows the prediction array as the cell output.
random_forest_y_pred

array([1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

In [34]:
# Score the random forest's predictions against the test labels.
# One print call with a newline separator produces the same three lines.
print(
    f"Random forest MSE: {mean_squared_error(y_test, random_forest_y_pred)}",
    f"Random forest accuracy score: {accuracy_score(y_test, random_forest_y_pred)}",
    f"Random forest classification Report:\n {classification_report(y_test, random_forest_y_pred)}",
    sep="\n",
)

Random forest MSE: 0.23376623376623376
Random forest accuracy score: 0.7662337662337663
Random forest classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



- **CONCLUSION**

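    On this test set (154 samples) the optimized decision tree has slightly better macro-averaged precision, recall and f1-score (0.75 / 0.76 / 0.75) than the random forest (0.75 / 0.75 / 0.75) and the optimized boosting model (0.74 / 0.73 / 0.74). The decision tree and the random forest tie on accuracy (0.77), just ahead of optimized boosting (0.76), so the differences between the three models are small.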

In [35]:
# Side-by-side recap of the three models on the same test labels.
# The mapping pairs each report label with that model's test predictions;
# dict insertion order fixes the print order (boosting, tree, forest).
model_predictions = {
    "Optimized boosting": optimized_y_pred,
    "Decision tree": decision_tree_y_pred,
    "Random forest": random_forest_y_pred,
}

# One loop replaces three copy-pasted print triplets; the emitted text is
# unchanged, and adding a model to the comparison is now a one-line edit.
for model_name, predictions in model_predictions.items():
    print(f"{model_name} MSE: {mean_squared_error(y_test, predictions)}")
    print(f"{model_name} accuracy score: {accuracy_score(y_test, predictions)}")
    print(f"{model_name} classification Report:\n {classification_report(y_test, predictions)}")

Optimized boosting MSE: 0.24025974025974026
Optimized boosting accuracy score: 0.7597402597402597
Optimized boosting classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154

Decision tree MSE: 0.23376623376623376
Decision tree accuracy score: 0.7662337662337663
Decision tree classification Report:
               precision    recall  f1-score   support

           0       0.84      0.79      0.81        99
           1       0.66      0.73      0.69        55

    accuracy                           0.77       154
   macro avg       0.75      0.76      0.75       154
weighted avg       0.77      0.77      0.77       154

Random forest MSE: 0.23376623376623376
Random forest accuracy score: 0.76623376