# Explore here

In [63]:
# Your code here

- **IMPORTING THE TRAIN AND TEST DATA AND SPLITTING INTO FEATURES AND TARGET**

In [64]:
import os
import pandas as pd
import csv

# Load the already-processed train and test partitions from disk.
train_df = pd.read_csv("../data/processed/processed_train.csv")
test_df = pd.read_csv("../data/processed/processed_test.csv")

# Separate the "Outcome" target column from the remaining feature columns.
y_train = train_df["Outcome"]
X_train = train_df.drop(columns=["Outcome"])
y_test = test_df["Outcome"]
X_test = test_df.drop(columns=["Outcome"])

# Show the training features as a rich table.
display(X_train)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2.0,84.0,0.0,0.000,0.0,0.304,21.0
1,9.0,112.0,82.0,0.000,28.2,1.282,50.0
2,1.0,139.0,46.0,83.000,28.7,0.654,22.0
3,0.0,161.0,50.0,0.000,21.9,0.254,65.0
4,6.0,134.0,80.0,318.125,46.2,0.238,46.0
...,...,...,...,...,...,...,...
609,5.0,139.0,64.0,140.000,28.6,0.411,26.0
610,1.0,96.0,122.0,0.000,22.4,0.207,27.0
611,10.0,101.0,86.0,0.000,45.6,1.136,38.0
612,0.0,141.0,0.0,0.000,42.4,0.205,29.0


- **RANDOM FOREST**

In [65]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 60 trees, seeded with random_state=42.
model = RandomForestClassifier(
    n_estimators=60,
    random_state=42,
)
model.fit(X_train, y_train)

In [66]:
# Predict the labels of the test features with the baseline model.
y_pred = model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [67]:
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report

# Score the baseline predictions against the true test labels.
# NOTE: MSE is a regression metric; with 0/1 labels it equals the
# misclassification rate (1 - accuracy), so it adds no extra information here.
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
# Interpolate the report into one string: passing it as a second print()
# argument inserts a separator space right after "\n", which shifts the
# report's header row one column out of alignment with the rows below it.
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

MSE: 0.22727272727272727
Accuracy score: 0.7727272727272727
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.79      0.82        99
           1       0.66      0.75      0.70        55

    accuracy                           0.77       154
   macro avg       0.75      0.77      0.76       154
weighted avg       0.78      0.77      0.78       154



In [68]:
from pickle import dump

# Persist the fitted baseline model to disk with pickle.
file_path = os.path.join("../models", "random-forest.sav")
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes; a bare open(...) passed to dump() is never closed.
with open(file_path, "wb") as model_file:
    dump(model, model_file)

- **HYPERPARAMETER TUNING (GRID SEARCH)**

In [69]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid search tries every combination.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", None],
)

# Base estimator whose hyperparameters are tuned (same seed as the baseline).
optimized_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy, with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=optimized_model,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [70]:
# Keep a handle on the estimator the grid search reports as its best one.
best_optimized_model = grid_search.best_estimator_

In [71]:
# Predict the labels of the test features with the tuned model.
optimized_y_pred = best_optimized_model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
optimized_y_pred

array([1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0])

In [72]:
# Score the tuned model's predictions against the true test labels.
# NOTE: MSE is a regression metric; with 0/1 labels it equals the
# misclassification rate (1 - accuracy), so it adds no extra information here.
print(f"MSE: {mean_squared_error(y_test, optimized_y_pred)}")
print(f"Accuracy score: {accuracy_score(y_test, optimized_y_pred)}")
# Interpolate the report into one string: passing it as a second print()
# argument inserts a separator space right after "\n", which shifts the
# report's header row one column out of alignment with the rows below it.
print(f"Classification Report:\n{classification_report(y_test, optimized_y_pred)}")

MSE: 0.23376623376623376
Accuracy score: 0.7662337662337663
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



In [73]:
# Persist the tuned model to disk with pickle.
file_path = os.path.join("../models", "random-forest-optimized-42.sav")
# Use a context manager so the file handle is flushed and closed as soon as
# the dump finishes; a bare open(...) passed to dump() is never closed.
with open(file_path, "wb") as model_file:
    dump(best_optimized_model, model_file)