In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

mon_features_modified = pd.read_csv('/content/drive/MyDrive/ML_Project/mon_features_modified.csv')
mon_labels = pd.read_csv('/content/drive/MyDrive/ML_Project/mon_labels.csv')

unmon_features_modified = pd.read_csv('/content/drive/MyDrive/ML_Project/unmon_features_modified.csv')
unmon_labels = pd.read_csv('/content/drive/MyDrive/ML_Project/unmon_labels.csv')

In [3]:
import numpy as np

features_modified = pd.concat([mon_features_modified, unmon_features_modified], ignore_index=True)

mon_labels = np.ones(len(mon_features_modified))
unmon_labels = -np.ones(len(unmon_features_modified))
labels = np.concatenate([mon_labels, unmon_labels])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def scaler_samples(train_X,test_X):
  scaler = StandardScaler()
  train_X = scaler.fit_transform(train_X)
  test_X = scaler.transform(test_X)

X_train, X_test, y_train, y_test = train_test_split(
    features_modified, labels, test_size=0.2, random_state=1
)

scaler_samples(X_train, X_test)

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf_binary = RandomForestClassifier()

rf_binary.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = rf_binary.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix\n {conf_matrix}")

Accuracy: 0.8494827586206897
Confusion Matrix
 [[1364  568]
 [ 305 3563]]


In [10]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, precision_recall_curve, auc

fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
roc_auc = auc(fpr, tpr)
precision_values, recall_values, thresholds_pr = precision_recall_curve(y_test, y_pred, pos_label=1)
pr_auc = auc(recall_values, precision_values)

print(f"True Positive Rate (TPR): {tpr[1]}")
print(f"False Positive Rate (FPR): {fpr[1]}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")

True Positive Rate (TPR): 0.921147880041365
False Positive Rate (FPR): 0.2939958592132505
Precision: 0.8625030259017187
Recall: 0.921147880041365
ROC AUC: 0.8135760104140572
Precision-Recall AUC: 0.9181185564198178


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [2500, 3000, 3500, 4000],
    'max_depth': [200, 250, 300, 350, 400],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'max_leaf_nodes' : [2000, 2500, 3000],
    'class_weight' : ['balanced']
}

grid_search = GridSearchCV(estimator=rf_binary, param_grid=param_grid, cv=5)

grid_search.fit(X_test, y_test)

In [None]:
crves = grid_search.cv_results_
for mean_score, params in zip(crves["mean_test_score"], crves["params"]):
    print(mean_score, params)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)
print("Best Model:", best_model)
y_pred = best_model.predict(X_test)

In [None]:
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc}")

conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix\n {conf_matrix}")