### Importing libraries

In [28]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
print("Libraries imported")

Libraries imported


### Loading preprocessed, feature-selected data

In [29]:
# Load the dataset; the "y" column is the target, every other column is a feature.
df = pd.read_csv("data/feature_selection.csv")
X = df.drop(columns="y")

# Binarise the target: strictly positive values become 1, everything else 0.
# The vectorised comparison replaces a row-by-row `apply` with the same result.
y = (df["y"] > 0).astype(int)

# Hold out 20% of the rows. `stratify=y` keeps the class ratio identical in the
# train and test partitions, which matters for the ROC AUC-based tuning below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30, stratify=y
)
print("Feature selection done")

Feature selection done


### Grid search CV

In [30]:
# Search space for the random forest. It is also reused by the randomized
# search further down, so keep the name `param_grid` stable.
param_grid = {
    'n_estimators': [50, 100, 200, 250],      # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 6, 10],       # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6],         # Minimum number of samples required at a leaf node
    'max_features': ['sqrt', 'log2'],         # Number of features to consider when looking for the best split
}

# NOTE: the grid above has 4 * 4 * 4 * 4 * 2 = 512 candidates; with 5-fold CV
# that is 2560 forest fits, so an exhaustive search is slow.
# `verbose=1` makes scikit-learn report the number of candidates and fits up
# front, so the cost is visible before deciding to wait or interrupt.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)


print("Starting GridSearchCV...")
# Tune on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)


print("\nGridSearchCV finished.")
print("Best Hyperparameters found:", grid_search.best_params_)
print(f"Best cross-validation AUC score: {grid_search.best_score_:.4f}")


# Forest refitted on the whole training split with the best parameters.
best_rf_model = grid_search.best_estimator_

Starting GridSearchCV...


KeyboardInterrupt: 

### Random Search CV

In [None]:


# Randomized alternative to the exhaustive grid search: sample 50 of the
# candidate settings in `param_grid` (defined in the grid search cell)
# instead of trying every combination.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=50, # Number of parameter settings that are sampled
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=2
)


print("\nStarting RandomizedSearchCV...")
# Tune on the training split only. Fitting on the full (X, y) would let the
# held-out test rows influence hyperparameter selection, and would be
# inconsistent with the grid search, which also uses (X_train, y_train).
random_search.fit(X_train, y_train)


print("\nRandomizedSearchCV finished.")
print("Best Hyperparameters found:", random_search.best_params_)
print(f"Best cross-validation AUC score: {random_search.best_score_:.4f}")

# Forest refitted on the whole training split with the best sampled parameters.
best_rf_model_random = random_search.best_estimator_

## Comparison between the baseline model and the hyperparameter-tuned one

In [31]:
def _cv_accuracy_and_auc(model):
    """Return (mean accuracy, mean ROC AUC) of `model` under 5-fold CV on (X, y).

    Both metrics are computed in a single `cross_validate` run, so each fold is
    fitted once instead of once per metric.
    """
    scores = cross_validate(model, X, y, cv=5, scoring=("accuracy", "roc_auc"))
    return scores["test_accuracy"].mean(), scores["test_roc_auc"].mean()


# Hyperparameters are hard-coded here — presumably the best setting reported by
# the search cells above; TODO confirm against `random_search.best_params_`.
best_rf_model = RandomForestClassifier(
    random_state=30,
    n_estimators=250,
    max_depth=None,
    min_samples_split=6,
    min_samples_leaf=6,
    max_features='sqrt',
)
accuracy_optimized, auc_optimized = _cv_accuracy_and_auc(best_rf_model)

# Baseline: a random forest with scikit-learn's default hyperparameters.
baseline_rf = RandomForestClassifier(random_state=30)
baseline_accuracy, baseline_auc = _cv_accuracy_and_auc(baseline_rf)

print("\n--- Performance Comparison ---")
print(f"Baseline Model -> Accuracy: {baseline_accuracy:.4f}, AUC: {baseline_auc:.4f}")
print(f"Optimized Model -> Accuracy: {accuracy_optimized:.4f}, AUC: {auc_optimized:.4f}")

# Relative (not absolute) change in AUC, expressed as a percentage of the baseline.
improvement_auc = ((auc_optimized - baseline_auc) / baseline_auc) * 100
print(f"\nImprovement in AUC: {improvement_auc:.2f}%")


--- Performance Comparison ---
Baseline Model -> Accuracy: 0.7976, AUC: 0.8813
Optimized Model -> Accuracy: 0.8342, AUC: 0.9036

Improvement in AUC: 2.53%


### Saving the results to a txt file

In [34]:
# Rebuild the tuned forest (same hyperparameters as in the comparison cell).
best_rf_model = RandomForestClassifier(
    random_state=30,
    n_estimators=250,
    max_depth=None,
    min_samples_split=6,
    min_samples_leaf=6,
    max_features='sqrt',
)

# Fit on the training split and report on the held-out test split. Predicting
# on the very rows the model was fitted on would write training-set scores to
# the file, which overstate how well the model generalises.
best_rf_model.fit(X_train, y_train)
y_test_pred = best_rf_model.predict(X_test)

with open("results/evaluation_metrics.txt", 'w') as f:
    f.write(classification_report(y_test, y_test_pred))