In [1]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearnex import patch_sklearn

# Patching has to happen BEFORE any scikit-learn name is imported:
# patch_sklearn() swaps the estimator classes inside the sklearn modules, so a
# name bound earlier (e.g. `from sklearn.svm import SVC`) would keep pointing
# at the stock, un-accelerated implementation.
patch_sklearn()

from sklearn.metrics import (  # noqa: E402  (must follow patch_sklearn())
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.svm import SVC  # noqa: E402  (must follow patch_sklearn())

# Put the parent directory on the import path so that the `src.*` package
# imported in the following cell can be resolved from this notebook.
sys.path.append("..")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
from src.modules.config.ml_dataset import smiles_to_fingerprint

In [3]:
# Locations of the processed train/test splits, anchored at the directory the
# kernel was started from.
current_dir = os.getcwd()
TRAIN, TEST = (
    os.path.join(current_dir, relative_csv)
    for relative_csv in (
        "../data/processed/train.csv",
        "../data/processed/test.csv",
    )
)

In [4]:
# Read both processed splits into DataFrames (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

In [5]:
# Build the model inputs for each split: the "smiles" column is passed through
# smiles_to_fingerprint, and the "HIV_active" column is used as the label array.
X_train, y_train = (
    smiles_to_fingerprint(train_data["smiles"].tolist()),
    train_data["HIV_active"].values,
)
X_test, y_test = (
    smiles_to_fingerprint(test_data["smiles"].tolist()),
    test_data["HIV_active"].values,
)

# The raw frames are not referenced again below; drop the names to release
# memory. NOTE: re-running this cell therefore requires re-running the cell
# that loads the CSV files first.
del train_data
del test_data



In [None]:
# probability=True makes SVC fit an additional, internally cross-validated
# calibration step whose data shuffling is random. Pin the seed so that the
# predicted probabilities -- and every curve/threshold derived from them --
# are reproducible across kernel restarts.
RANDOM_STATE = 42

svm_model = SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE)
svm_model.fit(X_train, y_train)

# Column 1 of predict_proba corresponds to svm_model.classes_[1]
# (presumably the "active" class when labels are 0/1 -- TODO confirm).
y_test_proba_svm = svm_model.predict_proba(X_test)[:, 1]

In [None]:
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba_svm)

# F1 = 2PR / (P + R). At thresholds where precision and recall are both 0 the
# naive division is 0/0 -> NaN (plus a RuntimeWarning), and a NaN in f1_scores
# would later be picked by argmax. Define F1 as 0 at those points instead.
pr_sum = precision + recall
f1_scores = 2 * precision * recall
has_signal = pr_sum > 0
f1_scores[has_signal] /= pr_sum[has_signal]

# precision / recall / f1_scores hold one more element than thresholds_pr, so
# the last element is dropped to align each curve with the threshold axis.
plt.figure(figsize=(8, 6))
plt.plot(thresholds_pr, precision[:-1], label="Precision", color="blue")
plt.plot(thresholds_pr, recall[:-1], label="Recall", color="green")
plt.plot(thresholds_pr, f1_scores[:-1], label="F1-score", color="red")
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision-Recall-F1-score vs Threshold")
plt.legend(loc="best")
plt.grid()
plt.show()

In [None]:
# ROC curve of the SVM probability scores on the test split, drawn against
# the diagonal reference line.
fpr, tpr, thresholds_roc = roc_curve(y_test, y_test_proba_svm)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="darkorange", label="ROC Curve")
ax.plot([0, 1], [0, 1], "k--", label="Random Guess")
ax.set_xlabel("False Positive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
ax.set_title("ROC Curve")
ax.legend(loc="best")
ax.grid()
plt.show()

In [None]:
# f1_scores has one more element than thresholds_pr (the plots above slice it
# with [:-1]); only search the entries that actually have a threshold so the
# chosen index can never run past the end of thresholds_pr.
f1_at_thresholds = pd.Series(f1_scores[:-1])

# Series.idxmax skips NaN values (F1 is 0/0 where precision + recall == 0),
# whereas ndarray.argmax would return the position of the first NaN.
optimal_idx = int(f1_at_thresholds.idxmax())
optimal_threshold = thresholds_pr[optimal_idx]

print(f"Optimal Threshold: {optimal_threshold}")

# NOTE(review): the threshold is tuned on the same test labels that are used
# for the final report, so the reported metrics are optimistic -- consider
# selecting it on a held-out validation split instead.

# Make predictions using the new threshold
y_test_pred_optimal = (y_test_proba_svm >= optimal_threshold).astype(int)

In [None]:
# Summarise test-set performance of the thresholded SVM predictions:
# per-class metrics first, then the raw confusion matrix.
svm_report = classification_report(y_test, y_test_pred_optimal)
print("SVM Test Classification Report (Optimal Threshold):", svm_report, sep="\n")

svm_confusion = confusion_matrix(y_test, y_test_pred_optimal)
print("Confusion Matrix:", svm_confusion, sep="\n")