DATA PREDICTION MODEL

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the already-cleaned dataset (CSV next to the notebook) into a DataFrame.
clean_data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [None]:
# Label column to predict; every remaining column is kept as a feature.
target = 'Dx:Cancer'
Y = clean_data[target]
X = clean_data.drop(target, axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified - if the target classes are
# imbalanced, consider passing stratify=Y. Confirm against the data.
X_training, X_testing, Y_training, Y_testing = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
scaler = StandardScaler().fit(X_training)
X_training_scaled = scaler.transform(X_training)
X_testing_scaled = scaler.transform(X_testing)

In [None]:
# Candidate classifiers, keyed by the label used when printing their results.
# Insertion order is the order in which they are trained later on.
models = {}
models["Logistics Regression"] = LogisticRegression()
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["XGBoost"] = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
models["SVM"] = SVC()

Evaluation

In [None]:
#train + evaluate models
# For every candidate classifier: fit on the scaled training data, predict on
# the scaled test data, print the headline metrics and keep them in `result`
# so the models can be compared side by side after the loop.
result = []          # one record (dict) of test-set metrics per model
fitted_models = {}   # label -> fitted estimator, reused by later cells

for name, model in models.items():
    print(f"Training {name}:")
    model.fit(X_training_scaled, Y_training)
    fitted_models[name] = model

    Y_prediction = model.predict(X_testing_scaled)
    # Not every estimator exposes predict_proba; fall back to None when it is
    # missing so downstream code can test for availability.
    if hasattr(model, "predict_proba"):
        Y_probability = model.predict_proba(X_testing_scaled)[:, 1]
    else:
        Y_probability = None

    # Compute each metric once and reuse it for both printing and the summary.
    model_metrics = {
        "Accuracy": accuracy_score(Y_testing, Y_prediction),
        "Precision": precision_score(Y_testing, Y_prediction),
        "Recall": recall_score(Y_testing, Y_prediction),
        "F1 Score": f1_score(Y_testing, Y_prediction),
    }
    result.append({"Model": name, **model_metrics})

    print(f"Results for {name}:")
    for metric_name, metric_value in model_metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

    print(classification_report(Y_testing, Y_prediction))

# Side-by-side comparison of all models (records built in the loop above).
results_summary = pd.DataFrame(result).set_index("Model")
results_summary

In [None]:
#CONFUSION MATRIX
from sklearn.metrics import confusion_matrix

# Draw one annotated heatmap per fitted model. Predictions are recomputed from
# `fitted_models` instead of reusing the loop variable left over from training,
# which would only ever show the last model that was trained.
# NOTE(review): the tick labels assume a binary target whose lower-sorted class
# is "negative" - confirm against the encoding of the target column.
class_names = ['Cancer negative', 'Cancer positive']

for model_name, fitted_model in fitted_models.items():
    model_predictions = fitted_model.predict(X_testing_scaled)
    # Pin the label order to the classes seen during training so the matrix
    # stays aligned with the tick labels even if the test split lacks a class.
    confusion_mtx = confusion_matrix(
        Y_testing, model_predictions, labels=fitted_model.classes_
    )

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        confusion_mtx,
        annot=True,
        fmt='d',  # integer cell annotations (the keyword is `fmt`, not `format`)
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set(
        xlabel="Predicted label",
        ylabel="True label",
        title=f"Confusion Matrix: {model_name}",
    )
    plt.show()

In [None]:
#ROC Curve
from sklearn.metrics import roc_curve, auc

# Plot the ROC curve of every fitted model on a single axes. Scores are
# recomputed per model rather than taken from the variable left over from the
# training loop, which only reflects the last model trained (and is None when
# that model has no predict_proba, so nothing would be plotted).
fig, ax = plt.subplots(figsize=(8, 6))

for model_name, fitted_model in fitted_models.items():
    if hasattr(fitted_model, "predict_proba"):
        # Probability of the second (positive) class.
        model_scores = fitted_model.predict_proba(X_testing_scaled)[:, 1]
    elif hasattr(fitted_model, "decision_function"):
        # roc_curve also accepts non-thresholded decision values, which covers
        # estimators that do not expose probabilities.
        model_scores = fitted_model.decision_function(X_testing_scaled)
    else:
        # No continuous score available for this estimator; skip it.
        continue

    fpr, tpr, _ = roc_curve(Y_testing, model_scores)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.2f})")

ax.plot([0, 1], [0, 1], 'k--', label="Random chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()