DATA PREDICTION MODEL

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the cleaned dataset from 'cleaned_data.csv' (relative to the working directory)
# into a DataFrame.
clean_data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [None]:
# Choose the label column; every other column of clean_data is used as a feature.
target = 'Dx:Cancer'  # alternative label column: 'Biopsy'
Y = clean_data[target]
X = clean_data.drop(columns=[target])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_training, X_testing, Y_training, Y_testing = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Insertion order determines the order in which they are iterated.
models = {}
models["Logistics Regression"] = LogisticRegression()
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["XGBoost"] = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
models["SVM"] = SVC()
models["KNN"] = KNeighborsClassifier()

In [None]:
# Train each candidate model on the training split and print its metrics
# on the held-out test split.
result = []
fitted_models = {}

for name, model in models.items():
    print(f"Training {name}:")
    model.fit(X_training, Y_training)
    # Keep a handle on the fitted estimator so it can be reused after the loop.
    fitted_models[name] = model

    Y_prediction = model.predict(X_testing)
    # The estimator method is `predict_proba` (there is no `predict_probability`);
    # calling the wrong name raised AttributeError for every model that passed the
    # hasattr guard. Column 1 is the probability of the second entry in
    # `classes_` (presumably the positive label -- confirm the label encoding).
    # Models that do not expose `predict_proba` get None.
    Y_probability = model.predict_proba(X_testing)[:, 1] if hasattr(model, "predict_proba") else None

    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy_score(Y_testing, Y_prediction):.4f}")
    print(f"Precision: {precision_score(Y_testing, Y_prediction):.4f}")
    print(f"Recall: {recall_score(Y_testing, Y_prediction):.4f}")
    print(f"F1 Score: {f1_score(Y_testing, Y_prediction):.4f}")

    # Per-class precision/recall/F1 breakdown for the same predictions.
    print(classification_report(Y_testing, Y_prediction))