In [1]:
import os
import sys

# Report the working directory and the interpreter in use, so it is obvious
# which environment this notebook is running in.
print(f"current dir: {os.getcwd()}")
print(f"python executable path: {sys.executable}")

current dir: /home/felipevzps/breast-cancer-prediction/notebooks
python executable path: /home/felipevzps/miniconda3/envs/breast-cancer-prediction/bin/python


In [None]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Make the project's "src" directory importable.
# os.path.abspath('') resolves to the current working directory, so its parent
# is taken as the base path. This assumes the notebook runs from a direct
# subdirectory of the project root (e.g. "notebooks/").
notebook_dir = os.path.dirname(os.path.abspath(''))
src_dir = os.path.join(notebook_dir, 'src')

# Insert at the front of sys.path (rather than appending) so the local
# `transformers` module cannot be shadowed by an installed package that has
# the same name. The membership check keeps re-runs of this cell from adding
# duplicate entries.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# import custom functions to transform our data
# NOTE(review): these names are not called directly in this notebook;
# presumably the saved pipeline references them when it is loaded - confirm.
from transformers import replace_question_with_nan, convert_to_numeric

In [None]:
# Read the dataset (the path is resolved against the current working
# directory) and preview its first rows.
data_path = "../data/breast_cancer_bd.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# Re-encode the raw "Class" labels as a binary target: 2 -> 0 and 4 -> 1.
# Any label that is not a key of `mapping` becomes NaN in the new column.
mapping = {2: 0, 4: 1}
target_class_mapped = df["Class"].map(mapping)
df["Target_Class_Mapped"] = target_class_mapped

In [None]:
# Remove the "Sample code number" column from the working frame.
df = df.drop(columns=["Sample code number"])

In [None]:
# Keep the rows whose mapped target is 0 (the "without cancer" group) and draw
# 10 of them; the fixed seed makes the draw reproducible.
is_without_cancer = df["Target_Class_Mapped"].eq(0)
df_without_cancer = df.loc[is_without_cancer]
random_samples_without_cancer = df_without_cancer.sample(n=10, random_state=42)
random_samples_without_cancer.head()

In [None]:
# Keep the rows whose mapped target is 1 (the "with cancer" group) and draw
# 10 of them; the fixed seed makes the draw reproducible.
is_with_cancer = df["Target_Class_Mapped"].eq(1)
df_with_cancer = df.loc[is_with_cancer]
random_samples_with_cancer = df_with_cancer.sample(n=10, random_state=42)
random_samples_with_cancer.head()

In [None]:
# concatenate all "random samples" in a new dataframe

# NOTE: this is not best practice for testing a model.
# It is only a quick sanity check that the model behaves sensibly on a few
# "random" rows drawn from the dataset loaded above.
# To know whether the model will work in production, it is necessary to
# evaluate it on data that was NEVER SEEN BEFORE by the model
# (for example, newly generated synthetic data).

# concatenate samples, then shuffle them (the fixed seed keeps the order
# reproducible) and renumber the rows from 0
df_test_samples = pd.concat([random_samples_without_cancer, random_samples_with_cancer])
df_test_samples = df_test_samples.sample(frac=1, random_state=42).reset_index(drop=True)

# create X and y (target)
# Both label columns are removed from X: "Target_Class_Mapped" is the target,
# and "Class" is the raw label it was derived from. Leaving either one in the
# feature matrix would hand the answer to the model.
# NOTE(review): this assumes the saved pipeline was not fitted with "Class"
# as an input feature - confirm against the training code.
label_columns = ["Class", "Target_Class_Mapped"]
X_test_samples = df_test_samples.drop(columns=label_columns)
y_test_samples = df_test_samples["Target_Class_Mapped"]

In [None]:
# Load the saved pipeline, predict on the sampled rows and compare the
# predictions with the true labels.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
lr_model_path = "../model/logreg_breast_cancer_pipeline_v1.0.joblib"
lr_model = joblib.load(lr_model_path)

predictions = lr_model.predict(X_test_samples)

# Share of predictions that equal the true label, expressed as a percentage.
true_labels = y_test_samples.values
accuracy = np.mean(predictions == true_labels) * 100

print("--- Random Sample Prediction ---")
print("Prediction: ", predictions)
print("True Label: ", true_labels)
print(f"\nCorrect predictions: {accuracy:.2f}%.")