In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, matthews_corrcoef, confusion_matrix)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
# Load the breast cancer dataset that ships with scikit-learn (no CSV upload needed);
# uploaded test data will be tried later.
brest_cancer_data = load_breast_cancer()
X = pd.DataFrame(brest_cancer_data.data, columns=brest_cancer_data.feature_names)
y = pd.Series(brest_cancer_data.target)

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the test
# set's mean/variance into training and make the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
# The fitted `scaler` must also be applied to any data scored later with these models.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset on the training-set scale; kept so any later cell that uses X_scaled still works.
X_scaled = scaler.transform(X)

training_size = X_train.shape[0]
test_size = X_test.shape[0]
print(f"Training size: {training_size}, Test size: {test_size}")


In [None]:
# report the evaluation metrics of a model from y_true, y_pred and y_proba
def evaluate_model(y_true, y_pred, y_proba):
    """Print and return six evaluation metrics for one set of predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels; used for accuracy, precision, recall, F1 and MCC.
        y_proba: scores handed to ``roc_auc_score`` (the callers in this notebook
            pass the second column of ``predict_proba``).

    Returns:
        The tuple ``(accuracy, precision, recall, f1, roc_auc, mcc)``.
    """
    # All metrics are computed before anything is printed.
    accuracy, precision, recall, f1 = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_true, y_proba)
    mcc = matthews_corrcoef(y_true, y_pred)

    report = [
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        f"ROC AUC Score: {roc_auc:.4f}",
        f"Matthews Correlation Coefficient: {mcc:.4f}",
    ]
    print(*report, sep="\n")

    return accuracy, precision, recall, f1, roc_auc, mcc

# draw the confusion matrix as an annotated heatmap
def plot_confusion_matrix(y_true, y_pred):
    """Display a heatmap of the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    counts = confusion_matrix(y_true, y_pred)

    # Explicit Axes handle instead of the implicit "current axes" of pyplot.
    _, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

In [None]:
# Baseline: logistic regression with default settings; further models follow in later cells.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression().fit(X_train, y_train)
y_lr_pred = lr_model.predict(X_test)
# second column of predict_proba = probability of the positive class
y_lr_prob = lr_model.predict_proba(X_test)[:, 1]

# Peek at the first ten probabilities and the corresponding predicted labels.
print(y_lr_prob[:10])
print(y_lr_pred[:10])


In [None]:
# Persist the fitted logistic regression model. The target directory is created first:
# on a fresh checkout "model/" does not exist and joblib.dump would raise FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(lr_model, "model/lr_model.pkl")

In [None]:
# Logistic regression: print the metric report for the held-out test set ...
evaluate_model(y_test, y_lr_pred, y_lr_prob)
# ... and show its confusion matrix.
plot_confusion_matrix(y_test, y_lr_pred)

In [None]:
# Naive Bayes classifier, Gaussian variant.
# NOTE(review): the original plan was to also compare against the multinomial variant; the
# features here are standardized (so they contain negative values) — confirm MultinomialNB
# is applicable before adding it.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
nb_model = GaussianNB().fit(X_train, y_train)
y_nb_pred = nb_model.predict(X_test)
# second column of predict_proba = probability of the positive class
y_nb_proba = nb_model.predict_proba(X_test)[:, 1]

# metric report on the test set
evaluate_model(y_test, y_nb_pred, y_nb_proba)

# confusion matrix on the test set
plot_confusion_matrix(y_test, y_nb_pred)


In [None]:
# Persist the fitted naive Bayes model. The target directory is created first:
# on a fresh checkout "model/" does not exist and joblib.dump would raise FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(nb_model, "model/nb_model.pkl")

In [None]:
# Decision tree classifier, limited to depth 4 and seeded for reproducibility.
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training are chained.
dcf = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)
y_dcf_pred = dcf.predict(X_test)
# binary classification: the second column is the probability of the positive class
y_dcf_proba = dcf.predict_proba(X_test)[:, 1]

# metric report on the test set
evaluate_model(y_test, y_dcf_pred, y_dcf_proba)

# confusion matrix on the test set
plot_confusion_matrix(y_test, y_dcf_pred)

In [None]:
# Persist the fitted decision tree. The target directory is created first:
# on a fresh checkout "model/" does not exist and joblib.dump would raise FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(dcf, "model/dcf_model.pkl")

In [None]:
# k-nearest neighbours (instance-based learning) with k = 5.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_knn_pred = knn_model.predict(X_test)
# second column of predict_proba = probability of the positive class
y_knn_proba = knn_model.predict_proba(X_test)[:, 1]

# metric report on the test set
evaluate_model(y_test, y_knn_pred, y_knn_proba)

# confusion matrix on the test set
plot_confusion_matrix(y_test, y_knn_pred)

In [None]:
# Persist the fitted KNN model. The target directory is created first:
# on a fresh checkout "model/" does not exist and joblib.dump would raise FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(knn_model, "model/knn_model.pkl")

In [None]:
# Random forest: 100 trees of depth <= 4.
from sklearn.ensemble import RandomForestClassifier

# max_samples=.25  -> each tree is trained on a sample of 25% of the training rows
# max_features="sqrt" -> sqrt(n_features) candidate features per split (sqrt(30) ~ 5 here)
# The fixed random_state keeps the forest reproducible.
rfc_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    max_features="sqrt",
    max_samples=.25,
    n_jobs=3,
    random_state=42,
)
rfc_model.fit(X_train, y_train)
y_rfc_pred = rfc_model.predict(X_test)
# second column of predict_proba = probability of the positive class
y_rfc_proba = rfc_model.predict_proba(X_test)[:, 1]

# metric report on the test set
evaluate_model(y_test, y_rfc_pred, y_rfc_proba)

# confusion matrix on the test set
plot_confusion_matrix(y_test, y_rfc_pred)

In [None]:
# Save the fitted random forest as a pickle file. The target directory is created first:
# on a fresh checkout "model/" does not exist and joblib.dump would raise FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(rfc_model, "model/rfc_model.pkl")

In [None]:
# TODO: add the last model, XGBoost.
# scikit-learn does not ship XGBoost itself; it comes from the separate `xgboost` package.
# from xgboost import XGBClassifier  # (import still to be completed)