## Setup

Load libraries and seed for reproducibility.


In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os
# Seed NumPy's legacy global RNG so anything that draws from it is repeatable
# across Restart & Run All.
np.random.seed(seed=42)

## Load and Inspect Data

Load scikit-learn's built-in breast cancer dataset; all 30 features are numeric.


In [2]:
# Keep the raw Bunch around: later cells read `data.target_names` from it.
data = load_breast_cancer()

# Wrap the feature matrix and the label vector in labelled pandas containers.
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = pd.Series(data=data['target'], name='target')

# Preview the first rows and the class counts.
(X.head(n=5), y.value_counts())

(   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0        17.99         10.38          122.80     1001.0          0.11840   
 1        20.57         17.77          132.90     1326.0          0.08474   
 2        19.69         21.25          130.00     1203.0          0.10960   
 3        11.42         20.38           77.58      386.1          0.14250   
 4        20.29         14.34          135.10     1297.0          0.10030   
 
    mean compactness  mean concavity  mean concave points  mean symmetry  \
 0           0.27760          0.3001              0.14710         0.2419   
 1           0.07864          0.0869              0.07017         0.1812   
 2           0.15990          0.1974              0.12790         0.2069   
 3           0.28390          0.2414              0.10520         0.2597   
 4           0.13280          0.1980              0.10430         0.1809   
 
    mean fractal dimension  ...  worst radius  worst texture  worst perimeter 

## Basic Cleaning

Check for missing values and basic stats. (Dataset is already clean, but steps shown for completeness.)


In [3]:
# Total count of missing cells across the whole feature frame.
missing = X.isna().sum().sum()

# Summary statistics, one row per feature, for the first five features only.
summary_stats = ['mean', 'std', 'min', 'max']
desc = X.describe().loc[summary_stats].T.head()

(missing, desc)

(0,
                        mean         std        min        max
 mean radius       14.127292    3.524049    6.98100    28.1100
 mean texture      19.289649    4.301036    9.71000    39.2800
 mean perimeter    91.969033   24.298981   43.79000   188.5000
 mean area        654.889104  351.914129  143.50000  2501.0000
 mean smoothness    0.096360    0.014064    0.05263     0.1634)

## Train/Test Split

A stratified split keeps the class balance the same in the train and test sets.


In [4]:
# 80/20 hold-out split, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(X_train.shape, X_test.shape)

((455, 30), (114, 30))

## Pipelines

- Logistic Regression: `StandardScaler` followed by the classifier.
- Decision Tree: raw numeric features (tree splits do not depend on feature scale, so no scaler is applied).


In [5]:
# Every column of X is scaled; any column not listed would be dropped.
numeric_features = X.columns
numeric_transformer = Pipeline([('scaler', StandardScaler())])
preprocess = ColumnTransformer(
    [('num', numeric_transformer, numeric_features)],
    remainder='drop',
)

# Logistic regression: preprocessing step, then the classifier.
log_reg_steps = [
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
]
log_reg_clf = Pipeline(log_reg_steps)

# Decision tree: a single-step pipeline fed the unscaled features.
tree_steps = [('model', DecisionTreeClassifier(max_depth=5, random_state=42))]
tree_clf = Pipeline(tree_steps)

## Train Models


In [6]:
# Fit both pipelines on the same training split, logistic regression first.
for estimator in (log_reg_clf, tree_clf):
    estimator.fit(X_train, y_train)

'done'

'done'

## Evaluate


In [7]:
def evaluate(model, X_te, y_te, label, target_names=None):
    """Print a labelled accuracy line and classification report for one model.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator or pipeline to evaluate.
    X_te : features passed to ``model.predict``.
    y_te : true labels compared against the predictions.
    label : str
        Human-readable model name shown in the header, so that the reports
        of several models can be told apart in the cell output.
    target_names : sequence of str, optional
        Class display names for the report. Defaults to the notebook-level
        ``data.target_names`` when omitted.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if target_names is None:
        # Fall back to the dataset loaded earlier in the notebook.
        target_names = data.target_names

    preds = model.predict(X_te)
    acc = accuracy_score(y_te, preds)
    report = classification_report(y_te, preds, target_names=target_names)

    # Header identifies which model the report below belongs to.
    print(f"=== {label} (accuracy: {acc:.4f}) ===")
    print(report)
# Report on both fitted pipelines against the same held-out split.
for fitted_model, model_label in (
    (log_reg_clf, 'Logistic Regression'),
    (tree_clf, 'Decision Tree'),
):
    evaluate(fitted_model, X_test, y_test, model_label)

              precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        42
      benign       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

              precision    recall  f1-score   support

   malignant       0.87      0.93      0.90        42
      benign       0.96      0.92      0.94        72

    accuracy                           0.92       114
   macro avg       0.91      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114



## Export Models

Save trained artifacts with joblib.


In [8]:
# Make sure the output folder exists; a repeated run is not an error.
artifact_dir = 'artifacts'
os.makedirs(artifact_dir, exist_ok=True)

# Destination file for each fitted pipeline.
log_reg_path, tree_path = (
    os.path.join(artifact_dir, file_name)
    for file_name in ('log_reg_model.joblib', 'decision_tree_model.joblib')
)

# Serialize each pipeline to its file.
for fitted, destination in ((log_reg_clf, log_reg_path), (tree_clf, tree_path)):
    joblib.dump(fitted, destination)

(log_reg_path, tree_path)

('artifacts\\log_reg_model.joblib', 'artifacts\\decision_tree_model.joblib')

## Quick Inference Check

Load artifacts and predict on a few rows.


In [9]:
# Round-trip: read back the artifacts written by the export cell.
# joblib files are pickle-based, so only load files you produced yourself.
loaded_tree = joblib.load(tree_path)
loaded_log_reg = joblib.load(log_reg_path)

# Score the first three held-out rows with both reloaded pipelines.
sample = X_test.head(3)
log_preds = loaded_log_reg.predict(sample)
tree_preds = loaded_tree.predict(sample)

# Side-by-side comparison of the true labels and each model's prediction.
comparison_columns = {
    'true': y_test.head(3).to_numpy(),
    'log_reg_pred': log_preds,
    'tree_pred': tree_preds,
}
pd.DataFrame(comparison_columns)

Unnamed: 0,true,log_reg_pred,tree_pred
0,0,0,0
1,1,1,1
2,0,0,0
