# Breast Cancer Classification - Logistic Reegression
Develop a machine learning model to classify breast cancer using real clinical data

1. Import dependecies

In [2]:
import numpy as np # Used to make numpy arrays
import pandas as pd # Used to create pandas df (structured tables)
import sklearn.datasets
from sklearn.model_selection import train_test_split # Split our ds into training and testing data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # Used to evaluate model

2. Data collection

In [3]:
# Loading data from sklearn
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()
breast_cancer_dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [4]:
# Loading data to a dataframe
df = pd.DataFrame(breast_cancer_dataset.data, columns = breast_cancer_dataset.feature_names) # Create a table (DataFrame) with the breast cancer data and feature names as columns.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Adding the 'target' column to the data frame
df['label'] = breast_cancer_dataset.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


3. Data Pre-Processing and analysis

In [6]:
# Print number of rows in the dataset - rows (569 people) and columns (30
# columns and the last one being the label)
df.shape

(569, 31)

In [7]:
# Get information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [8]:
# Statistical measures about the dataset
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


In [9]:
# Checking the distribuition of target variable - labels
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,357
0,212


1 --> Benign

0 --> Malignant

In [10]:
# Group dataset based on labels.
df.groupby('label').mean()
# This code groups the data by the 'label' column and calculates the mean of each feature for each group. This is useful for understanding the average characteristics of benign and malignant tumors.

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


4. Splitting data into training and testing data

In [11]:
# Separate features (ccolumns) and target
X = df.drop(columns='label', axis=1)
Y = df['label']
print(X)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [12]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: label, Length: 569, dtype: int64


In [13]:
# Splitting data into training data and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2) # Split the dataset into 80% training data and 20% testing data.
print(X.shape, X_train.shape, X_test.shape)

(569, 30) (455, 30) (114, 30)


5. Model training

In [14]:
# Logistic regression model
model = LogisticRegression()

# Traning our logistic regression model using trained data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6. Model Evaluation

In [15]:
# Accuracy on training data
X_train_prediction = model.predict(X_train) # Use the model to make predictions on the training data (X_train - 80%). This gives predicted labels (0 or 1) for each training example.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction) # Compare the predicted labels with the actual labels (Y_train - 20%) and calculate how many predictions the model got right
print('Accuracy on training data: ', round(training_data_accuracy, 3))

Accuracy on training data:  0.949


In [16]:
# Accuracy on testing data
X_test_prediction = model.predict(X_test) # # Use the model to make predictions on the testing data (X_test). These are new examples the model hasn't seen before
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction) # # Compare the predicted labels with the actual labels (Y_test) and calculate how accurate the model is on unseen data
print('Accuracy on testing data: ', round(testing_data_accuracy, 3))

Accuracy on testing data:  0.93


### Evaluating Model Performance

After training a machine learning model, it's crucial to evaluate its performance to see how well it generalizes to new, unseen data. Here's what to consider:

*   **Accuracy Score:** This is the most basic metric and represents the proportion of correctly predicted instances (both positive and negative) out of the total number of instances. A higher accuracy score generally indicates a better-performing model. However, what constitutes a "good" accuracy score depends heavily on the specific problem and dataset. For example, in medical diagnosis, you would typically aim for a very high accuracy. In other domains, a lower accuracy might be acceptable.

*   **Overfitting:** This occurs when a model learns the training data too well, including the noise and outliers. An overfitted model will have a very high accuracy on the training data but perform poorly on the testing data (unseen data). This is because the model has essentially memorized the training examples instead of learning the underlying patterns. Think of it as a student who memorizes answers for a test but doesn't understand the concepts – they'll do well on that specific test but fail if the questions are slightly different.

*   **Underfitting:** This occurs when a model is too simple to capture the underlying patterns in the data. An underfitted model will have low accuracy on both the training and testing data. This is like a student who didn't study enough and doesn't understand the material at all – they'll perform poorly on any test.

**How to identify Overfitting and Underfitting:**

You can often detect overfitting and underfitting by comparing the accuracy scores on your training and testing datasets:

*   **Overfitting:** High training accuracy, significantly lower testing accuracy.
*   **Underfitting:** Low training accuracy, and similarly low testing accuracy.
*   **Good Fit:** High training accuracy, and a similar high testing accuracy. The scores might not be exactly the same, but they should be close.

7. Building a predictive system

In [19]:
# 1. You start with a tuple (one data point with 30 features)
input_data = (14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,0.06171,0.3645,1.492,2.888,29.84,0.007256,0.02678,0.02071,0.01626,0.0208,0.005304,15.3,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321)

# 2. Convert it to a NumPy array so it can be reshaped and passed to the model
input_data_as_numpy_array = np.asarray(input_data)

# 3. Reshape it from shape (30,) → (1, 30) because model expects a 2D array:
#    - 1 row = 1 sample/ row/ patient
#    - 30 columns = 30 features/ x
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# 4. Predict using the trained model
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The tumor is Malignant')
else:
  print('The tumor is Benign')

[1]
The tumor is Benign



X does not have valid feature names, but LogisticRegression was fitted with feature names



### Step 8: 3D Visualization of Tumor Shape (Feature-Based)

**Simulated tumor geometry based on patient features:**

This is a synthetic 3D tumor generated entirely from the patient’s real numerical data — not a medical scan but a mathematically constructed shape that visually represents tumor characteristics.

- Built from real diagnostic features such as radius, texture, concavity, and more.  
- Each shape is unique — different patients yield different geometries.  
- Color reflects the model’s prediction: cool tones (blue, purple) for malignant, warm tones (yellow, orange) for benign.  
- Fully interactive — use your mouse to zoom, rotate, and explore the structure.  

This visualization brings an intuitive, visual layer to your machine learning model, enhancing demos, educational content or AI-driven medical tools.


In [18]:
import numpy as np
import plotly.graph_objects as go

# --- Patient input_data (from earlier step)
input_data = (14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,0.06171,0.3645,1.492,2.888,29.84,0.007256,0.02678,0.02071,0.01626,0.0208,0.005304,15.3,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321)

# --- Simulated prediction label (0 = malignant, 1 = benign)
prediction = model.predict(np.asarray(input_data).reshape(1, -1))

# --- Shape modifiers based on selected features
r = input_data[0] / 5
t = input_data[1] / 10
p = input_data[2] / 100
c = input_data[6] * 2
bump = (input_data[9] + input_data[5]) * 0.5

# --- Spherical grid
theta, phi = np.mgrid[0:np.pi:100j, 0:2*np.pi:100j]

# --- Tumor shape with deformation
x = r * np.sin(theta) * np.cos(phi) * (1 + c * np.sin(5 * phi))
y = r * np.sin(theta) * np.sin(phi) * p
z = r * np.cos(theta) * t

# --- Add surface “bumpiness”
x *= (1 + bump * np.cos(8 * phi) * np.sin(5 * theta))
z *= (1 + bump * np.sin(6 * phi))

# --- Simulated surface intensity (for colorscale)
intensity = c * np.sin(5 * phi) + bump * np.cos(4 * theta)

# --- Create interactive 3D surface
fig = go.Figure(data=[go.Surface(
    x=x, y=y, z=z,
    surfacecolor=intensity,
    colorscale='Inferno',
    cmin=-1, cmax=1,
    showscale=False
)])

# --- Title based on prediction
label = "Malignant" if prediction[0] == 0 else "Benign"

# --- Style and layout
fig.update_layout(
    title={
        'text': f"Simulated Tumor Geometry – {label}",
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=20)
    },
    autosize=True,
    margin=dict(l=0, r=0, b=0, t=60),
    scene=dict(
        xaxis=dict(backgroundcolor='white', gridcolor='lightgray', showspikes=False),
        yaxis=dict(backgroundcolor='white', gridcolor='lightgray', showspikes=False),
        zaxis=dict(backgroundcolor='white', gridcolor='lightgray', showspikes=False),
        xaxis_title='',
        yaxis_title='',
        zaxis_title=''
    )
)

fig.show()

