# Breast Cancer Recurrence Prediction using Machine Learning

## Dataset Preparation

In [1]:
# import needed libraries
import numpy as np
import pandas as pd

from statsmodels.stats.power import TTestIndPower

from sklearn.model_selection import train_test_split

# set a fixed seed for NumPy's global random generator so that any
# NumPy-based randomness in the notebook is repeatable between runs
np.random.seed(42)

### Importing Dataset

The data is provided as two separate files:<br>
- ```breast-cancer.data```, containing the dataset 
- ```breast-cancer.names```, containing relevant information about the dataset

In [2]:
# load the raw dataset; the file is read without a header row,
# so the column labels are supplied explicitly
column_names = [
    'class',
    'age',
    'menopause',
    'tumour_size',
    'inv_nodes',
    'node_caps',
    'deg_malig',
    'breast',
    'breast_quad',
    'irrad',
]

data = pd.read_csv(
    './dataset/breast-cancer.data',
    header=None,
    names=column_names,
)


### Dataset Size and Memory Usage

Size is computed in *bytes* and then converted to *megabytes*.

In [3]:
# number of bytes in one megabyte (1024 * 1024)
bytes_per_mb = 1024 * 1024

# per-column memory usage summed over the frame; deep=True also measures
# the contents of object (string) columns, not only the references
per_column_bytes = data.memory_usage(deep=True)
size_b = per_column_bytes.sum()
size_mb = size_b / bytes_per_mb

print(f"Size data: {size_mb:.2f} Mb")

Size data: 0.16 Mb


### Preview the Dataset

In [4]:
# display the first 10 rows of the dataset
data.head(10)

Unnamed: 0,class,age,menopause,tumour_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irrad
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
5,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
6,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
7,no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no
8,no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no
9,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no


In [5]:
# summarise the columns: non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumour_size  286 non-null    object
 4   inv_nodes    286 non-null    object
 5   node_caps    286 non-null    object
 6   deg_malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast_quad  286 non-null    object
 9   irrad        286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [6]:
# preview the information provided with the dataset
# the file is read inside a context manager so the handle is closed as soon
# as its content has been loaded (a bare open(...).read() leaves it open)
with open("./dataset/breast-cancer.names") as names_file:
    data_info = names_file.read()
print(data_info)

Citation Request:
   This breast cancer domain was obtained from the University Medical Centre,
   Institute of Oncology, Ljubljana, Yugoslavia.  Thanks go to M. Zwitter and 
   M. Soklic for providing the data.  Please include this citation if you plan
   to use this database.

1. Title: Breast cancer data (Michalski has used this)

2. Sources: 
   -- Matjaz Zwitter & Milan Soklic (physicians)
      Institute of Oncology 
      University Medical Center
      Ljubljana, Yugoslavia
   -- Donors: Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
   -- Date: 11 July 1988

3. Past Usage: (Several: here are some)
     -- Michalski,R.S., Mozetic,I., Hong,J., & Lavrac,N. (1986). The 
        Multi-Purpose Incremental Learning System AQ15 and its Testing 
        Application to Three Medical Domains.  In Proceedings of the 
        Fifth National Conference on Artificial Intelligence, 1041-1045,
        Philadelphia, PA: Morgan Kaufmann.
        -- accuracy range: 66%-72%
     -

##### Missing values are denoted as *?*

In [7]:
# the raw file marks missing entries with "?"; convert them to np.nan
# so that pandas recognises them as missing values
data = data.replace(to_replace="?", value=np.nan)

### Sample Size Estimation

I want to determine whether the data points available in the provided dataset are enough to successfully train a model for the stated objective.<br>
For this prediction model, I want to avoid false positives and, especially, false negatives as much as possible.<br>
I will use power analysis and set:
- Effect Size to 0.5
- Alpha to 0.001
- Power to 0.9

Most of the features are categorical. To run the power analysis, I will store the dataset in a new variable ```power_analysis``` and convert the categorical columns to numerical codes. 

#### Power Analysis

In [8]:
# save dataset to new variable
# .copy() makes power_analysis an independent frame: its columns are
# overwritten in place further down, and assigning into a frame obtained by
# selecting columns from `data` raises pandas' SettingWithCopyWarning
power_analysis = data[[
    'class',
    'age',
    'menopause',
    'tumour_size',
    'inv_nodes',
    'node_caps',
    'deg_malig',
    'breast',
    'breast_quad',
    'irrad',
]].copy()
power_analysis.head(3)

Unnamed: 0,class,age,menopause,tumour_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irrad
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no


In [9]:
# encode every column except deg_malig (already an integer) as integer
# category codes
# NOTE(review): pandas gives missing values (NaN) the code -1, so they
# become an ordinary-looking number here instead of staying missing
columns_to_encode = [col for col in power_analysis.columns if col != 'deg_malig']

for col in columns_to_encode:
    encoded = pd.Categorical(power_analysis[col])
    power_analysis[col] = encoded.codes

power_analysis.head(3)

Unnamed: 0,class,age,menopause,tumour_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irrad
0,0,1,2,5,0,0,3,0,1,0
1,0,2,2,3,0,0,2,1,4,0
2,0,2,2,3,0,0,2,0,1,0


In [10]:
# Define effect size, alpha, and power
effect_size = 0.5
alpha = 0.001
power = 0.9

# Number of distinct integer values found anywhere in the encoded frame
# (all columns flattened together). It is not an input of the power
# calculation below.
n_classes = len(set(power_analysis[power_analysis.columns].values.flatten()))


# Create a TTestIndPower object for a two-sample t-test
analysis = TTestIndPower()

# Solve for nobs1, the number of observations in the FIRST group only.
# With ratio=1.0 the second group has the same size, so the total number of
# observations needed is twice the value printed below.
sample_size = analysis.solve_power(
    effect_size=effect_size,
    nobs1=None,
    alpha=alpha,
    power=power,
    ratio=1.0,
)

print('Required Sample Size (per group):', round(sample_size))

Required Sample Size: 170


  return np.clip(_boost._nct_sf(x, df, nc), 0, 1)
  return np.clip(_boost._nct_cdf(x, df, nc), 0, 1)


#### Sensitivity Analysis

In [11]:
# Define a list of effect sizes to test
effect_sizes = [0.2, 0.8]

# Calculate the required sample size for each effect size.
# Dedicated loop names are used so the notebook-level `effect_size` and
# `sample_size` chosen for the main power analysis are not overwritten.
# solve_power returns nobs1, i.e. the size of one group (equal group sizes
# are assumed by default), hence the "per group" label.
for candidate_effect_size in effect_sizes:
    candidate_sample_size = analysis.solve_power(
        effect_size=candidate_effect_size,
        alpha=alpha,
        power=power,
    )
    print(f"Effect size: {candidate_effect_size}, Required sample size (per group): {round(candidate_sample_size)}")

Effect size: 0.2, Required sample size: 1048
Effect size: 0.8, Required sample size: 68


#### Achieved Power

In [12]:
# Same effect size and significance level as in the sample size estimation
effect_size = 0.5
alpha = 0.001

# Create a TTestIndPower object for a two-sample t-test
analysis = TTestIndPower()

# Define the sample size
# NOTE(review): this value is passed as nobs1, the size of ONE group, and
# ratio=1 gives the second group the same size, so the power below is for
# 200 observations in each group. The training split holds 200 rows in
# total, so the real per-class sizes are smaller -- confirm that this is
# the intended comparison.
sample_size = 200

# Calculate the achieved power
achieved_power = analysis.power(
    effect_size=effect_size,
    nobs1=sample_size,
    alpha=alpha,
    ratio=1,
    df=None,
    alternative='two-sided',
)

print("Achieved power: ", round(achieved_power, 2))

Achieved power:  0.95


With 200 observations per group, the achieved power (0.95) exceeds the 0.9 target, so the dataset is considered large enough to continue working on the prediction model.

### Split Train and Test

In [13]:
TARGET_NAME = "class"

# separate the predictors from the label column before splitting
features = data.drop(columns=[TARGET_NAME])
target = data[TARGET_NAME]

# 70/30 split, stratified on the label, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    stratify=target,
    random_state=42,
)

# re-merge the label and the features (joined on the row index) for both sets
data_train = pd.merge(y_train, X_train, left_index=True, right_index=True)
data_test = pd.merge(y_test, X_test, left_index=True, right_index=True)

print(f"# sample train set: {data_train.shape[0]}")
print(f"# sample test set: {data_test.shape[0]}")

# sample train set: 200
# sample test set: 86


In [14]:
# display the first rows of the training set (label column first, original row index kept)
data_train.head()

Unnamed: 0,class,age,menopause,tumour_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irrad
62,no-recurrence-events,50-59,ge40,0-4,0-2,no,1,left,left_low,no
61,no-recurrence-events,40-49,premeno,10-14,0-2,no,1,right,right_low,no
66,no-recurrence-events,40-49,premeno,25-29,0-2,no,1,right,right_low,no
258,recurrence-events,50-59,ge40,30-34,6-8,yes,2,left,right_low,yes
77,no-recurrence-events,30-39,premeno,25-29,0-2,no,1,left,central,no


In [15]:
# store the train and test set locally as CSV files,
# with a header row and without the row index
csv_exports = (
    (data_train, "./dataset/data_train.csv"),
    (data_test, "./dataset/data_test.csv"),
)

for frame, csv_path in csv_exports:
    frame.to_csv(path_or_buf=csv_path, header=True, index=False)