In [88]:

import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from seldonian.utils.io_utils import save_json



Data Description

Data used: https://archive.ics.uci.edu/ml/datasets/breast+cancer (from the UCI Machine Learning Repository)

Provided to the UCI Machine Learning Repository by the Institute of Oncology, University Medical Centre, Ljubljana

The unprocessed data (found at ./data/breast-cancer.csv) contains 85 instances of breast cancer patients who experienced recurrence events and 201 instances of patients who did not experience recurrence. In addition to the recurrence label, there are 9 attributes that can be used as features to predict recurrence.

More information on the data can be found in ./data/breast-cancer.txt

Read the Data:

In [89]:
filePath = './data/breast-cancer.csv'

# Column names are supplied explicitly; the file is read with header=None,
# so its first line is treated as a data row rather than a header.
columnsOrig = [
    "recurrence-events",
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "nodes-cap",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

data = pd.read_csv(filePath, names=columnsOrig, header=None)

# Output: recurrence as a boolean label.
# Series.map leaves any value that is not a key of the dict as NaN.
Y = data["recurrence-events"].map(
    {"recurrence-events": True, "no-recurrence-events": False}
)

# Inputs: every column except the label, with the three two-valued
# attributes (menopause, nodes-cap, irradiat) converted to booleans.
# 'lt40' and 'ge40' are both mapped to True; 'premeno' is mapped to False.
yes_no_map = {"yes": True, "no": False}
X = data.drop(columns=["recurrence-events"]).assign(
    **{
        "menopause": lambda frame: frame["menopause"].map(
            {"premeno": False, "lt40": True, "ge40": True}
        ),
        "nodes-cap": lambda frame: frame["nodes-cap"].map(yes_no_map),
        "irradiat": lambda frame: frame["irradiat"].map(yes_no_map),
    }
)

Process Categorical Features

In [90]:
# Every feature column is treated as categorical and one-hot encoded.
cat_cols = ["age", "menopause", "tumor-size", "inv-nodes", "nodes-cap",
            "deg-malig", "breast", "breast-quad", "irradiat"]

ohe = OneHotEncoder(handle_unknown='ignore')

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# With the default threshold the result is sparse only when its density is
# low enough, so an unconditional .todense() call would fail on dense output.
preprocessing = ColumnTransformer(transformers=[('cat', ohe, cat_cols)],
                                  remainder='passthrough',
                                  sparse_threshold=0)

# NOTE: X and Y are rebound to their encoded forms here, so re-running this
# cell requires re-running the cell that loads the data first.
X = preprocessing.fit_transform(X)

# Encode the label as integer class indices.
Y = LabelEncoder().fit_transform(Y)

# Remove the "cat__" transformer prefix from each generated column name.
outCols = np.array(
    [col.removeprefix("cat__") for col in preprocessing.get_feature_names_out()],
    dtype=object,
)

# Assemble the encoded features and the label into a single dataframe and
# give the two one-hot menopause columns readable names.
outdf = (
    pd.DataFrame(X, columns=outCols)
    .assign(recurrence=Y)
    .rename(columns={'menopause_False': 'premenopause',
                     'menopause_True': 'menopause'})
)


Shuffle the processed data and save it as a CSV, then save the metadata

In [91]:
# Randomize the rows in the dataframe.
# A fixed seed keeps the row order of the saved CSV reproducible across runs.
SHUFFLE_SEED = 42
trainDF = outdf.sample(frac=1, random_state=SHUFFLE_SEED)

# The CSV is written without a header row or index; the column order is
# recorded separately in the metadata file below ("all_col_names").
trainDF.to_csv("./data/BC_Data_Proc.csv", index=False, header=False)

# Save metadata json file.
# NOTE(review): the keys presumably follow the schema expected by the
# seldonian toolkit imported at the top of the notebook - confirm.
metadata_dict = {
    "regime": "supervised_learning",
    "sub_regime": "classification",
    "all_col_names": list(trainDF.columns),
    "label_col_names": "recurrence",
    "sensitive_col_names": ["premenopause", "menopause"]
}

metadata_path = "data/metadata_breast_cancer.json"
with open(metadata_path, 'w') as outfile:
    json.dump(metadata_dict, outfile, indent=2)
print(f"Saved metadata file to: {metadata_path}")


Saved metadata file to: data/metadata_breast_cancer.json
