In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the scikit-learn breast cancer dataset bundle. Later cells read its
# `data`, `target`, `target_names` and `feature_names` attributes.
dat = load_breast_cancer()

In [5]:
# --- Unpack the dataset bundle ----------------------------------------------
# The labels shipped with the dataset use the opposite 0/1 convention from the
# one wanted here, so both the class names and the target values are flipped
# to end up with:
#   0 -> benign    (negative class)
#   1 -> malignant (positive class)
li_ftrs = list(dat.feature_names)
li_classes = [dat.target_names[idx] for idx in (1, 0)]
li_target = [0 if label else 1 for label in dat.target]

# Full feature matrix as a dataframe, one labelled column per feature.
df_all = pd.DataFrame(dat.data, columns=li_ftrs)

# --- Report -----------------------------------------------------------------
# The exact key set is whatever the installed scikit-learn returns; rely on
# the printed output rather than on a hard-coded list.
print("The sklearn breast cancer dataset keys:")
print(dat.keys())
print("---")

print("There are 2 target classes:")
print("li_classes", li_classes)
print("---")

print("Target class distribution from a total of %d target values:" % len(li_target))
print(pd.Series(li_target).value_counts())
print("---")

# Only the first six feature columns are summarised to keep the output short.
print("Describe dataframe, first 6 columns:")
print(df_all.iloc[:, :6].describe().to_string())

The sklearn breast cancer dataset keys:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
---
There are 2 target classes:
li_classes ['benign', 'malignant']
---
Target class distribution from a total of 569 target values:
0    357
1    212
Name: count, dtype: int64
---
Describe dataframe, first 6 columns:
       mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness
count   569.000000    569.000000      569.000000   569.000000       569.000000        569.000000
mean     14.127292     19.289649       91.969033   654.889104         0.096360          0.104341
std       3.524049      4.301036       24.298981   351.914129         0.014064          0.052813
min       6.981000      9.710000       43.790000   143.500000         0.052630          0.019380
25%      11.700000     16.170000       75.170000   420.300000         0.086370          0.064920
50%      13.370000     18.840000       86.240000   55

In [6]:
# Fraction of the rows held out as the test split.
TEST_SIZE_RATIO = 0.2

# Labels are the remapped 0/1 targets; features are the full dataframe.
y = pd.Series(li_target)
X = df_all

# A fixed random_state keeps the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE_RATIO,
    random_state=0,
)

# Quick sanity check on the resulting split sizes.
print("X_train.shape, y_train.shape", X_train.shape, y_train.shape)
print("X_test.shape, y_test.shape", X_test.shape, y_test.shape)

X_train.shape, y_train.shape (455, 30) (455,)
X_test.shape, y_test.shape (114, 30) (114,)


In [8]:
# Fit the standardiser on the training split only, so that no statistics from
# the held-out test rows feed into the preprocessing.
# (StandardScaler is imported in the notebook's top import cell.)
scaler = StandardScaler()
scaler.fit(X_train)

In [9]:
# Apply the already-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepend a constant column of ones as column 0 of each scaled matrix.
X_train_const = np.concatenate(
    [np.ones((len(X_train_scaled), 1)), X_train_scaled],
    axis=1,
)
X_test_const = np.concatenate(
    [np.ones((len(X_test_scaled), 1)), X_test_scaled],
    axis=1,
)

In [10]:
# Wrap both design matrices in dataframes. No column names are passed, so the
# columns get default integer labels (column 0 is the constant ones column).
X_train_breast, X_test_breast = map(pd.DataFrame, (X_train_const, X_test_const))

In [11]:
# Eyeball five random rows of the training design matrix. No random_state is
# passed, so the rows shown will differ from run to run.
X_train_breast.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
419,1.0,-0.688803,-0.718006,-0.672326,-0.652144,1.188801,-0.211932,-0.575101,-0.450526,0.159344,...,-0.601022,-0.647981,-0.604795,-0.565752,0.89115,-0.681059,-0.594186,-0.249777,-0.159181,-0.182723
62,1.0,-1.224838,0.189467,-1.223254,-1.004323,0.434313,-0.997506,-1.037824,-1.057009,-1.678584,...,-1.092304,0.123279,-1.129743,-0.879367,-0.038018,-1.027402,-1.238611,-1.389104,-1.54184,0.359023
326,1.0,-0.394309,-1.293302,-0.413332,-0.452017,0.01862,-0.384433,-0.959964,-0.763844,-0.626252,...,-0.564175,-1.322423,-0.546566,-0.559094,-0.11766,-0.684328,-1.078946,-0.781892,-0.85051,-0.80202
157,1.0,-0.380151,0.54331,-0.403449,-0.409252,-0.64736,-0.739745,-0.725263,-0.520037,-0.421632,...,-0.513,-0.334554,-0.531787,-0.515048,-1.706979,-1.059767,-1.081303,-1.022509,-1.15247,-0.830681
76,1.0,-1.116385,-1.028521,-1.122786,-0.955689,1.232329,-0.450261,-0.986213,-0.928284,3.411343,...,-1.071834,-1.327346,-1.098707,-0.875099,-0.219426,-1.001705,-1.232644,-1.358759,1.037533,-0.18497


In [12]:
# Persist the prepared splits as CSV files in the working directory. The index
# is omitted from every file, so rows are matched up by position only.
for csv_name, table in (
    ("breast_X_train.csv", X_train_breast),
    ("breast_X_test.csv", X_test_breast),
    ("breast_y_train.csv", y_train),
    ("breast_y_test.csv", y_test),
):
    table.to_csv(csv_name, index=False)