## Breast Cancer Data Pre-Processing

In [1]:
# importing the libraries

import pandas as pd
import numpy as np
import datetime as dt

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler

In [2]:
# helper function: turn a nullable column into a 0/1 presence flag

def add_col(data, to_add, to_remove):
    """Replace a column with a 0/1 flag telling whether it held a value.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is left untouched; the work is done on a copy.
    to_add : label
        Name of the indicator column to create (1 where ``to_remove`` is
        not null, 0 where it is null).
    to_remove : label
        Name of the column the indicator is derived from; it is dropped
        from the returned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``to_add`` appended and ``to_remove`` removed.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = data.copy()
    result[to_add] = np.where(result[to_remove].isna(), 0, 1)
    return result.drop(columns=[to_remove])

### Importing Datasets

In [3]:
# Two Excel workbooks and two CSV files, all read from the working directory.
df1, df2 = (
    pd.read_excel(workbook)
    for workbook in ('breast_cancer_data.xlsx', 'breast_cancer_data_2.xlsx')
)

tnm1, tnm2 = (
    pd.read_csv(csv_file)
    for csv_file in ("breast_cancer_data_tnm.csv", "breast_cancer_data_tnm_2.csv")
)

In [4]:
# Stack the two Excel extracts and use the 'ehr' column as the row index.
data = (
    pd.concat([df1, df2])
    .set_index('ehr')
)
data.head(50)

Unnamed: 0_level_0,Unnamed: 0,side,neoadjuvant,grade,invasive,er_positive,pr_positive,her2_positive,ki67,birth_date,diagnosis_date,death_date,recurrence_year,menarche_age,menopause_age,pregnancy,abort,birth,caesarean,hist_type
ehr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6849,AA,,no,1.0,1.0,1.0,1.0,1.0,,1967-08-08,2016-08-23,,,17.0,51.0,2.0,0.0,2,,ductal
268,AB,,no,,1.0,1.0,1.0,0.0,,1950-03-11,2015-09-05,,,12.0,,2.0,0.0,2,,unknown
1458,AC,,no,1.0,1.0,1.0,1.0,0.0,0.0,1953-09-17,2017-03-01,,,11.0,,2.0,0.0,2,0.0,ductal
268,AD,,no,,1.0,1.0,1.0,0.0,,1950-03-11,2015-09-05,,,12.0,,2.0,0.0,2,,unknown
2013,AE,,yes,3.0,1.0,1.0,1.0,1.0,17.0,1977-08-19,2014-08-31,,,,,,,-1,,ductal
1350,AF,,no,2.0,1.0,0.0,1.0,0.0,44.0,1951-04-02,2003-05-24,2022-05-11,,14.0,,3.0,,3,,ductal
5850,AG,,no,1.0,1.0,1.0,1.0,0.0,10.0,1958-11-12,2020-09-05,,,15.0,58.0,2.0,0.0,2,,unknown
268,AH,,no,,1.0,1.0,1.0,0.0,,1950-03-11,2015-09-05,,,12.0,,2.0,0.0,2,,unknown
953,AI,,no,1.0,1.0,1.0,0.0,0.0,10.0,1949-03-19,2021-07-11,,,13.0,,,,-1,,unknown
773,AJ,,no,2.0,1.0,1.0,1.0,0.0,10.0,1946-04-21,2017-09-26,,,,50.0,2.0,0.0,2,,ductal


In [5]:
# Stack the two TNM CSV extracts and use the 'ehr' column as the row index.
data_tnm = (
    pd.concat([tnm1, tnm2])
    .set_index('ehr')
)
data_tnm

Unnamed: 0_level_0,n_tumor,t,n,m,t_after_neoadj,n_after_neoadj,m_after_neoadj
ehr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6849,1,1,0.0,0.0,,,
6849,3,2,0.0,0.0,,,
268,1,1,0.0,0.0,,,
268,2,1,1.0,0.0,,,
268,3,2,0.0,0.0,,,
...,...,...,...,...,...,...,...
6647,1,,,,2.0,2.0,0.0
768,1,X,X,1.0,,,
4534,1,2,0,0.0,,,
7062,1,1,0,0.0,,,


### Creating new columns

In [6]:
# Each (new flag, source column) pair: the flag is 1 where the source column
# has a value and 0 where it is null; the source column is then dropped.
for flag_name, source_col in [('dead', 'death_date'),
                              ('recurrence', 'recurrence_year'),
                              ('menopause', 'menopause_age')]:
    data = add_col(data, flag_name, source_col)

In [7]:
# Whole years elapsed between 'birth_date' and the day the notebook is run.
# Series.astype('timedelta64[Y]') is rejected by pandas >= 2.0 (only the
# s/ms/us/ns resolutions are supported), so the elapsed days are floor-divided
# by 365.2425, the mean year length numpy uses for its 'Y' unit. The result
# stays float64, as before.
# NOTE(review): the reference date is "today", so values change between runs;
# consider a fixed reference date if reproducibility matters.
data['age'] = (dt.datetime.today()
               - pd.to_datetime(data['birth_date'])).dt.days // 365.2425

In [8]:
# Whole years elapsed between 'diagnosis_date' and the day the notebook is run.
# Series.astype('timedelta64[Y]') is rejected by pandas >= 2.0 (only the
# s/ms/us/ns resolutions are supported), so the elapsed days are floor-divided
# by 365.2425, the mean year length numpy uses for its 'Y' unit. The result
# stays float64, as before.
# NOTE(review): the reference date is "today", so values change between runs;
# consider a fixed reference date if reproducibility matters.
data['years_from_diagnosis'] = (dt.datetime.today()
                                - pd.to_datetime(data['diagnosis_date'])).dt.days // 365.2425

### Drop columns

In [9]:
# Number of missing values per column.
data.isna().sum()

Unnamed: 0                0
side                    229
neoadjuvant               1
grade                    94
invasive                 34
er_positive              24
pr_positive              27
her2_positive            42
ki67                     71
birth_date                0
diagnosis_date            0
menarche_age             53
pregnancy                99
abort                   115
birth                     0
caesarean               225
hist_type                 0
dead                      0
recurrence                0
menopause                 0
age                       0
years_from_diagnosis      0
dtype: int64

In [10]:
# Columns removed here:
#   - 'Unnamed: 0': not useful
#   - 'birth_date', 'diagnosis_date': raw dates, already turned into
#     'age' and 'years_from_diagnosis' above
#   - 'side', 'caesarean': more than 200 missing values each
unused_columns = ['Unnamed: 0', 'birth_date', 'diagnosis_date', 'caesarean', 'side']
data = data.drop(columns=unused_columns)
data.head(20)

Unnamed: 0_level_0,neoadjuvant,grade,invasive,er_positive,pr_positive,her2_positive,ki67,menarche_age,pregnancy,abort,birth,hist_type,dead,recurrence,menopause,age,years_from_diagnosis
ehr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6849,no,1.0,1.0,1.0,1.0,1.0,,17.0,2.0,0.0,2,ductal,0,0,1,55.0,6.0
268,no,,1.0,1.0,1.0,0.0,,12.0,2.0,0.0,2,unknown,0,0,0,72.0,7.0
1458,no,1.0,1.0,1.0,1.0,0.0,0.0,11.0,2.0,0.0,2,ductal,0,0,0,69.0,5.0
268,no,,1.0,1.0,1.0,0.0,,12.0,2.0,0.0,2,unknown,0,0,0,72.0,7.0
2013,yes,3.0,1.0,1.0,1.0,1.0,17.0,,,,-1,ductal,0,0,0,45.0,8.0
1350,no,2.0,1.0,0.0,1.0,0.0,44.0,14.0,3.0,,3,ductal,1,0,0,71.0,19.0
5850,no,1.0,1.0,1.0,1.0,0.0,10.0,15.0,2.0,0.0,2,unknown,0,0,1,64.0,2.0
268,no,,1.0,1.0,1.0,0.0,,12.0,2.0,0.0,2,unknown,0,0,0,72.0,7.0
953,no,1.0,1.0,1.0,0.0,0.0,10.0,13.0,,,-1,unknown,0,0,0,73.0,1.0
773,no,2.0,1.0,1.0,1.0,0.0,10.0,,2.0,0.0,2,ductal,0,0,1,76.0,5.0


In [11]:
# Summary of dtypes and non-null counts for the columns kept after the drop.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 6849 to 7066
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   neoadjuvant           243 non-null    object 
 1   grade                 150 non-null    float64
 2   invasive              210 non-null    float64
 3   er_positive           220 non-null    float64
 4   pr_positive           217 non-null    float64
 5   her2_positive         202 non-null    float64
 6   ki67                  173 non-null    float64
 7   menarche_age          191 non-null    float64
 8   pregnancy             145 non-null    float64
 9   abort                 129 non-null    float64
 10  birth                 244 non-null    int64  
 11  hist_type             244 non-null    object 
 12  dead                  244 non-null    int32  
 13  recurrence            244 non-null    int32  
 14  menopause             244 non-null    int32  
 15  age                

### Map values and replace missing values with the most frequent value

In [12]:
# Print the frequency table of the two object-dtype columns.
categoricals = ['neoadjuvant', 'hist_type']

for feature in categoricals:
    frequencies = data[feature].value_counts()
    print(f"Values Counts for [{feature}]", frequencies, "\n\n", sep="\n")

Values Counts for [neoadjuvant]
no     194
yes     49
Name: neoadjuvant, dtype: int64



Values Counts for [hist_type]
ductal     118
unknown    106
lobular     20
Name: hist_type, dtype: int64





In [13]:
# Encode 'no'/'yes' as 0/1. Values that are not keys of the mapping
# (including missing ones) become NaN, so running this cell a second time
# would blank the whole column.
neoadjuvant_codes = {'no': 0, 'yes': 1}
data['neoadjuvant'] = data['neoadjuvant'].map(neoadjuvant_codes)
data['neoadjuvant'].value_counts()

0.0    194
1.0     49
Name: neoadjuvant, dtype: int64

In [14]:
# One-hot encode 'hist_type' into a new frame ('data_dum'); 'data' itself
# keeps the original string column.
ordinal_col = ['hist_type']
data_dum = pd.get_dummies(data=data, columns=ordinal_col)

In [15]:
# Replace missing values with the most frequent value of each column.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# a column selection (data_dum[column]) is chained assignment, which emits a
# FutureWarning on recent pandas and does not update data_dum at all once
# Copy-on-Write is enabled.
# Note on 'ki67': the mode (10) is used here; the mean (20.4) was considered
# as an alternative fill value.
for column in ['neoadjuvant', 'grade', 'invasive', 'er_positive',
               'pr_positive', 'her2_positive', 'ki67', 'menarche_age']:
    data_dum[column] = data_dum[column].fillna(data_dum[column].mode()[0])

In [16]:
# Summary of dtypes and non-null counts after encoding and imputation.
data_dum.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 6849 to 7066
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   neoadjuvant           244 non-null    float64
 1   grade                 244 non-null    float64
 2   invasive              244 non-null    float64
 3   er_positive           244 non-null    float64
 4   pr_positive           244 non-null    float64
 5   her2_positive         244 non-null    float64
 6   ki67                  244 non-null    float64
 7   menarche_age          244 non-null    float64
 8   pregnancy             145 non-null    float64
 9   abort                 129 non-null    float64
 10  birth                 244 non-null    int64  
 11  dead                  244 non-null    int32  
 12  recurrence            244 non-null    int32  
 13  menopause             244 non-null    int32  
 14  age                   244 non-null    float64
 15  years_from_diagnosi