In [1]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.impute import SimpleImputer

from paths import RAW_DIR

In [None]:
# Helper function: replace a nullable column with a 0/1 presence flag

def add_col(data, to_add, to_remove):
    """Replace a nullable column with a 0/1 presence flag.

    A new column ``to_add`` is set to 0 where ``data[to_remove]`` is missing
    and 1 otherwise; ``to_remove`` is then dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified; the work is done on a copy.
    to_add : hashable
        Name of the flag column to create (overwritten if it already exists).
    to_remove : hashable
        Name of the column whose missingness defines the flag; removed from
        the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``to_add`` present and ``to_remove`` dropped.

    Raises
    ------
    KeyError
        If ``to_remove`` is not a column of ``data``.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    result = data.copy()
    result[to_add] = np.where(result[to_remove].isna(), 0, 1)
    return result.drop(columns=[to_remove])

### Import the data

In [2]:
# Load the four raw files from RAW_DIR: two CSV files (tnm*) and two Excel
# workbooks (df*). Files are read in the same order as they are named below.
tnm1, tnm2 = [
    pd.read_csv(RAW_DIR / csv_name)
    for csv_name in ("breast_cancer_data_tnm.csv", "breast_cancer_data_tnm_2.csv")
]

df1, df2 = [
    pd.read_excel(RAW_DIR / xlsx_name)
    for xlsx_name in ("breast_cancer_data.xlsx", "breast_cancer_data_2.xlsx")
]

In [None]:
# Stack the two Excel extracts row-wise, then use the 'ehr' column as the index.
data = pd.concat(objs=[df1, df2], axis=0).set_index(keys='ehr')
data

In [None]:
# Stack the two tnm CSV extracts row-wise, then use the 'ehr' column as the index.
data_tnm = pd.concat(objs=[tnm1, tnm2], axis=0).set_index(keys='ehr')
data_tnm

### Creating new columns

In [None]:
# Turn each nullable source column into a 0/1 flag (1 = a value was recorded)
# and drop the source column. Pairs are (new flag column, source column).
for flag_name, source_name in [
    ('dead', 'death_date'),
    ('recurrence', 'recurrence_year'),
    ('menopause', 'menopause_age'),
]:
    data = add_col(data, flag_name, source_name)

In [None]:
# Age in whole years, measured from 'birth_date' to the moment this cell runs
# (so the value depends on the execution date). Missing dates give NaN.
# pandas >= 2.0 raises on .astype('timedelta64[Y]') because only s/ms/us/ns
# resolutions are supported, so the elapsed time is divided by an average
# year (365.2425 days, the length numpy uses for its 'Y' unit) and floored.
data['age'] = np.floor(
    (dt.datetime.today() - pd.to_datetime(data['birth_date']))
    / pd.Timedelta(days=365.2425)
)

In [None]:
# Whole years elapsed between 'diagnosis_date' and the moment this cell runs
# (so the value depends on the execution date). Missing dates give NaN.
# pandas >= 2.0 raises on .astype('timedelta64[Y]') because only s/ms/us/ns
# resolutions are supported, so the elapsed time is divided by an average
# year (365.2425 days, the length numpy uses for its 'Y' unit) and floored.
data['years_from_diagnosis'] = np.floor(
    (dt.datetime.today() - pd.to_datetime(data['diagnosis_date']))
    / pd.Timedelta(days=365.2425)
)

### Drop columns

In [None]:
# Count missing values per column to decide which columns to drop or impute.
data.isnull().sum()

In [None]:
# Columns removed here:
#   - 'Unnamed: 0' and 'birth_date': not useful
#   - 'diagnosis_date': raw date, already used to derive 'years_from_diagnosis'
#   - 'caesarean' and 'side': more than 200 null values each
columns_to_drop = ['Unnamed: 0', 'birth_date', 'diagnosis_date', 'caesarean', 'side']
data = data.drop(columns=columns_to_drop)
data.head(20)

In [None]:
# Print the remaining columns with their non-null counts and dtypes.
data.info()

### Map values and fill missing values with the most frequent value

In [None]:
# Print the frequency of every distinct value for each categorical feature,
# so the encodings applied afterwards can be checked against the raw levels.
categoricals = ['neoadjuvant', 'hist_type']

for feature in categoricals:
    level_counts = data[feature].value_counts()
    print(f"Values Counts for [{feature}]")
    print(level_counts)
    print("\n\n")

In [None]:
# Encode 'neoadjuvant' as 0/1. Series.map with a dict yields NaN for any value
# that is not a key, so re-running this cell on already-encoded data would
# turn the whole column into NaN.
yes_no_codes = {'no': 0, 'yes': 1}
data['neoadjuvant'] = data['neoadjuvant'].map(yes_no_codes)
data['neoadjuvant'].value_counts()

In [None]:
# One-hot encode the listed columns into a new frame; `data` itself is left
# unchanged. NOTE(review): despite the name `ordinal_col`, get_dummies produces
# unordered indicator columns.
ordinal_col = ['hist_type']
data_dum = pd.get_dummies(data=data, columns=ordinal_col)

In [None]:
# Fill missing values with each column's most frequent value (mode).
# The filled Series is assigned back instead of calling fillna(inplace=True)
# on data_dum[column]: that form is a chained in-place call on an intermediate
# object, which emits a FutureWarning in recent pandas and leaves data_dum
# unchanged once Copy-on-Write is enabled.
for column in [
    'neoadjuvant', 'grade', 'invasive', 'er_positive',
    'pr_positive', 'her2_positive', 'ki67', 'menarche_age',
]:
    most_frequent = data_dum[column].mode()[0]
    data_dum[column] = data_dum[column].fillna(most_frequent)
    
# replace with mean value
#for column in ['ki67']:                                           ## mode=10, mean=20.4
#    data_dum[column].fillna(data_dum[column].mean(), inplace=True)

In [None]:
# Print columns, non-null counts and dtypes of the encoded frame to confirm
# the imputed columns no longer contain missing values.
data_dum.info()