In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

from paths import RAW_DIR

### Import the data

In [2]:
# Read the two *_tnm CSV files first, then the two Excel workbooks, all from RAW_DIR.
tnm1, tnm2 = (
    pd.read_csv(RAW_DIR / csv_name)
    for csv_name in ("breast_cancer_data_tnm.csv", "breast_cancer_data_tnm_2.csv")
)

df1, df2 = (
    pd.read_excel(RAW_DIR / xlsx_name)
    for xlsx_name in ("breast_cancer_data.xlsx", "breast_cancer_data_2.xlsx")
)

In [None]:
# Stack the two Excel-sourced frames row-wise, then index the result by 'ehr'.
data = pd.concat((df1, df2), axis=0).set_index(keys='ehr')
data

In [None]:
# Stack the two CSV-sourced TNM frames row-wise, then index the result by 'ehr'.
data_tnm = pd.concat((tnm1, tnm2), axis=0).set_index(keys='ehr')
data_tnm

### NULL values

In [None]:
# Number of missing values per column of `data`.
data.isna().sum()

In [None]:
# Number of missing values per column of `data_tnm`.
data_tnm.isna().sum()

In [None]:
# Drop 'Unnamed: 0' (not useful) and 'side' (more than 200 nulls).
data = data.drop(columns=['Unnamed: 0', 'side'])

# Derive the binary 'alive' flag from 'death_date', then drop the raw date column:
#   1 -> no death date recorded (patient is treated as alive)
#   0 -> a death date is present
# NOTE: a *missing* death_date must map to 1, not 0 -- the flag is named 'alive'.
data['alive'] = data['death_date'].isna().astype(int)
data = data.drop(columns=['death_date'])
data.head(20)

In [None]:
# Print a concise summary of `data` (column dtypes and non-null counts).
data.info()

In [None]:
# Inspect the distinct values (and their frequencies) of each categorical feature.
categoricals = ['neoadjuvant', 'hist_type', 'caesarean']

for feature in categoricals:
    # One print call per feature: header, counts, then blank-line padding.
    print(
        f"Values Counts for [{feature}]",
        data[feature].value_counts(),
        "\n\n",
        sep="\n",
    )

In [None]:
# Encode neoadjuvant as 0/1 ('no' -> 0, 'yes' -> 1); any other value becomes NaN.
# Note: re-running this cell on already-encoded data maps every value to NaN.
data = data.assign(neoadjuvant=data['neoadjuvant'].map({'no': 0, 'yes': 1}))
data['neoadjuvant'].value_counts()

In [None]:
# TODO: caesarean -- consider filling missing values with 0 and then one-hot encoding it (get_dummies)?

In [None]:
# One-hot encode hist_type: each category becomes its own indicator column in data_dum.
# NOTE(review): despite the name `ordinal_col`, get_dummies does not encode any ordering.
ordinal_col = ['hist_type']
data_dum = pd.get_dummies(data=data, columns=ordinal_col)

In [None]:
# Impute missing values with each column's most frequent value (mode).
#
# The filled Series is assigned back to data_dum[column] rather than calling
# fillna(inplace=True) on data_dum[column]: the in-place form acts on an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves data_dum
# unchanged when Copy-on-Write is enabled.
#
# ki67 is mode-imputed as well; a mean-based alternative was considered
# (values noted at the time: mode=10, mean=20.4).
for column in ['neoadjuvant', 'grade', 'invasive', 'er_positive', 'pr_positive', 'her2_positive', 'ki67']:
    data_dum[column] = data_dum[column].fillna(data_dum[column].mode()[0])

In [None]:
# Print a concise summary of `data_dum` (column dtypes and non-null counts).
data_dum.info()

In [None]:
# Show the first 50 rows of `data_dum`.
data_dum.head(50)

In [None]:
# Fill missing menarche_age with its most frequent observed value.
# NOTE(review): this updates `data` only; `data_dum` was built from `data` earlier
# and is not affected by this cell -- confirm that is intended.
data = data.assign(
    menarche_age=lambda frame: frame['menarche_age'].fillna(
        frame['menarche_age'].value_counts().index[0]
    )
)
data

In [None]:
# Partition the columns of `data` by dtype: object-typed columns go to df_cat
# (treated as categorical), every other column goes to df_num.
cat_mask = data.dtypes == object
cat_cols = cat_mask[cat_mask].index.tolist()

df_cat = data.loc[:, cat_cols]
df_num = data.drop(columns=cat_cols)

df_cat.info()
df_num.info()

In [None]:
# Categorical columns: replace missing entries with each column's most frequent value.
imp_cat = SimpleImputer(strategy='most_frequent')

# fit_transform returns a bare array, so keep the labels to rebuild the DataFrame.
columns, index = df_cat.columns, df_cat.index
imputed_values = imp_cat.fit_transform(df_cat)
df_cat = pd.DataFrame(imputed_values, columns=columns, index=index)

# Check the remaining missing values per column after imputation.
print(df_cat.isnull().sum())

In [None]:
# Show the first 20 rows of the imputed categorical frame.
df_cat.head(20)