In [None]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

from paths import RAW_DIR, STAGE_DIR
from display import cdisplay, rdisplay
from transformers import AssignTransformer, PandasColumnTransformer
from functions import to_numeric, to_binary_from_nan, inspect_nulls

In [None]:
# functions
def replace_nan_1cond(data, feat1, val1, col_to_replace, val_to_replace_with):
    """Overwrite ``col_to_replace`` on every row where ``feat1`` equals ``val1``.

    Parameters
    ----------
    data : DataFrame that is modified in place.
    feat1 : name of the column the condition is tested on.
    val1 : value ``feat1`` must equal for a row to be selected.
    col_to_replace : name of the column that receives the new value.
    val_to_replace_with : value written into the selected rows.

    Returns
    -------
    The same ``data`` object, after modification.

    Note: despite the name, the current content of ``col_to_replace`` is not
    inspected -- every selected row is overwritten, null or not.
    """
    selected_rows = (data[feat1] == val1)
    data.loc[selected_rows, col_to_replace] = val_to_replace_with
    return data

def replace_nan_2cond(data, feat1, feat2, val1, val2, col_to_replace, val_to_replace_with):
    """Overwrite ``col_to_replace`` where ``feat1 == val1`` and ``feat2 == val2``.

    Parameters
    ----------
    data : DataFrame that is modified in place.
    feat1, feat2 : names of the two columns the conditions are tested on.
    val1, val2 : values ``feat1`` / ``feat2`` must equal, respectively.
    col_to_replace : name of the column that receives the new value.
    val_to_replace_with : value written into the selected rows.

    Returns
    -------
    The same ``data`` object, after modification.

    Note: despite the name, the current content of ``col_to_replace`` is not
    inspected -- every selected row is overwritten, null or not.
    """
    first_match = (data[feat1] == val1)
    second_match = (data[feat2] == val2)
    data.loc[first_match & second_match, col_to_replace] = val_to_replace_with
    return data

### Import the data

In [None]:
# Load the two raw Excel exports (RAW_DIR comes from the project's `paths` module).
patient_1 = pd.read_excel(RAW_DIR / 'breast_cancer_data.xlsx')
patient_2 = pd.read_excel(RAW_DIR / 'breast_cancer_data_2.xlsx')

# Concatenate the two datasets and drop the 'Unnamed: 0' column.
# ignore_index=True gives the combined frame a fresh 0..n-1 index: without it
# each file contributes its own 0-based row labels, so the result carries
# duplicate labels, which can break label-based alignment in later steps.
data = (
    pd.concat([patient_1, patient_2], ignore_index=True)
    .drop(columns=['Unnamed: 0'])
)
data.head(20)

### Solving duplicates in `ehr`

In [None]:
# Number of rows per `ehr` identifier; a count above 1 means the identifier is
# repeated. dropna=False keeps a separate bucket for missing identifiers.
rows_per_ehr = data['ehr'].value_counts(dropna=False)
rows_per_ehr

In [None]:
# Show every row belonging to the two `ehr` identifiers under inspection,
# so the repeated observations can be compared side by side.
repeated_ehr_rows = data['ehr'].isin([268, 6897])
cdisplay(data[repeated_ehr_rows])

The repeated rows for patients with `ehr` 268 and 6897 are identical in every other column, i.e. they are exact duplicates. We can therefore drop the repeated observations without losing any information.

In [None]:
# Keep the first occurrence of each fully identical row and discard the rest.
is_repeated_row = data.duplicated(keep='first')
data = data.loc[~is_repeated_row]

In [None]:
# Display the current `data` frame with the project's `cdisplay` helper
# (imported from the local `display` module; its exact rendering is not
# visible from this notebook).
cdisplay(data)

### Creating new columns

In [None]:
# Sanity check: rows whose diagnosis date lies after the moment this cell runs.
diagnosis_dates = pd.to_datetime(data['diagnosis_date'])
data[diagnosis_dates > dt.datetime.today()]

In [None]:
# Sanity check: rows where the diagnosis is dated after the patient's death.
diagnosis_dates = pd.to_datetime(data['diagnosis_date'])
death_dates = pd.to_datetime(data['death_date'])
data[diagnosis_dates.gt(death_dates)]

In [None]:
# Single reference instant shared by every date computation below, so that all
# derived columns are measured against the same "now" within one run.
TODAY = dt.datetime.today()

# Lower bounds of the buckets used to group patients for imputation.
# Buckets start at 0 so that a value of 0 (e.g. diagnosed less than a year
# ago) lands in the first bucket instead of falling through to the overflow
# label, and each label states the inclusive bounds of its bucket.
# NOTE(review): negative values still match no bucket and therefore receive
# the overflow label -- confirm whether they can occur in this dataset.
AGE_BIN_STARTS = range(0, 100, 10)      # 0-9, 10-19, ..., 90-99, then '100+'
YEARS_BIN_STARTS = range(0, 30, 5)      # 0-4, 5-9, ..., 25-29, then '30+'

# NOTE(review): the entries are written assuming AssignTransformer evaluates
# them in insertion order and that later entries see the columns assigned by
# earlier ones (e.g. 'pregnancy' reads the fixed 'birth' / 'caesarean' /
# 'abort') -- confirm against the transformer implementation.
assign_map = {
    # fix diagnosis date: a date in the future is replaced by today's date,
    # a date after the death date is replaced by the death date
    'diagnosis_date': lambda df: np.select(
        [pd.to_datetime(df['diagnosis_date']).gt(TODAY),
         pd.to_datetime(df['diagnosis_date']).gt(pd.to_datetime(df['death_date']))],
        [TODAY.strftime('%Y-%m-%d'), df['death_date']],
        df['diagnosis_date']
    ),

    # whole years from birth to death, or to today when there is no death date
    'age': lambda df: np.where(
        df['death_date'].isna(),
        TODAY - pd.to_datetime(df['birth_date']),
        pd.to_datetime(df['death_date']) - pd.to_datetime(df['birth_date'])
    ).astype('timedelta64[Y]').astype(int),
    # whole years from diagnosis to death, or to today when there is no death date
    'years_from_diagnosis': lambda df: np.where(
        df['death_date'].isna(),
        TODAY - pd.to_datetime(df['diagnosis_date']),
        pd.to_datetime(df['death_date']) - pd.to_datetime(df['diagnosis_date'])
    ).astype('timedelta64[Y]').astype(int),
    # binary flags built by the project helper from the null-ness of a column
    # (presumably 1 when the source value is present -- verify in `functions`)
    'is_dead': lambda df: to_binary_from_nan(df, 'death_date'),
    'recurrence': lambda df: to_binary_from_nan(df, 'recurrence_year'),
    'menopause': lambda df: to_binary_from_nan(df, 'menopause_age'),
    # NOTE 1 null value in neoadjuvant: probably is going to be solved when
    # merging with the other dataset
    'neoadjuvant': lambda df: df['neoadjuvant'].replace({'no': 0, 'yes': 1}),
    # NOTE birth = -1 doesn't make sense --> use the pregnancy count when there
    # is one, otherwise replace it with 0
    'birth': lambda df: np.where(
        df['birth'].eq(-1) & df['pregnancy'].gt(0),
        df['pregnancy'],
        df['birth'].replace({-1: 0})
    ).astype(int),
    # missing caesarean count: when pregnancy, abort and birth are all
    # positive and the remainder is non-negative, use
    # pregnancy - abort - birth; otherwise keep the value (null -> 0)
    'caesarean': lambda df: np.where(
        df[['pregnancy', 'abort', 'birth']].gt(0).all(axis=1)
            & df['caesarean'].isna()
            & (df['pregnancy'] - df['abort'] - df['birth']).ge(0),
        df['pregnancy'] - df['abort'] - df['birth'],
        df['caesarean'].fillna(0)
    ).astype(int),
    'abort': lambda df: df['abort'].fillna(0).astype(int),
    # pregnancy is recomputed as birth + caesarean + abort whenever it is
    # missing or disagrees with that sum
    'pregnancy': lambda df: np.where(
        df['pregnancy'].isna()
            | ~df['pregnancy'].eq(df['birth'] + df['caesarean'] + df['abort']),
        df['birth'] + df['caesarean'] + df['abort'],
        df['pregnancy']
    ).astype(int),
    # utils for imputation
    'group_age': lambda df: np.select(
        [df['age'].between(start, start + 9) for start in AGE_BIN_STARTS],
        [f'{start} - {start + 9}' for start in AGE_BIN_STARTS],
        '100+'
    ),
    'group_years_from_diagnosis': lambda df: np.select(
        [df['years_from_diagnosis'].between(start, start + 4)
         for start in YEARS_BIN_STARTS],
        [f'{start} - {start + 4}' for start in YEARS_BIN_STARTS],
        '30+'
    ),
}

assign_transformer = AssignTransformer(assign_map)
assigned_data = assign_transformer.transform(data)

In [None]:
# Frequency of each (birth, abort, caesarean, pregnancy) combination after the
# fixes above, sorted by the combination itself to make odd rows easy to spot.
obstetric_columns = ['birth', 'abort', 'caesarean', 'pregnancy']
obstetric_counts = assigned_data[obstetric_columns].value_counts(dropna=False)
obstetric_counts.sort_index()

In [None]:
# Size of each age bucket (missing values counted as their own bucket).
group_age_counts = assigned_data['group_age'].value_counts(dropna=False)
group_age_counts

In [None]:
# Size of each years-from-diagnosis bucket (missing values counted as their own bucket).
group_years_counts = assigned_data['group_years_from_diagnosis'].value_counts(dropna=False)
group_years_counts

### Drop columns

In [None]:
# Inspect missing values in the frame produced by the assign step, using the
# project's `inspect_nulls` helper (imported from `functions`; presumably a
# per-column null summary -- verify against the helper).
inspect_nulls(assigned_data)

In [None]:
# Columns removed before imputation:
#   - birth_date, diagnosis_date, death_date, recurrence_year, menopause_age:
#     raw fields already consumed by the assign step (age,
#     years_from_diagnosis, is_dead, recurrence, menopause);
#   - side: more than 200 null values.
COLUMNS_TO_DROP = [
    'birth_date', 'diagnosis_date', 'death_date', 'recurrence_year',
    'menopause_age', 'side',
]


def _drop_unused_columns(df):
    """Return a copy of ``df`` without the columns listed in COLUMNS_TO_DROP."""
    return df.drop(columns=COLUMNS_TO_DROP)


drop_transformer = FunctionTransformer(func=_drop_unused_columns)
dropped_data = drop_transformer.transform(assigned_data)
dropped_data.head(20)

In [None]:
# Column dtypes and non-null counts of the frame that goes into imputation.
dropped_data.info()

### Impute null values

In [None]:
# NOTE: the variable is called `mean_imputer` for historical reasons, but the
# strategy is 'most_frequent': each null is replaced by the column's mode.
mean_imputer = SimpleImputer(strategy='most_frequent')

# fit_transform returns a bare NumPy array, while the plotting cells below
# address `imputed_data` by column name, so the result is wrapped back into a
# DataFrame with the original labels and dtypes.
imputed_values = mean_imputer.fit_transform(dropped_data)

# SimpleImputer discards features whose fitted statistic is NaN (columns with
# no observed value), so only the surviving columns are used as labels.
kept_columns = dropped_data.columns[~pd.isna(mean_imputer.statistics_)]

imputed_data = pd.DataFrame(
    imputed_values, columns=kept_columns, index=dropped_data.index
).astype(dropped_data.dtypes[kept_columns].to_dict())

In [None]:
# (rows, columns) of the frame that was passed to the imputer.
dropped_data.shape

In [None]:
# Columns that define a patient group. Declared here as well because the cell
# that defines GROUPS_INDEX comes later in the notebook, so this cell raised a
# NameError on a fresh Restart & Run All.
GROUPS_INDEX = ['group_years_from_diagnosis', 'group_age', 'menopause',
                'recurrence', 'hist_type']

# Most frequent er_positive per group: the first mode on ties, NaN when the
# group has no observed value at all.
er_positive_group_mode = (
    dropped_data.groupby(GROUPS_INDEX)['er_positive']
    .agg(lambda values: values.mode().iloc[0] if values.notna().any() else np.nan)
    .rename('er_positive_group_mode')
    .reset_index()
)

# A left merge against one row per group keeps exactly one output row per
# input row, in the original order. The previous inner merge dropped rows
# whose group had no mode and duplicated rows on tied modes, so the result no
# longer lined up with dropped_data['er_positive'].
er_positive_fill = (
    dropped_data[GROUPS_INDEX]
    .reset_index(drop=True)
    .merge(er_positive_group_mode, on=GROUPS_INDEX, how='left')
)['er_positive_group_mode']

# 1-D array: the observed value where present, otherwise the group's mode.
a = np.where(
    dropped_data['er_positive'].isna(),
    er_positive_fill.to_numpy(),
    dropped_data['er_positive'].to_numpy()
)

In [None]:
# `a` is the plain NumPy array returned by np.where in the previous cell, so
# it cannot be indexed with a column name; wrap it in a Series to count values.
pd.Series(a, name='er_positive').value_counts(dropna=False)

In [None]:
# Columns that define a patient group for group-wise imputation.
GROUPS_INDEX = ['group_years_from_diagnosis', 'group_age', 'menopause',
                'recurrence', 'hist_type']


def impute_with_group_mode(df, column, group_columns):
    """Fill nulls of ``column`` with the most frequent value of the row's group.

    Parameters
    ----------
    df : DataFrame holding ``column`` and every name in ``group_columns``.
    column : name of the column to impute.
    group_columns : list of column names that define the groups.

    Returns
    -------
    1-D NumPy array with one entry per row of ``df``, in row order: the
    original value where it is present, otherwise the group's mode (the first
    one when several values tie). A null stays null when its group has no
    observed value or when one of its group keys is missing.
    """
    def _first_mode(values):
        # Series.mode() ignores nulls and returns every tied value, sorted.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    group_modes = (
        df.groupby(group_columns)[column]
        .agg(_first_mode)
        .rename('_group_mode')
        .reset_index()
    )
    # Left merge against one row per group: the output keeps exactly one row
    # per input row, in the input order, so it lines up positionally with
    # df[column] (an inner merge would drop or duplicate rows).
    aligned_modes = (
        df[group_columns]
        .reset_index(drop=True)
        .merge(group_modes, on=group_columns, how='left')
    )['_group_mode']

    return np.where(df[column].isna(),
                    aligned_modes.to_numpy(),
                    df[column].to_numpy())


group_imputer = {
    'er_positive': lambda df: impute_with_group_mode(df, 'er_positive', GROUPS_INDEX),
}

# Wrap the group-wise imputation rules in the project's AssignTransformer and
# apply them to the frame obtained after dropping columns.
# NOTE(review): `a` is presumably a DataFrame here (the next cells index it by
# column name) -- confirm against AssignTransformer.transform.
imputer_transformer = AssignTransformer(group_imputer)
a = imputer_transformer.transform(dropped_data)

In [None]:
# Group keys of the rows whose er_positive is missing, i.e. the groups the
# imputation has to find a mode for.
missing_er_positive = dropped_data['er_positive'].isna()
dropped_data.loc[missing_er_positive, GROUPS_INDEX]

In [None]:
# Distribution of er_positive after group-wise imputation; any remaining
# nulls show up as their own bucket.
imputed_er_counts = a['er_positive'].value_counts(dropna=False)
imputed_er_counts

### Data Visualization

In [None]:
# One count plot per categorical feature, each split by neoadjuvant treatment.
# (column to plot, panel title)
COUNT_PANELS = [
    ('grade', 'Grade'),
    ('hist_type', 'Histological Type'),
    ('recurrence', 'Recurrence'),
]

fig, axes = plt.subplots(1, len(COUNT_PANELS), figsize=(14, 4))

for ax, (column, title) in zip(axes, COUNT_PANELS):
    sns.countplot(ax=ax, data=imputed_data, x=column, hue="neoadjuvant", palette="RdPu")
    ax.set(xlabel=None)  # the panel title already names the feature
    ax.set_title(title)

# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# Distribution of each numeric feature by recurrence status, with the two
# neoadjuvant levels drawn as the two halves of every violin.
# (column to plot, panel title)
VIOLIN_PANELS = [
    ('ki67', 'Distribution of ki67'),
    ('age', 'Distribution of Age'),
    ('years_from_diagnosis', 'Distribution of years_from_diagnosis'),
]

fig, axes = plt.subplots(1, len(VIOLIN_PANELS), figsize=(14, 4))

for ax, (column, title) in zip(axes, VIOLIN_PANELS):
    sns.violinplot(ax=ax, data=imputed_data, x="recurrence", y=column,
                   hue="neoadjuvant", split=True, palette="Paired")
    ax.set_title(title)

# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

### Map values and replace with most frequent

In [None]:
# Persist the preprocessed patient table to the staging directory.
# NOTE(review): `data_dum` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh Restart & Run All -- restore the cell that builds
# it (presumably a mapped / dummy-encoded version of the imputed data) or
# point this at the intended frame.
data_dum.to_parquet(STAGE_DIR / 'patient-preprocessed-v2.parquet')