In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [159]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from paths import STAGE_DIR, ANALYSIS_DIR
from display import cdisplay, rdisplay

# Load data

In [None]:
# Read the two preprocessed staging tables (patient-level and TNM) from STAGE_DIR.
patient_path = STAGE_DIR / 'patient-preprocessed-v3.parquet'
tnm_path = STAGE_DIR / 'tnm-preprocessed-v1.parquet'

patient_df = pd.read_parquet(patient_path)
tnm_df = pd.read_parquet(tnm_path)

In [None]:
# Report the (rows, columns) shape of each table; the `=` f-string specifier
# prints the expression text next to its value.
print(f'''
{patient_df.shape=}
{tnm_df.shape=}
''')

In [None]:
# Preview the first rows of the patient table via the project's display helper.
cdisplay(patient_df.head())

In [None]:
# Preview the first rows of the TNM table via the project's display helper.
cdisplay(tnm_df.head())

In [None]:
# `ehr` values present in the patient table but absent from the TNM table.
patient_ehr = set(patient_df['ehr'])
patient_ehr.difference(tnm_df['ehr'].unique())

In [None]:
# `ehr` values present in the TNM table but absent from the patient table.
tnm_ehr = set(tnm_df['ehr'])
tnm_ehr.difference(patient_df['ehr'])

# Merge datasets

Because each dataset contains some patients that do not appear in the other, the merge keeps only the patients (`ehr` values) present in both datasets (inner join). The aim is to fill the NaN observation in the `neoadjuvant` column using the treatment information, and the NaN values in `invasive` using the information gathered in the columns `t`, `n` and `m`.

In [None]:
# Inner join on `ehr`: only rows whose `ehr` exists in both tables are kept.
data = patient_df.merge(tnm_df, how='inner', on='ehr')

In [None]:
# Preview the first rows of the merged table.
cdisplay(data.head())

In [None]:
# (rows, columns) of the merged table.
data.shape

In [None]:
# Column dtypes and non-null counts of the merged table.
data.info()

In [None]:
# Number of merged rows per `ehr` value (NaN counted as well); a count above 1
# means that `ehr` appears on several merged rows.
data['ehr'].value_counts(dropna=False)

# Preprocess

## Solve `neoadjuvant` null value

Patient 736 has no information about the tumors after neoadjuvant treatment, and the `neoadjuvant` column is null for this observation. We therefore assume that the correct `neoadjuvant` value is 0.

In [None]:
# Show the merged rows whose `neoadjuvant` value is missing.
cdisplay(data[data['neoadjuvant'].isna()])

In [None]:
# Missing `neoadjuvant` is treated as 0, then the column is cast to integer.
neoadjuvant_filled = data['neoadjuvant'].fillna(0)
data['neoadjuvant'] = neoadjuvant_filled.astype(int)

In [None]:
# Distribution of `neoadjuvant` after the fill, with NaN counted (dropna=False).
data['neoadjuvant'].value_counts(dropna=False)

## Solve `invasive` nulls

In [None]:
# Distribution of `invasive`, with NaN counted (dropna=False).
data['invasive'].value_counts(dropna=False)

The description of the `invasive` column provided by the teacher states that it "indicates whether the tumor is invasive or not. If it is not invasive, then it is "in situ"."

According to this, if a tumor is considered invasive (`invasive` = 1), then the variable `t` shouldn't be classified as `TIS` (meaning *in situ*). We propose changing the value of the column `invasive` to 0 for those observations.

In [None]:
# Combinations of `t` / `t_after_neoadj` among the rows flagged invasive = 1.
flagged_invasive = data['invasive'].eq(1)
(
    data.loc[flagged_invasive, ['t', 't_after_neoadj']]
    .value_counts(dropna=False)
    .sort_index()
)

The null values of the `invasive` feature correspond to three cases:
* In situ primary tumors (`t = TIS`) => `invasive = 0`
* Primary tumor classification in `T1`, `T2`, `T3` or `T4` => `invasive = 1`
* Observations where primary tumor classification is `T0` (no evidence of primary tumor) or `TX` (primary tumor cannot be assessed), but the illness spread through the organism (`M = M1`) => `invasive = 1`

In [None]:
# Staging combinations (before and after neoadjuvant) for rows with missing `invasive`.
invasive_missing = data['invasive'].isna()
tnm_columns = ['t', 'n', 'm', 't_after_neoadj', 'n_after_neoadj', 'm_after_neoadj']
(
    data.loc[invasive_missing, tnm_columns]
    .value_counts(dropna=False)
    .sort_index()
)

In [None]:
# Conditions are evaluated in order, first match wins:
#   1. `t` is TIS                      -> 0 (overrides any existing value)
#   2. `invasive` missing, `t` not TIS -> 1
#   otherwise the existing `invasive` value is kept.
is_in_situ = data['t'].eq('TIS')
missing_and_not_in_situ = data['invasive'].isna() & ~is_in_situ

data['invasive'] = np.select(
    condlist=[is_in_situ, missing_and_not_in_situ],
    choicelist=[0, 1],
    default=data['invasive'],
)

## Analyze `neoadjuvant` vs treatment columns

If a treatment isn't applied to a certain person, then it makes sense to have nulls in the treatment columns. These are **nulls by design** in the experiment, so we will create a new category for these cases to differentiate them from the rest of the nulls. We propose `TN`, `NN` and `MN` for the columns `t_after_neoadj`, `n_after_neoadj` and `m_after_neoadj` respectively.

In [None]:
# Cross-tabulate `neoadjuvant` against the three post-treatment staging columns.
treatment_columns = [
    'neoadjuvant', 't_after_neoadj', 'n_after_neoadj', 'm_after_neoadj']
(
    data[treatment_columns]
    .value_counts(dropna=False)
    .sort_index()
)

As we can see, there is an observation that didn't receive treatment but has information about the tumors after it. For this observation there was no evidence of any tumor before the "treatment", so the treatment could not be justified. In this case, we decided to change the post-treatment values to the null-by-design category.

In [None]:
# Rows with neoadjuvant = 0 that nevertheless carry a post-treatment `T2`.
untreated_with_t2 = data['neoadjuvant'].eq(0) & data['t_after_neoadj'].eq('T2')
staging_columns = ['t', 'n', 'm', 't_after_neoadj', 'n_after_neoadj', 'm_after_neoadj']
data.loc[untreated_with_t2, staging_columns]

In [None]:
# Summary statistics of the merged table, transposed so each described column is a row.
data.describe().T

In [None]:
# Rows with neoadjuvant = 0 get a dedicated "null by design" label in every
# post-treatment column; all other rows keep their current value.
not_treated = data['neoadjuvant'].eq(0)
by_design_label = {
    't_after_neoadj': 'TN',
    'n_after_neoadj': 'NN',
    'm_after_neoadj': 'MN',
}
for after_column, label in by_design_label.items():
    data[after_column] = np.where(not_treated, label, data[after_column])

In [None]:
# Re-check `neoadjuvant` against the post-treatment columns after the relabelling.
checked_columns = ['neoadjuvant', 't_after_neoadj', 'n_after_neoadj', 'm_after_neoadj']
(
    data[checked_columns]
    .value_counts(dropna=False)
    .sort_index()
)

## Analyze pre-treatment vs post-treatment variables

In [None]:
# Independent copy of the rows with neoadjuvant = 1, used for the pre/post comparison.
received_neoadjuvant = data['neoadjuvant'].eq(1)
data_vs = data.loc[received_neoadjuvant].copy()

In [None]:
# One panel per staging variable (t, n, m): pre-treatment value on the x axis,
# post-treatment value on the y axis, for the rows in `data_vs`.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

fig.suptitle('Pre-treatment vs post-treatment variables')

for i, col in enumerate(['t', 'n', 'm']):
    sns.stripplot(
        ax=axes[i],
        data=data_vs,
        x=col,
        y=f'{col}_after_neoadj',
        # `color` is stripplot's documented single-color parameter. The raw
        # matplotlib `c` keyword is not part of its signature and, depending on
        # the seaborn version, is overridden or clashes with the color seaborn
        # forwards to scatter itself.
        color='orange')
    axes[i].set_title(f'{col.upper()} before and after treatment')
    axes[i].set_xlabel(f'Pre-treatment {col.upper()} value')
    axes[i].set_ylabel(f'Post-treatment {col.upper()} value')

The `T9`, `N9` and `M9` categories correspond to null values in the columns. Given that we have already handled the nulls by design in the post-treatment columns, the remaining nulls could have several causes, including death before the end of the treatment. We don't have the information needed to tell these cases apart, so we will assume that any null value still present in the columns means that the tumor could not be assessed (`X` category).

In [None]:
# Pre- vs post-treatment M combinations among the rows in `data_vs`.
(
    data_vs[['m', 'm_after_neoadj']]
    .value_counts(dropna=False)
    .sort_index()
)

In [None]:
# Pre- vs post-treatment N combinations among the rows in `data_vs`.
(
    data_vs[['n', 'n_after_neoadj']]
    .value_counts(dropna=False)
    .sort_index()
)

In [None]:
# Pre- vs post-treatment T combinations among the rows in `data_vs`.
(
    data_vs[['t', 't_after_neoadj']]
    .value_counts(dropna=False)
    .sort_index()
)

In [None]:
# Recode the `9` placeholder of each staging variable to its `X` category,
# in both the pre- and post-treatment column.
recodings = (
    (['t', 't_after_neoadj'], 'T9', 'TX'),
    (['n', 'n_after_neoadj'], 'N9', 'NX'),
    (['m', 'm_after_neoadj'], 'M9', 'MX'),
)
for stage_columns, placeholder, x_category in recodings:
    data[stage_columns] = data[stage_columns].replace(placeholder, x_category)

# Split datasets

The merged dataset was useful to analyze the relations between the treatment variables and the tumor information held in the two datasets. However, we recommend keeping the datasets separate and using them according to the specific problem, which may need one dataset, the other, or both. This decision is also based on the fact that each dataset contains patients that the other does not.

In [None]:
# Rebuild one table per source: processed rows come from `data`, and rows whose
# `ehr` is not in the inner merge are appended from the original table.
merged_ehr = data['ehr'].unique()

# The merged frame can repeat a patient's columns once per matching TNM row,
# hence drop_duplicates(). `ignore_index=True` / `reset_index(drop=True)` give a
# clean 0..n-1 index instead of stacking the (possibly overlapping) labels of
# `data` and `patient_df`, which would otherwise be carried into later steps.
# NOTE(review): if the row-wise `invasive` fix produced different values for the
# same `ehr`, that patient survives as several rows -- TODO confirm `ehr` is unique.
patient_preprocessed_df = (
    pd.concat(
        [data[patient_df.columns],
         patient_df[~patient_df['ehr'].isin(merged_ehr)]],
        ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# NOTE(review): the rows appended from `tnm_df` did not go through the recoding
# applied to `data` (by-design labels, `T9`/`N9`/`M9` -> `X`) -- confirm this is intended.
tnm_preprocessed_df = pd.concat(
    [data[tnm_df.columns],
     tnm_df[~tnm_df['ehr'].isin(merged_ehr)]],
    ignore_index=True)

In [None]:
# Column dtypes and non-null counts of the rebuilt patient table.
patient_preprocessed_df.info()

The column `invasive` still has 2 nulls. These correspond to patients that weren't in the treatment dataset, and for that reason they weren't analyzed in the merged dataset. We are going to impute them with the mode.

In [None]:
# Fill the remaining missing `invasive` values with the column's most frequent
# value (the first entry returned by mode()).
invasive_mode = patient_preprocessed_df['invasive'].mode().iloc[0]
patient_preprocessed_df['invasive'] = (
    patient_preprocessed_df['invasive'].fillna(invasive_mode)
)

In [None]:
# Column dtypes and non-null counts of the rebuilt TNM table.
tnm_preprocessed_df.info()

# Save datasets

In [160]:
# Write both preprocessed tables as CSV files under ANALYSIS_DIR.
# to_csv is called with its defaults, so the frame index is written as the first column.
outputs = (
    (patient_preprocessed_df, 'patient-dataset-v1.csv'),
    (tnm_preprocessed_df, 'tnm-dataset-v1.csv'),
)
for frame, filename in outputs:
    frame.to_csv(ANALYSIS_DIR / filename)