In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from paths import RAW_DIR, STAGE_DIR

### Import the data

In [None]:
# Load the two raw TNM extracts from RAW_DIR.
tnm1, tnm2 = (
    pd.read_csv(RAW_DIR / csv_name)
    for csv_name in ("breast_cancer_data_tnm.csv", "breast_cancer_data_tnm_2.csv")
)

In [None]:
# Stack the two TNM extracts into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels from read_csv, so the same label would
# appear twice in the combined frame.
data_tnm = pd.concat([tnm1, tnm2], ignore_index=True)
data_tnm

In [None]:
# Print the distinct values of every object-dtype column.
cat = data_tnm.dtypes.eq(object)
cat_cols = list(data_tnm.columns[cat])

for column in cat_cols:
    distinct_values = data_tnm[column].unique()
    print(column, distinct_values)

### NULL values

In [None]:
# Missing-value count per column, followed by the dtype / non-null summary.
null_counts = data_tnm.isna().sum()
print(null_counts)
data_tnm.info()

In [None]:
def _prefix_stage_codes(codes, prefix):
    """Return ``codes`` as strings carrying the TNM ``prefix`` ('T', 'N' or 'M').

    Missing entries are filled with 9 before conversion, and a trailing
    ``.0`` left by float-to-string conversion is dropped (``1.0`` -> ``'1'``).
    Values that already start with ``prefix`` are returned unchanged, so
    re-running the cell does not produce codes such as ``'TT1'``.

    Parameters
    ----------
    codes : pandas.Series
        One stage column of the TNM frame.
    prefix : str
        Letter to prepend to every code.

    Returns
    -------
    pandas.Series
        String codes with the prefix applied, same index as ``codes``.
    """
    as_text = (codes
        .fillna(9)
        .astype(str)
        # Raw string avoids the invalid '\.' escape; the '$' anchor limits the
        # substitution to a trailing '.0' so e.g. '1.05' is not turned into '15'.
        .replace(r'\.0$', '', regex=True))
    already_prefixed = as_text.str.startswith(prefix)
    return as_text.where(already_prefixed, prefix + as_text)


# Apply the same formatting to each component, before and after neoadjuvant
# treatment, instead of repeating the chain six times.
for stage_prefix, base_column in (('T', 't'), ('N', 'n'), ('M', 'm')):
    for stage_column in (base_column, f'{base_column}_after_neoadj'):
        data_tnm[stage_column] = _prefix_stage_codes(data_tnm[stage_column], stage_prefix)

In [None]:
# The same `ehr` identifier can appear on several rows (original note: users
# can be duplicated for different tumor types). Show every row whose `ehr`
# occurs more than once.
is_repeated_ehr = data_tnm['ehr'].duplicated(keep=False)
data_tnm[is_repeated_ehr]

In [None]:
# Count (t, t_after_neoadj) combinations for rows whose `t` is "TIS"; NaN combinations are kept.
data_tnm.loc[data_tnm['t'] == "TIS", ['t', 't_after_neoadj']].value_counts(dropna=False)

In [None]:
# Number of distinct `ehr` identifiers.
data_tnm['ehr'].unique().size

In [None]:
# Frequency of each `t` code.
data_tnm.t.value_counts()

In [None]:
# Frequency of each `t_after_neoadj` code.
data_tnm.t_after_neoadj.value_counts()

In [None]:
# For each TNM component, keep only the rows whose code is not 'X' both
# before and after neoadjuvant treatment.
# NOTE(review): missing values were filled with 9 earlier in the notebook, so
# rows coded T9/N9/M9 still pass these filters — confirm that is intended.
t_known = data_tnm['t'].ne("TX") & data_tnm['t_after_neoadj'].ne("TX")
n_known = data_tnm['n'].ne("NX") & data_tnm['n_after_neoadj'].ne("NX")
m_known = data_tnm['m'].ne("MX") & data_tnm['m_after_neoadj'].ne("MX")

t_before_after = data_tnm.loc[t_known, ['t', 't_after_neoadj']]
n_before_after = data_tnm.loc[n_known, ['n', 'n_after_neoadj']]
m_before_after = data_tnm.loc[m_known, ['m', 'm_after_neoadj']]

# Row counts retained per component.
print('T', len(t_before_after), 'N', len(n_before_after), 'M', len(m_before_after))

In [None]:
# Compare stage distributions before and after neoadjuvant treatment,
# one panel per TNM component.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

panels = (
    ('T', t_before_after),
    ('N', n_before_after),
    ('M', m_before_after),
)
for ax, (component, before_after) in zip(axes, panels):
    # melt() stacks the two columns into long form: 'variable' holds the
    # source column name and 'value' the stage code, so hue splits the bars
    # into the before and after columns.
    sns.countplot(ax=ax, data=pd.melt(before_after), x='value', hue='variable')
    ax.set(xlabel='Stage', ylabel='Patients', title=component)

# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# Persist the formatted TNM frame to the staging directory.
output_path = STAGE_DIR / 'tnm-preprocessed-v1.parquet'
data_tnm.to_parquet(output_path)