In [None]:
import os
import pickle
import pandas as pd

import sys
sys.path.append('..')
import utils

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Constants and arguments
model_name = "Clinical-BigBird"    # tag embedded in the prediction file names
out_preds_path = "./model_preds"   # directory holding the pickled test predictions
data_dir = "../../data/tnm_stage"  # directory with the train/val/test CSV splits

# Data loading

In [None]:
# One independent encoder per staging axis (T, N, M); each is fitted on the
# training labels once the training split has been loaded.
t_label_enc, n_label_enc, m_label_enc = (LabelEncoder() for _ in range(3))

## Training

In [None]:
# Training split; the label encoders are fitted on its label columns.
train_csv_path = os.path.join(data_dir, "train_tcga_reports_tnm_stage.csv")
df_train = pd.read_csv(train_csv_path)

In [None]:
# Display (rows, columns) of the training split as a quick load check.
df_train.shape

(1947, 6)

In [None]:
# Learn the class vocabulary of each axis from the training labels only.
# LabelEncoder.fit returns the encoder itself, so fitting in place is
# equivalent to rebinding the name to the return value.
for encoder, label_col in (
    (t_label_enc, 't_label'),
    (n_label_enc, 'n_label'),
    (m_label_enc, 'm_label'),
):
    encoder.fit(df_train[label_col])

## Validation

In [None]:
# Validation split.
# NOTE(review): in the cells visible here df_val is only inspected via .shape.
val_csv_path = os.path.join(data_dir, "val_tcga_reports_tnm_stage.csv")
df_val = pd.read_csv(val_csv_path)

In [9]:
# Display (rows, columns) of the validation split as a quick load check.
df_val.shape

(780, 6)

## Test

In [None]:
# Test split; every metric below is computed against its label columns.
test_csv_path = os.path.join(data_dir, "test_tcga_reports_tnm_stage.csv")
df_test = pd.read_csv(test_csv_path)

In [None]:
# Display (rows, columns) of the test split as a quick load check.
df_test.shape

(1170, 6)

# Multi-task approach

## T subtask

In [None]:
# Load the pickled test-set predictions for the T subtask.
# NOTE: pickle.load can execute arbitrary code; only open prediction files
# produced by a trusted run.
t_preds_path = os.path.join(out_preds_path, f"t_label_{model_name}_test_preds.pkl")
with open(t_preds_path, 'rb') as file:
    t_test_preds = pickle.load(file)

In [None]:
# Take the arg-max over axis 1 of the predictions and map the resulting
# class indices back to labels with the fitted encoder.
# NOTE(review): assumes the prediction columns follow the order of
# t_label_enc.classes_ — confirm against the training code.
t_pred_class_idx = t_test_preds.argmax(axis=1)
arr_t_test_label_preds = t_label_enc.inverse_transform(t_pred_class_idx)

In [None]:
# Overall accuracy of the decoded T predictions against the test labels.
# (The column name is a plain string: the former f-prefix had no placeholders.)
accuracy_score(
    y_true=df_test['t_label'].values,
    y_pred=arr_t_test_label_preds
)

0.7957264957264957

In [None]:
# Per-class precision / recall / F1 table for the T subtask, produced by the
# project helper in utils.
# NOTE(review): df_data is the test split, so the helper's "n_val" output
# column holds test-set class counts here — consider renaming it in utils.
utils.calculate_performance(
    arr_gs=df_test['t_label'].values,
    arr_preds=arr_t_test_label_preds,
    arr_labels=t_label_enc.classes_,
    col_label="t_label",
    df_data=df_test,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,T1,0.753906,0.736641,0.745174,435,262
1,T2,0.799065,0.830097,0.814286,689,412
2,T3,0.814085,0.802778,0.808392,596,360
3,T4,0.816794,0.786765,0.801498,227,136


## N subtask

In [None]:
# Load the pickled test-set predictions for the N subtask.
# NOTE: pickle.load can execute arbitrary code; only open prediction files
# produced by a trusted run.
n_preds_path = os.path.join(out_preds_path, f"n_label_{model_name}_test_preds.pkl")
with open(n_preds_path, 'rb') as file:
    n_test_preds = pickle.load(file)

In [None]:
# Take the arg-max over axis 1 of the predictions and map the resulting
# class indices back to labels with the fitted encoder.
# NOTE(review): assumes the prediction columns follow the order of
# n_label_enc.classes_ — confirm against the training code.
n_pred_class_idx = n_test_preds.argmax(axis=1)
arr_n_test_label_preds = n_label_enc.inverse_transform(n_pred_class_idx)

In [None]:
# Overall accuracy of the decoded N predictions against the test labels.
# (The column name is a plain string: the former f-prefix had no placeholders.)
accuracy_score(
    y_true=df_test['n_label'].values,
    y_pred=arr_n_test_label_preds
)

0.8136752136752137

In [None]:
# Per-class precision / recall / F1 table for the N subtask, produced by the
# project helper in utils.
# NOTE(review): df_data is the test split, so the helper's "n_val" output
# column holds test-set class counts here — consider renaming it in utils.
utils.calculate_performance(
    arr_gs=df_test['n_label'].values,
    arr_preds=arr_n_test_label_preds,
    arr_labels=n_label_enc.classes_,
    col_label="n_label",
    df_data=df_test,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,N0,0.908836,0.954345,0.931034,1129,679
1,N1,0.694611,0.768212,0.72956,503,302
2,N2,0.588235,0.422535,0.491803,236,142
3,N3,0.571429,0.255319,0.352941,79,47


## M subtask

In [None]:
# Load the pickled test-set predictions for the M subtask.
# NOTE: pickle.load can execute arbitrary code; only open prediction files
# produced by a trusted run.
m_preds_path = os.path.join(out_preds_path, f"m_label_{model_name}_test_preds.pkl")
with open(m_preds_path, 'rb') as file:
    m_test_preds = pickle.load(file)

In [None]:
# Take the arg-max over axis 1 of the predictions and map the resulting
# class indices back to labels with the fitted encoder.
# NOTE(review): assumes the prediction columns follow the order of
# m_label_enc.classes_ — confirm against the training code.
m_pred_class_idx = m_test_preds.argmax(axis=1)
arr_m_test_label_preds = m_label_enc.inverse_transform(m_pred_class_idx)

In [22]:
# Overall accuracy of the decoded M predictions against the test labels.
# (The column name is a plain string: the former f-prefix had no placeholders.)
accuracy_score(
    y_true=df_test['m_label'].values,
    y_pred=arr_m_test_label_preds
)

0.9205128205128205

In [23]:
# Per-class precision / recall / F1 table for the M subtask, produced by the
# project helper in utils.
# NOTE(review): df_data is the test split, so the helper's "n_val" output
# column holds test-set class counts here — consider renaming it in utils.
utils.calculate_performance(
    arr_gs=df_test['m_label'].values,
    arr_preds=arr_m_test_label_preds,
    arr_labels=m_label_enc.classes_,
    col_label="m_label",
    df_data=df_test,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,M0,0.945778,0.970803,0.958127,1821,1096
1,M1,0.288889,0.175676,0.218487,126,74
