In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
nltk.download('punkt_tab')  # This line is only needed the first time you run the code
nltk.download('stopwords') # This line is only needed the first time you run the code

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Utils

In [2]:
arr_stopwords = nltk.corpus.stopwords.words('english')

In [3]:
def tokenizer(text):
    """
    Tokenizes the input text (lowercased) using NLTK's word_tokenize function and removes
    stopwords, punctuation, tokens containing non-alphabetic characters, and tokens with length 1.
    
    Args:
        text (str): The input text to tokenize.
        
    Returns:
        list: A list of tokens.
    """
    arr_tokens = word_tokenize(text.lower())
    # Remove tokens with length 1
    arr_tokens = [token for token in arr_tokens if len(token) > 1]
    # Remove stopwords
    arr_tokens = [token for token in arr_tokens if token not in arr_stopwords]
    # Remove punctuation and tokens containing non-alphabetic characters
    arr_tokens = [token for token in arr_tokens if token.isalpha()]
    return arr_tokens

In [4]:
def calculate_performance(arr_gs, arr_preds, arr_labels, col_label, df_data, df_train_data):
    dict_perf = {
        'precision': precision_score(
            y_true=arr_gs,
            y_pred=arr_preds,
            average=None,
            labels=arr_labels,
            zero_division=0.0
        ),
        'recall': recall_score(
            y_true=arr_gs,
            y_pred=arr_preds,
            average=None,
            labels=arr_labels,
            zero_division=0.0
        ),
        'f1': f1_score(
            y_true=arr_gs,
            y_pred=arr_preds,
            average=None,
            labels=arr_labels,
            zero_division=0.0
        )
    }
    arr_res = []
    for i in range(len(arr_labels)):
        label = arr_labels[i]
        p, r, f1 = dict_perf['precision'][i], dict_perf['recall'][i], dict_perf['f1'][i]
        n_train = len(df_train_data[df_train_data[col_label] == label])
        n_eval = len(df_data[df_data[col_label] == label])
        arr_res.append({
            "label": label,
            "precision": p,
            "recall": r,
            "f1": f1,
            "n_train": n_train,
            "n_val": n_eval
        })
    df_perf = pd.DataFrame(arr_res)

    df_perf.sort_values(
        by=["f1", "recall", "precision"],
        ascending=True
    )

    return df_perf

# Data loading

In [5]:
data_dir = "../../data/tnm_stage"

## Training

In [6]:
df_train = pd.read_csv(
    os.path.join(data_dir, "train_tcga_reports_tnm_stage.csv"),
    sep=',',
    header=0
)

In [7]:
df_train.shape

(1947, 6)

In [8]:
df_train.head()

Unnamed: 0,patient_id,text,t_label,n_label,m_label,tnm_label
0,TCGA-V7-A7HQ,SurgicalFinal Report. l'emporary. orderediby. ...,T1,N2,M0,T1N2M0
1,TCGA-39-5021,Clinical Diagnosis & History: Left upper lung ...,T2,N0,M0,T2N0M0
2,TCGA-B6-A0RP,Surgica. CLINICAL HISTORY: Biopsy of right and...,T2,N1,M0,T2N1M0
3,TCGA-VR-A8EQ,PRIMARY SITE: Esophagus. I-) Product of esopha...,T3,N1,M0,T3N1M0
4,TCGA-BJ-A2NA,ADDENDA: Addendum. MOLECULAR ANATOMIC PATHOLOG...,T3,N0,M0,T3N0M0


In [9]:
arr_train_corpus = df_train["text"].values.tolist()

## Validation

In [10]:
df_val = pd.read_csv(
    os.path.join(data_dir, "val_tcga_reports_tnm_stage.csv"),
    sep=',',
    header=0
)

In [11]:
df_val.shape

(780, 6)

In [12]:
df_val.head()

Unnamed: 0,patient_id,text,t_label,n_label,m_label,tnm_label
0,TCGA-AA-A01G,Sample ID #. 1. Diagnosis: Right hemicolectomy...,T3,N0,M0,T3N0M0
1,TCGA-AA-3842,Diagnosis: This is an adenocarcinoma of the co...,T2,N1,M0,T2N1M0
2,TCGA-BP-4332,Clinical Diagnosis & History: ith left upper p...,T3,N0,M0,T3N0M0
3,TCGA-BJ-A45F,"FINAL DIAGNOSIS: PART 1: THYROID, TOTAL THYROI...",T1,N0,M0,T1N0M0
4,TCGA-IB-A6UF,CONFIDENTIAL. Demographics (for. verification ...,T3,N1,M0,T3N1M0


In [13]:
arr_val_corpus = df_val["text"].values.tolist()

# Multiclass

In [14]:
arr_train_labels = df_train["tnm_label"].values.tolist()

In [15]:
arr_val_labels = df_val["tnm_label"].values.tolist()

## Logistic Regression

In [16]:
lr_pipeline = Pipeline([
    (
        "bow", 
        CountVectorizer(
            tokenizer=tokenizer,
            token_pattern=None,
            lowercase=False,
            stop_words=None
        )
    ),
    (
        "tf-idf",
        TfidfTransformer(
            use_idf=True
        )
    ),
    (
        "classifier",
        LogisticRegression(
            random_state=0
        )
    )
])

In [17]:
lr_pipeline.fit(arr_train_corpus, arr_train_labels)

### Evaluation

#### Train

In [18]:
arr_train_pred = lr_pipeline.predict(arr_train_corpus)

In [19]:
accuracy_score(
    y_true=arr_train_labels,
    y_pred=arr_train_pred
)

0.5439137134052389

In [20]:
print(classification_report(
    y_true=arr_train_labels,
    y_pred=arr_train_pred
))

              precision    recall  f1-score   support

      T1N0M0       0.60      0.82      0.69       345
      T1N0M1       0.00      0.00      0.00         4
      T1N1M0       1.00      0.02      0.03        66
      T1N1M1       0.00      0.00      0.00         3
      T1N2M0       0.00      0.00      0.00        16
      T1N3M0       0.00      0.00      0.00         1
      T2N0M0       0.50      0.82      0.62       389
      T2N0M1       0.00      0.00      0.00        11
      T2N1M0       0.51      0.54      0.52       186
      T2N1M1       0.00      0.00      0.00         6
      T2N2M0       1.00      0.03      0.05        71
      T2N2M1       0.00      0.00      0.00         4
      T2N3M0       0.00      0.00      0.00        22
      T3N0M0       0.50      0.69      0.58       272
      T3N0M1       0.00      0.00      0.00        18
      T3N1M0       0.58      0.55      0.56       166
      T3N1M1       0.00      0.00      0.00        23
      T3N2M0       0.83    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Val

In [21]:
arr_val_pred = lr_pipeline.predict(arr_val_corpus)

In [22]:
accuracy_score(
    y_true=arr_val_labels,
    y_pred=arr_val_pred
)

0.3525641025641026

In [23]:
calculate_performance(
    arr_gs=arr_val_labels,
    arr_preds=arr_val_pred,
    arr_labels=lr_pipeline.classes_,
    col_label="tnm_label",
    df_data=df_val,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,T1N0M0,0.43128,0.654676,0.52,345,139
1,T1N0M1,0.0,0.0,0.0,4,1
2,T1N1M0,0.0,0.0,0.0,66,27
3,T1N1M1,0.0,0.0,0.0,3,1
4,T1N2M0,0.0,0.0,0.0,16,6
5,T1N3M0,0.0,0.0,0.0,1,1
6,T2N0M0,0.301418,0.544872,0.388128,389,156
7,T2N0M1,0.0,0.0,0.0,11,4
8,T2N1M0,0.264706,0.243243,0.253521,186,74
9,T2N1M1,0.0,0.0,0.0,6,2


# Multi-task

# T

In [24]:
arr_t_train_labels = df_train["t_label"].values.tolist()

In [25]:
arr_t_val_labels = df_val["t_label"].values.tolist()

## Logistic Regression

In [26]:
lr_t_pipeline = Pipeline([
    (
        "bow", 
        CountVectorizer(
            tokenizer=tokenizer,
            token_pattern=None,
            lowercase=False,
            stop_words=None
        )
    ),
    (
        "tf-idf",
        TfidfTransformer(
            use_idf=True
        )
    ),
    (
        "classifier",
        LogisticRegression(
            random_state=0
        )
    )
])

In [27]:
lr_t_pipeline.fit(arr_train_corpus, arr_t_train_labels)

### Evaluation

#### Train

In [28]:
arr_t_train_pred = lr_t_pipeline.predict(arr_train_corpus)

In [29]:
accuracy_score(
    y_true=arr_t_train_labels,
    y_pred=arr_t_train_pred
)

0.7493579866461222

In [30]:
print(classification_report(
    y_true=arr_t_train_labels,
    y_pred=arr_t_train_pred
))

              precision    recall  f1-score   support

          T1       0.79      0.62      0.69       435
          T2       0.72      0.86      0.78       689
          T3       0.74      0.81      0.77       596
          T4       0.89      0.51      0.65       227

    accuracy                           0.75      1947
   macro avg       0.78      0.70      0.72      1947
weighted avg       0.76      0.75      0.74      1947



#### Val

In [31]:
arr_t_val_pred = lr_t_pipeline.predict(arr_val_corpus)

In [32]:
accuracy_score(
    y_true=arr_t_val_labels,
    y_pred=arr_t_val_pred
)

0.5717948717948718

In [33]:
calculate_performance(
    arr_gs=arr_t_val_labels,
    arr_preds=arr_t_val_pred,
    arr_labels=lr_t_pipeline.classes_,
    col_label="t_label",
    df_data=df_val,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,T1,0.517241,0.428571,0.46875,435,175
1,T2,0.55287,0.665455,0.60396,689,275
2,T3,0.6,0.65,0.624,596,240
3,T4,0.727273,0.355556,0.477612,227,90


# N

In [34]:
arr_n_train_labels = df_train["n_label"].values.tolist()

In [35]:
arr_n_val_labels = df_val["n_label"].values.tolist()

## Logistic Regression

In [36]:
lr_n_pipeline = Pipeline([
    (
        "bow", 
        CountVectorizer(
            tokenizer=tokenizer,
            token_pattern=None,
            lowercase=False,
            stop_words=None
        )
    ),
    (
        "tf-idf",
        TfidfTransformer(
            use_idf=True
        )
    ),
    (
        "classifier",
        LogisticRegression(
            random_state=0
        )
    )
])

In [37]:
lr_n_pipeline.fit(arr_train_corpus, arr_n_train_labels)

### Evaluation

#### Train

In [38]:
arr_n_train_pred = lr_n_pipeline.predict(arr_train_corpus)

In [39]:
accuracy_score(
    y_true=arr_n_train_labels,
    y_pred=arr_n_train_pred
)

0.7740112994350282

In [40]:
print(classification_report(
    y_true=arr_n_train_labels,
    y_pred=arr_n_train_pred
))

              precision    recall  f1-score   support

          N0       0.77      0.98      0.86      1129
          N1       0.77      0.62      0.69       503
          N2       0.91      0.26      0.41       236
          N3       0.81      0.32      0.45        79

    accuracy                           0.77      1947
   macro avg       0.81      0.55      0.60      1947
weighted avg       0.79      0.77      0.75      1947



#### Val

In [41]:
arr_n_val_pred = lr_n_pipeline.predict(arr_val_corpus)

In [42]:
accuracy_score(
    y_true=arr_n_val_labels,
    y_pred=arr_n_val_pred
)

0.6551282051282051

In [58]:
calculate_performance(
    arr_gs=arr_n_val_labels,
    arr_preds=arr_n_val_pred,
    arr_labels=lr_n_pipeline.classes_,
    col_label="n_label",
    df_data=df_val,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,N0,0.690209,0.949227,0.799257,1129,453
1,N1,0.553846,0.358209,0.435045,503,201
2,N2,0.411765,0.074468,0.126126,236,94
3,N3,0.2,0.0625,0.095238,79,32


# M

In [44]:
arr_m_train_labels = df_train["m_label"].values.tolist()

In [45]:
arr_m_val_labels = df_val["m_label"].values.tolist()

## Logistic Regression

In [46]:
lr_m_pipeline = Pipeline([
    (
        "bow", 
        CountVectorizer(
            tokenizer=tokenizer,
            token_pattern=None,
            lowercase=False,
            stop_words=None
        )
    ),
    (
        "tf-idf",
        TfidfTransformer(
            use_idf=True
        )
    ),
    (
        "classifier",
        LogisticRegression(
            random_state=0
        )
    )
])

In [47]:
lr_m_pipeline.fit(arr_train_corpus, arr_m_train_labels)

### Evaluation

#### Train

In [48]:
arr_m_train_pred = lr_m_pipeline.predict(arr_train_corpus)

In [49]:
accuracy_score(
    y_true=arr_m_train_labels,
    y_pred=arr_m_train_pred
)

0.9363122752953261

In [50]:
print(classification_report(
    y_true=arr_m_train_labels,
    y_pred=arr_m_train_pred
))

              precision    recall  f1-score   support

          M0       0.94      1.00      0.97      1821
          M1       1.00      0.02      0.03       126

    accuracy                           0.94      1947
   macro avg       0.97      0.51      0.50      1947
weighted avg       0.94      0.94      0.91      1947



#### Val

In [51]:
arr_m_val_pred = lr_m_pipeline.predict(arr_val_corpus)

In [52]:
accuracy_score(
    y_true=arr_m_val_labels,
    y_pred=arr_m_val_pred
)

0.9371794871794872

In [53]:
calculate_performance(
    arr_gs=arr_m_val_labels,
    arr_preds=arr_m_val_pred,
    arr_labels=lr_m_pipeline.classes_,
    col_label="m_label",
    df_data=df_val,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,M0,0.937179,1.0,0.967571,1821,731
1,M1,0.0,0.0,0.0,126,49


# TNM staging
We finally compare the performance of the Multi-task vs the Multiclass approach for the TNM staging classification problem:

In [61]:
arr_tnm_train_pred = arr_t_train_pred + arr_n_train_pred + arr_m_train_pred
arr_tnm_val_pred = arr_t_val_pred + arr_n_val_pred + arr_m_val_pred

# Train

In [None]:
# Multiclass
accuracy_score(
    y_true=arr_train_labels,
    y_pred=arr_train_pred
)

0.5439137134052389

In [None]:
# Multitask
accuracy_score(
    y_true=arr_train_labels,
    y_pred=arr_tnm_train_pred
)

0.5480225988700564

# Val

In [65]:
# Multiclass
accuracy_score(
    y_true=arr_val_labels,
    y_pred=arr_val_pred
)

0.3525641025641026

In [66]:
# Multitask
accuracy_score(
    y_true=arr_val_labels,
    y_pred=arr_tnm_val_pred
)

0.3474358974358974