# Automated Grading of Prostate Cancer  
### Using Convolutional Neural Network and Ordinal Classifier  

**Authors:** Abraham, Bejoy, and Nair, Madhu S.  
**Python Implementation:** Adapted by Edward Sandoval  

---

## Objective
This notebook aims to implement the ordinal classification approach described in the cited paper, which is built on the **C4.5 decision-tree algorithm** (available in Weka as **J48**), and to apply it to the **PICAI dataset** to classify prostate lesion malignancy into three ordinal classes.  

---

## Malignancy Class Mapping
The following class mappings are applied to group ISUP grades into ordinal malignancy levels:  

| **ISUP Grades** | **Malignancy Class** |  
|------------------|-----------------------|  
| 0, 1            | Class 0              |  
| 2, 3            | Class 1              |  
| 4, 5            | Class 2              |  

---

## Note
To ensure proper execution and compatibility with all dependencies, **it is recommended to run this notebook within the container generated from the `compose_weka.yml` file in the `container` folder**. This ensures the environment aligns with the requirements of the implemented algorithms.


In [1]:
import os
os.chdir('/app/')
import json
import pandas as pd
import numpy as np
import torch

# Root folder of the patch dataset. Kept as a plain string with a trailing
# slash because the rest of the notebook builds paths with `+` concatenation.
DATA_PATH = '/Datasets/PICAI_64x64_patches/'

# Per-patch metadata, indexed by patch id (get_label reads `case_ISUP` from it).
# The context manager closes the file handle as soon as the JSON is parsed.
with open(DATA_PATH + '8x64x64-CIspheres.json') as fh:
    info = json.load(fh)

# Cross-validation splits, looked up by `train_fold_<k>` / `val_fold_<k>` keys.
with open(DATA_PATH + 'picai_patches_splits_5kf.json') as fh:
    folds_idxs = json.load(fh)


In [2]:
# Load a single patch to check the on-disk layout and the intensity range.
sample_path = DATA_PATH + 'patches/10144_1000146_000.npy'
test_data = np.load(sample_path)
print(test_data.shape)
lowest, highest = np.min(test_data), np.max(test_data)
print(lowest, highest)

(3, 8, 64, 64)
0.0 0.7118942737579346


In [3]:
import torch.nn.functional as F

def get_label(idx, type='bin1'):
    """
    Return the label of patch `idx` under the requested labelling scheme.

    The ISUP grade is read from `info[idx]['case_ISUP']` and then grouped:
      - 'multi1': grades 0-1 -> 0, grades 2-3 -> 1, grades 4-5 -> 2
      - 'bin1':   grades 0-1 -> 0, grades 2-5 -> 1
      - 'multi2' or any other value: the ISUP grade is returned unchanged
    """
    isup = info[idx]['case_ISUP']

    if type == 'multi1':
        # Three ordinal malignancy classes.
        return {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2}[isup]

    if type == 'bin1':
        # Binary split between grades 0-1 and grades 2-5.
        return {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1}[isup]

    # 'multi2' (and anything unrecognised) keeps the raw grade.
    return isup

def preprocess_section(patch):
    """
    Prepare a multi-modal patch for a 2D image network.

    Receives a patch of dimensions (3, 8, 64, 64), referring to modalities,
    slices, height and width. Slices 3, 4 and 5 are kept and every slice is
    resized to 224x224 with bilinear interpolation.

    Args:
        patch: array-like (NumPy array or tensor) with four dimensions.

    Returns:
        torch.Tensor: float32 tensor; for a (3, 8, 64, 64) input the shape is
        (3, 3, 224, 224). Because `F.interpolate` treats a 4D input as
        (batch, channels, height, width), the modality axis ends up in the
        batch position and the three kept slices in the channel position.
    """
    # Keep only the three selected slices before any further work.
    section = torch.as_tensor(patch[:, 3:6, :, :])

    # Cast to float32 *before* resizing: bilinear interpolation rejects
    # integer tensors such as int64, and casting first also avoids resizing
    # float64 data in double precision only to downcast it afterwards.
    section = section.float()

    # Resize every slice to the 224x224 input size.
    return F.interpolate(section, size=(224, 224), mode='bilinear', align_corners=False)


In [4]:
import timm

# VGG16 with pretrained weights loaded through timm; it is only used for
# forward passes (see get_embedding), never trained in this notebook.
model = timm.create_model(
    'vgg16.tv_in1k',
    pretrained=True,
    features_only=False, # False keeps the complete network, so a forward pass returns its final output instead of a list of intermediate feature maps
)
# Switch to evaluation mode so the forward pass is deterministic.
model = model.eval()

def get_embedding(model, input):
    """
    Run a raw patch through `model` and return the output as a single row.

    The patch is first prepared with `preprocess_section`, then forwarded
    through the model with gradient tracking disabled. Whatever the model
    returns is flattened into a tensor of shape (1, N).
    """
    batch = preprocess_section(input)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(batch)

    # Collapse every dimension into one feature vector.
    return output.view(1, -1)

# Smoke test: embed the sample patch loaded earlier and print the size of the
# flattened vector.
test_embedding = get_embedding(model, test_data)
print(test_embedding.shape)


  from .autonotebook import tqdm as notebook_tqdm


torch.Size([1, 3000])


In [5]:
from tqdm import tqdm
import numpy as np
import torch

def get_Xy(idxs, label_type='multi1'):
    """
    Build the feature matrix and label vector for a list of patch ids.

    Patches whose case has ISUP grade 1 are left out. Every remaining patch is
    loaded from `DATA_PATH/patches/<id>.npy`, embedded with `get_embedding`
    using the module-level `model`, and labelled with `get_label`.

    NOTE(review): the class table in the notebook header maps ISUP 0 and 1 to
    class 0, yet grade 1 is excluded here - confirm this is intentional.

    Args:
        idxs: iterable of patch ids (keys of `info`).
        label_type: labelling scheme forwarded to `get_label`.

    Returns:
        tuple: `(embs, ys)` where `embs` is a NumPy array with one embedding
        row per kept patch and `ys` is a NumPy array of the matching labels.
        `torch.cat` fails if no patch is kept.
    """
    embs = []
    ys = []
    for i in tqdm(idxs, desc="Processing"):  # progress bar over the patch ids
        # Filter first so excluded patches are never read from disk.
        if info[i]['case_ISUP'] == 1:
            continue

        img = np.load(DATA_PATH + f'patches/{i}.npy')
        embs.append(get_embedding(model, img))
        ys.append(get_label(i, label_type))

    # Each embedding has shape (1, N): stack them along the batch dimension.
    embs = torch.cat(embs, dim=0).numpy()
    ys = np.array(ys)

    print(f"   Final length of {ys.shape[0]}")

    return embs, ys


In [6]:
def get_idx_fold(fold):
    """
    Return the (train, validation) entries stored for `fold`.

    Both are read from `folds_idxs` under the keys `train_fold_<fold>` and
    `val_fold_<fold>`.
    """
    train_idxs = folds_idxs[f'train_fold_{fold}']
    val_idxs = folds_idxs[f'val_fold_{fold}']
    return train_idxs, val_idxs
    

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score

def get_metrics(y_true, y_pred):
    """
    Evaluate predictions with macro-averaged scores and weighted kappa.

    Args:
        y_true (list or np.array): Ground truth labels.
        y_pred (list or np.array): Predicted labels.

    Returns:
        dict: keys `precision_macro`, `recall_macro`, `f1_macro` (each
        computed with `average='macro'`) and `weighted_kappa` (Cohen's kappa
        with quadratic weights).
    """
    # The three macro-averaged scores share the same call signature.
    macro_scorers = (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    )
    results = {
        name: scorer(y_true, y_pred, average='macro')
        for name, scorer in macro_scorers
    }

    # Quadratic weights penalise disagreements more the further apart they are.
    results["weighted_kappa"] = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return results


In [8]:
import numpy as np
from weka.core.dataset import create_instances_from_matrices
from weka.core.converters import Loader
from weka.classifiers import Classifier
import weka.core.jvm 
from weka.filters import Filter


def create_dataset_weka(x,y):
    """
    Build a Weka dataset from a feature matrix and a label vector.

    The instances are created from `x` and `y` with the label column named
    'class', the last attribute is marked as the class, and that attribute is
    then passed through Weka's NumericToNominal filter.

    Returns:
        The filtered dataset produced by the NumericToNominal filter.
    """
    instances = create_instances_from_matrices(x, y, col_y='class')

    # The label column is appended last; flag it as the class attribute.
    instances.class_is_last()

    # Restrict the conversion to the last attribute ("-R last").
    to_nominal = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"],
    )
    to_nominal.inputformat(instances)
    return to_nominal.filter(instances)


def predict_weka(cls, nominal_dataset):
    """
    Classify every instance of `nominal_dataset` with the classifier `cls`.

    Returns:
        np.ndarray: the values returned by `cls.classify_instance`, one per
        instance, in iteration order.
    """
    return np.array([cls.classify_instance(instance) for instance in nominal_dataset])



In [9]:

from weka.core.dataset import create_instances_from_matrices
from weka.core.converters import Loader
from weka.classifiers import Classifier
import weka.core.jvm as jvm
from weka.filters import Filter

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Start the JVM used by the Weka wrapper.
# NOTE(review): no matching jvm.stop() appears in the visible cells - confirm
# whether the JVM is meant to stay alive for the rest of the session.
jvm.start()

# One metrics dict per fold, consumed by the aggregation cell below.
train_metrics_list = []
val_metrics_list = []
# NOTE(review): folds are requested as 0..4 while the printed number is 1..5 -
# confirm that the keys in the splits file are 0-based.
for f in range(5):
    print(f"Current FOLD: {f+1}")
    idxs_train, idxs_val = get_idx_fold(f)

    # Load data (embeddings are recomputed from the patches on every fold)
    X_train, y_train = get_Xy(idxs_train, label_type='multi1')
    X_val, y_val = get_Xy(idxs_val, label_type='multi1')

    # Scale data: statistics are fitted on the training split only and then
    # applied to the validation split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Apply PCA: a fractional n_components keeps as many components as needed
    # to explain that share of the variance; fitted on the training split only.
    pca = PCA(n_components=0.85)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)
    print(f"    PCA retained {X_train_pca.shape[1]} components.")

    # Train and evaluate C4.5 classifier
    # Wrap the PCA features and labels of both splits as Weka datasets with a
    # nominal class attribute.
    dataset_train = create_dataset_weka(X_train_pca,y_train)
    dataset_val = create_dataset_weka(X_val_pca,y_val)

    # J48 with the "-C 0.3" option.
    # NOTE(review): this is a plain J48 tree; no ordinal meta-classifier is
    # wrapped around it here - confirm against the notebook's stated objective.
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    cls.build_classifier(dataset_train)

    # NOTE(review): the predictions come straight from classify_instance and
    # are compared with the integer labels below; this presumably relies on
    # the nominal class values being ordered 0, 1, 2 - verify.
    y_pred_train = predict_weka(cls, dataset_train)
    y_pred_val = predict_weka(cls, dataset_val)

    # Compute metrics
    metrics_train = get_metrics(y_train, y_pred_train)
    metrics_val = get_metrics(y_val, y_pred_val)

    print(f"    Train metrics ----> {metrics_train}")
    print(f"    Valid metrics ----> {metrics_val}")

    # Collect metrics
    train_metrics_list.append(metrics_train)
    val_metrics_list.append(metrics_val)


DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/usr/local/lib/python3.10/dist-packages/javabridge/jars/rhino-1.7R4.jar', '/usr/local/lib/python3.10/dist-packages/javabridge/jars/runnablequeue.jar', '/usr/local/lib/python3.10/dist-packages/javabridge/jars/cpython.jar', '/usr/local/lib/python3.10/dist-packages/weka/lib/weka.jar', '/usr/local/lib/python3.10/dist-packages/weka/lib/python-weka-wrapper.jar', '/usr/local/lib/python3.10/dist-packages/weka/lib/core.jar', '/usr/local/lib/python3.10/dist-packages/weka/lib/mtj.jar', '/usr/local/lib/python3.10/dist-packages/weka/lib/arpack_combined.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled
Nov 30, 2024 7:21:31 PM com.github.fommil.netlib.ARPACK <clinit>
Nov 30, 2024 7:21:31 PM com.github.fommil.netlib.ARPACK <clinit>


Current FOLD: 1


Processing:   0%|          | 0/873 [00:00<?, ?it/s]

Processing:  38%|███▊      | 328/873 [01:54<03:09,  2.88it/s]

In [None]:
# Aggregate metrics
def compute_mean_std(metrics_list):
    """
    Aggregate per-fold metric dictionaries into mean and standard deviation.

    Args:
        metrics_list (list of dict): one dictionary per fold; the keys of the
            first dictionary decide which metrics are aggregated.

    Returns:
        dict: maps each metric name to a `(mean, std)` tuple computed with
        `np.mean` and `np.std`. An empty input yields an empty dict.
    """
    # Nothing to aggregate (e.g. the training cell was not run): return an
    # empty summary instead of failing on metrics_list[0].
    if len(metrics_list) == 0:
        return {}

    mean_std = {}
    for key in metrics_list[0]:
        values = [metrics[key] for metrics in metrics_list]
        mean_std[key] = (np.mean(values), np.std(values))
    return mean_std

# Mean and standard deviation of every metric across the folds.
train_metrics_summary = compute_mean_std(train_metrics_list)
val_metrics_summary = compute_mean_std(val_metrics_list)

print("\nSummary of Metrics Across Folds:")
# Both splits are reported with the same layout, training first.
for title, summary in (
    ("Train Metrics (Mean ± Std):", train_metrics_summary),
    ("Validation Metrics (Mean ± Std):", val_metrics_summary),
):
    print(title)
    for key, (mean, std) in summary.items():
        print(f"  {key}: {mean:.4f} ± {std:.4f}")


NameError: name 'train_metrics_list' is not defined