# Create prototype for "Similar nodules" feature

## Import modules

In [1]:
%load_ext autoreload
%autoreload 2

In [71]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from PIL import Image
from matplotlib.pyplot import cm

from lung_cancer_detection.utils import save_json

In [48]:
# Input locations (processed nodule volumes and metadata) and a scratch output folder.
data_dir = Path("/Volumes/LaCie/data/lung-cancer-detection/lidc-idri/").absolute()
nod_dir = data_dir.joinpath("processed/nodules/")
meta_dir = data_dir.joinpath("processed/meta")
tmp_dir = Path("/Users/felix/Downloads/tmp_lcd/")
tmp_dir.mkdir(exist_ok=True)
# Sanity check: every directory used below should exist before continuing.
print(*(folder.exists() for folder in (data_dir, nod_dir, meta_dir, tmp_dir)))

True True True True


## Load nodule metadata

In [4]:
data = pd.read_csv(meta_dir/"nodules.csv")
data.head()

Unnamed: 0,PatientID,StudyID,SeriesID,NoduleID,NumAnnotations,Diameter,SurfaceArea,Volume,Malignancy,Texture,...,Sphericity,Calcification,InternalStructure,Subtlety,x_start,x_stop,y_start,y_stop,z_start,z_stop
0,LIDC-IDRI-0001,1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288...,1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636...,0,4,32.755812,2491.466573,6989.673615,5,5,...,4,6,1,5,340,392,297,341,86,95
1,LIDC-IDRI-0002,1.3.6.1.4.1.14519.5.2.1.6279.6001.490157381160...,1.3.6.1.4.1.14519.5.2.1.6279.6001.619372068417...,0,2,30.781671,2807.198994,7244.667508,5,2,...,5,6,1,2,334,379,324,374,171,199
2,LIDC-IDRI-0003,1.3.6.1.4.1.14519.5.2.1.6279.6001.101370605276...,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...,0,1,31.664468,1996.252117,4731.410934,2,1,...,5,6,1,1,331,367,350,384,62,70
3,LIDC-IDRI-0003,1.3.6.1.4.1.14519.5.2.1.6279.6001.101370605276...,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...,1,4,31.001964,2225.67735,6519.463698,5,4,...,4,6,1,5,333,373,344,389,70,80
4,LIDC-IDRI-0003,1.3.6.1.4.1.14519.5.2.1.6279.6001.101370605276...,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...,2,4,13.309155,321.183599,472.089669,4,5,...,4,6,1,4,192,205,299,318,81,85


In [5]:
data.columns

Index(['PatientID', 'StudyID', 'SeriesID', 'NoduleID', 'NumAnnotations',
       'Diameter', 'SurfaceArea', 'Volume', 'Malignancy', 'Texture',
       'Spiculation', 'Lobulation', 'Margin', 'Sphericity', 'Calcification',
       'InternalStructure', 'Subtlety', 'x_start', 'x_stop', 'y_start',
       'y_stop', 'z_start', 'z_stop'],
      dtype='object')

## Prepare data

### Remove case nodules

In [6]:
case_nods = data.query("PatientID == 'LIDC-IDRI-0186'")
case_nods

Unnamed: 0,PatientID,StudyID,SeriesID,NoduleID,NumAnnotations,Diameter,SurfaceArea,Volume,Malignancy,Texture,...,Sphericity,Calcification,InternalStructure,Subtlety,x_start,x_stop,y_start,y_stop,z_start,z_stop
554,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,0,2,5.714748,88.608536,87.238244,1,5,...,5,3,1,5,121,131,335,344,75,77
555,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,1,1,12.695788,412.727257,550.431778,2,1,...,5,6,1,1,209,228,407,424,77,81
556,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,2,2,6.485468,100.715517,90.353896,3,5,...,4,6,1,4,204,213,135,146,107,110
557,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,3,2,31.353731,3097.289971,7943.094052,3,5,...,4,6,4,5,268,323,172,219,111,120
558,LIDC-IDRI-0186,1.3.6.1.4.1.14519.5.2.1.6279.6001.300568323537...,1.3.6.1.4.1.14519.5.2.1.6279.6001.204566802718...,4,4,17.445504,881.249009,1382.700205,5,5,...,4,6,1,5,293,328,329,369,117,123


In [7]:
# Remove the case patient's nodules from the reference set so they cannot be
# returned as their own neighbours.
# errors="ignore" keeps this cell re-runnable: on a second execution the labels
# are already gone and a plain drop() would raise KeyError.
data = data.drop(index=case_nods.index, errors="ignore")
data.shape

(2620, 23)

### Drop unnecessary columns

In [8]:
cols_to_drop = ['PatientID', 'StudyID', 'SeriesID', 'NoduleID', 'NumAnnotations', 'Malignancy', 'x_start', 'x_stop', 'y_start', 'y_stop', 'z_start', 'z_stop']

In [9]:
X = data.drop(columns=cols_to_drop)

In [10]:
X.head()

Unnamed: 0,Diameter,SurfaceArea,Volume,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
0,32.755812,2491.466573,6989.673615,5,5,3,4,4,6,1,5
1,30.781671,2807.198994,7244.667508,2,1,1,2,5,6,1,2
2,31.664468,1996.252117,4731.410934,1,1,1,2,5,6,1,1
3,31.001964,2225.67735,6519.463698,4,3,2,3,4,6,1,5
4,13.309155,321.183599,472.089669,5,2,2,4,4,6,1,4


In [16]:
X_case = case_nods.drop(columns=cols_to_drop)

In [17]:
X_case.head()

Unnamed: 0,Diameter,SurfaceArea,Volume,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
554,5.714748,88.608536,87.238244,5,1,3,5,5,3,1,5
555,12.695788,412.727257,550.431778,1,1,1,2,5,6,1,1
556,6.485468,100.715517,90.353896,5,2,3,3,4,6,1,4
557,31.353731,3097.289971,7943.094052,5,4,3,3,4,6,4,5
558,17.445504,881.249009,1382.700205,5,4,3,4,4,6,1,5


### Scale features to 0-1 range

In [11]:
X.describe()

Unnamed: 0,Diameter,SurfaceArea,Volume,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
count,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0
mean,10.151783,374.211068,626.801305,4.447328,1.575191,1.687405,4.055725,3.932061,5.700382,1.01374,3.859542
std,6.73443,695.327768,1779.612066,1.190155,0.999081,1.007226,1.14338,0.87385,0.8877,0.196869,1.146456
min,2.585936,13.761003,8.544922,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0
25%,6.12832,83.876045,63.503687,5.0,1.0,1.0,4.0,3.0,6.0,1.0,3.0
50%,7.817374,137.284587,121.815796,5.0,1.0,1.0,4.0,4.0,6.0,1.0,4.0
75%,11.122155,297.146951,321.619636,5.0,2.0,2.0,5.0,5.0,6.0,1.0,5.0
max,49.944618,9026.567042,31112.197113,5.0,5.0,5.0,5.0,5.0,6.0,5.0,5.0


In [12]:
# Learn per-feature min/max on the reference nodules only; the fitted scaler is
# reused further down to transform the case nodules with the same ranges.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [13]:
df = pd.DataFrame(data=X_scaled, columns=X.columns)
df.describe()

Unnamed: 0,Diameter,SurfaceArea,Volume,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
count,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0,2620.0
mean,0.159756,0.039993,0.019877,0.861832,0.143798,0.171851,0.763931,0.733015,0.900127,0.003435,0.714885
std,0.142201,0.077149,0.057216,0.297539,0.24977,0.251806,0.285845,0.218463,0.2959,0.049217,0.286614
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.074799,0.007779,0.001767,1.0,0.0,0.0,0.75,0.5,1.0,0.0,0.5
50%,0.110464,0.013705,0.003642,1.0,0.0,0.0,0.75,0.75,1.0,0.0,0.75
75%,0.180246,0.031443,0.010066,1.0,0.25,0.25,1.0,1.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
X_case_scaled = scaler.transform(X_case)
df = pd.DataFrame(data=X_case_scaled, columns=X.columns)
df.describe()

Unnamed: 0,Diameter,SurfaceArea,Volume,Texture,Spiculation,Lobulation,Margin,Sphericity,Calcification,InternalStructure,Subtlety
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.256618,0.100119,0.064372,0.8,0.35,0.4,0.6,0.85,0.8,0.15,0.75
std,0.220731,0.139926,0.107965,0.447214,0.379144,0.223607,0.285044,0.136931,0.447214,0.33541,0.433013
min,0.066066,0.008305,0.00253,0.0,0.0,0.0,0.25,0.75,0.0,0.0,0.0
25%,0.08234,0.009648,0.00263,1.0,0.0,0.5,0.5,0.75,1.0,0.0,0.75
50%,0.213474,0.044267,0.017422,1.0,0.25,0.5,0.5,0.75,1.0,0.0,1.0
75%,0.313767,0.096251,0.04418,1.0,0.75,0.5,0.75,1.0,1.0,0.0,1.0
max,0.607445,0.342128,0.2551,1.0,0.75,0.5,1.0,1.0,1.0,0.75,1.0


## Find similar nodules using nearest neighbors algorithm

In [14]:
knn = NearestNeighbors(n_neighbors=5)

In [15]:
knn.fit(X_scaled)

NearestNeighbors()

In [23]:
dists, inds = knn.kneighbors(X=X_case_scaled)
print(dists.shape)
print(inds.shape)

(5, 5)
(5, 5)


In [25]:
print(dists)
print(inds)

[[0.00877994 0.02636936 0.25001606 0.25030775 0.25034292]
 [0.03621386 0.04195926 0.04983029 0.05135735 0.06922574]
 [0.03926597 0.14934578 0.25007335 0.25009965 0.25017265]
 [0.41997139 0.58476071 0.67914261 0.75990946 0.79902492]
 [0.06433406 0.09549331 0.12196268 0.16767177 0.25116731]]
[[1239  383  331 2119 2305]
 [ 789  125   12 2327 2551]
 [1807  176  405  596 2116]
 [ 196  608 2615 1946 1954]
 [1359  205 2354  315   97]]


## Save metadata for similar nodules

In [52]:
data.dtypes

PatientID             object
StudyID               object
SeriesID              object
NoduleID               int64
NumAnnotations         int64
Diameter             float64
SurfaceArea          float64
Volume               float64
Malignancy             int64
Texture                int64
Spiculation            int64
Lobulation             int64
Margin                 int64
Sphericity             int64
Calcification          int64
InternalStructure      int64
Subtlety               int64
x_start                int64
x_stop                 int64
y_start                int64
y_stop                 int64
z_start                int64
z_stop                 int64
dtype: object

In [66]:
def _to_native(value):
    """Return ``value`` as a built-in Python scalar if it is a NumPy scalar."""
    # np.generic is the common base class of NumPy scalar types; .item() unwraps it.
    return value.item() if isinstance(value, np.generic) else value


def extract_neighbor_metadata(data, similar_nod_ids, similar_nod_dists, query_nod_id,
                              thumbnail_dir="scan01/nodules/neighbors"):
    """Build one metadata record per neighbour of a query nodule.

    Args:
        data: DataFrame of nodule metadata. Rows are looked up by *position*
            (``data.iloc``), so it must have the same row order as the matrix
            the neighbour indices were computed from.
        similar_nod_ids: Positional row indices of the neighbours.
        similar_nod_dists: Distances to the query nodule, indexed in parallel
            with ``similar_nod_ids``.
        query_nod_id: Identifier of the query nodule; used in the record id and
            stored (as a string) under ``"nodule___NODE"``.
        thumbnail_dir: Path prefix written into each record's ``"thumbnail"``
            entry. The default reproduces the previously hard-coded value.

    Returns:
        A list of dicts, one per neighbour, in the order of ``similar_nod_ids``.
        All values are built-in Python types (no NumPy scalars), so the list
        can be passed to ``json.dump`` directly.
    """
    result = []
    for i, nod_id in enumerate(similar_nod_ids):
        nod = data.iloc[nod_id]
        neighbor_id = f"nod{query_nod_id}_neighbor{i}"
        meta = {
            "id": neighbor_id,
            "nodule___NODE": str(query_nod_id),
            "patientID": nod.PatientID,
            "noduleID": nod.NoduleID,
            "dist": similar_nod_dists[i],
            "diameter": nod.Diameter,
            "area": nod.SurfaceArea,
            "volume": nod.Volume,
            # rescale the rating from 1..5 to 0..1
            "malignancy": (nod.Malignancy - 1)/4,
            "texture": nod.Texture,
            "spiculation": nod.Spiculation,
            "lobulation": nod.Lobulation,
            "margin": nod.Margin,
            "sphericity": nod.Sphericity,
            "calcification": nod.Calcification,
            "internalStructure": nod.InternalStructure,
            "subtlety": nod.Subtlety,
            "thumbnail": f"{thumbnail_dir}/{neighbor_id}.png",
        }
        # Values read from a DataFrame row are NumPy scalars (e.g. np.int64),
        # which the json module refuses to encode; unwrap them here.
        result.append({key: _to_native(val) for key, val in meta.items()})
    return result

In [67]:
# Gather the neighbour records of every case nodule into one flat list.
# Query nodules are numbered from 1 in the generated ids.
neighbor_data = []
for query_idx, (row_inds, row_dists) in enumerate(zip(inds, dists), start=1):
    neighbor_data.extend(extract_neighbor_metadata(data, row_inds, row_dists, query_idx))
print(len(neighbor_data))

25


In [68]:
# Write the records as real JSON objects. Passing str(neighbor_data) would hand
# the writer a Python repr string (single quotes, NumPy reprs) instead of a list.
# NOTE(review): assumes save_json JSON-encodes its second argument — confirm
# against lung_cancer_detection.utils.
# NumPy scalars are unwrapped first because the json module cannot encode them.
neighbor_records = [
    {key: (val.item() if isinstance(val, np.generic) else val) for key, val in meta.items()}
    for meta in neighbor_data
]
save_json(tmp_dir/"similarNodules.json", neighbor_records)

## Save thumbnails for similar nodules

In [78]:
def create_2d_img(img, z, window=(-600, 1500)):
    """Render one slice of a 3-D volume as an RGBA Pillow image.

    The whole volume is clipped to the intensity window and rescaled to 0-1
    using the min/max of the clipped *volume* (not of the slice). Slice ``z``
    along the last axis is then mapped through matplotlib's gray colormap.

    Args:
        img: NumPy array indexed as ``img[:, :, z]``.
        z: Index of the slice along the last axis.
        window: ``(level, width)`` pair; values are clipped to
            ``level - width/2 .. level + width/2``. Presumably in Hounsfield
            units — TODO confirm against the preprocessing step.

    Returns:
        A ``PIL.Image.Image`` in RGBA mode.
    """
    # clip pixel values to desired window
    level, width = window
    half_width = width / 2
    clipped = np.clip(img, level - half_width, level + half_width)
    # normalize pixel values to 0-1 range
    lowest = clipped.min()
    span = clipped.max() - lowest
    if span > 0:
        normalized = (clipped - lowest) / span
    else:
        # Constant volume after clipping: the plain formula would compute 0/0
        # and feed NaNs into the uint8 cast. Render a uniform image instead.
        normalized = np.zeros(clipped.shape, dtype=float)
    # convert to Pillow image for display
    img_slice = normalized[:, :, z]
    pil_img = Image.fromarray(np.uint8(cm.gray(img_slice)*255))
    return pil_img.convert('RGBA')

def save_neighbor_thumbnails(data, similar_nod_ids, query_nod_id, src_dir, dest_dir):
    """Write a PNG thumbnail for each neighbour of one query nodule.

    For every positional row index in ``similar_nod_ids`` the nodule volume
    ``<PatientID>_<NoduleID>.npy`` is loaded from ``src_dir``, its middle slice
    along the last axis is rendered with ``create_2d_img``, and the image is
    saved to ``dest_dir`` as ``nod<query_nod_id>_neighbor<rank>.png``.

    Args:
        data: DataFrame of nodule metadata, indexed by position via ``iloc``.
        similar_nod_ids: Positional row indices of the neighbours.
        query_nod_id: Identifier of the query nodule, used in the file name.
        src_dir: Directory (``Path``) containing the ``.npy`` volumes.
        dest_dir: Directory (``Path``) the PNG files are written to.

    Returns:
        None.
    """
    for rank, row_pos in enumerate(similar_nod_ids):
        nodule = data.iloc[row_pos]
        volume = np.load(src_dir / f"{nodule.PatientID}_{nodule.NoduleID}.npy")
        # thumbnail shows the central slice of the volume
        middle = volume.shape[2] // 2
        thumbnail = create_2d_img(volume, middle)
        thumbnail.save(dest_dir / f"nod{query_nod_id}_neighbor{rank}.png")

In [79]:
dest_dir = tmp_dir/"neighbors"
dest_dir.mkdir(exist_ok=True)

In [80]:
# One batch of thumbnails per case nodule; query ids start at 1 to match the
# ids used in the saved metadata.
for query_idx, neighbor_rows in enumerate(inds, start=1):
    save_neighbor_thumbnails(data, neighbor_rows, query_idx, nod_dir, dest_dir)