The goal is to create a connector between the clinical data and the RTSTRUCT data by making the clinical lesion names and the RTSTRUCT lesion names consistent.

Since the majority of the work was done by hand, you can jump straight to the last cells.

In [1]:
import numpy as np
import pandas as pd

In this part I reconstruct the way I created the file names, in order to find a column on which the file names can be joined with the clinical data.

In [2]:
# Read the metadata table. Later cells group it by ('pid', 'Study Date') and use its
# 'SOP Class Name', 'File Location', 'Modality' and 'Series Description' columns.
df = pd.read_csv("../data/metadata.csv")


In [14]:
import SimpleITK as sitk
import pydicom
import glob
import os
from rt_utils import RTStructBuilder
import re

# Column order of the per-lesion table (kept explicit so an empty result still has a schema).
RT_LESION_COLUMNS = ['sid', 'date', 'lesion', 'lesion_clean', 'course',
                     'file_name', 'sex', 'height', 'weight', 'age']

# A DICOM header field can be absent (AttributeError), empty/None (TypeError)
# or hold an unparsable value (ValueError); all three fall back to a sentinel.
HEADER_READ_ERRORS = (AttributeError, TypeError, ValueError)


def read_header_value(ds, tag, convert, default):
    """Return ``convert(ds.<tag>)``, or ``default`` when the tag is missing or unusable.

    Parameters
    ----------
    ds : object exposing DICOM header fields as attributes (here ``rtstruct.ds``).
    tag : str, attribute name to read, e.g. ``'PatientWeight'``.
    convert : callable applied to the raw value.
    default : value returned when reading or converting fails.
    """
    try:
        return convert(getattr(ds, tag))
    except HEADER_READ_ERRORS:
        return default


grouped_df = df.groupby(['pid', 'Study Date'])

# Rows are collected in a plain list and turned into a DataFrame once at the end,
# instead of growing the DataFrame one row at a time.
lesion_rows = []

# Iterate over each (patient, study date) group
for group_name, group_data in grouped_df:
    sid, date = group_name

    # Map 'SOP Class Name' -> 'File Location' for this group
    folders_dic = dict(zip(group_data['SOP Class Name'], group_data['File Location']))
    mri_folder = os.path.join('../data', folders_dic['MR Image Storage'])
    str_folder = os.path.join('../data', folders_dic['RT Structure Set Storage'])

    # STRUCT: take the first .dcm file (sorted, so the choice does not depend on OS listing order)
    str_files = sorted(glob.glob(os.path.join(str_folder, "*.dcm")))
    if not str_files:
        raise FileNotFoundError(
            f"No RTSTRUCT .dcm file found in {str_folder!r} (pid={sid}, study date={date})"
        )
    str_file = str_files[0]
    rtstruct = RTStructBuilder.create_from(dicom_series_path=mri_folder, rt_struct_path=str_file)

    # Every ROI whose name does not contain "Skull" is treated as a lesion
    lesions = [name for name in rtstruct.get_roi_names() if "Skull" not in name]
    if not lesions:
        continue

    # Patient attributes from the RTSTRUCT header; 'NA' / -1.0 / -1 mark missing values
    ds = rtstruct.ds
    sex = read_header_value(ds, 'PatientSex', lambda value: value, 'NA')
    height = read_header_value(ds, 'PatientSize', lambda value: np.round(value, 2), -1.0)
    weight = read_header_value(ds, 'PatientWeight', int, -1)
    # Keep only the digits of the age field before converting
    age = read_header_value(ds, 'PatientAge', lambda value: int(''.join(re.findall(r'\d+', value))), -1)

    # Course number = last character of the RTSTRUCT series description.
    # NOTE(review): only one character is read, so a course number >= 10 would be truncated — confirm none exist.
    course = int(group_data[group_data.Modality == 'RTSTRUCT']['Series Description'].values[0][-1])

    for lesion in lesions:
        # File names use the lesion name reduced to its ASCII letters
        lesion_clean = re.sub('[^A-Za-z]+', '', lesion)
        file_name = f"{sid}_{course}_{lesion_clean}"
        lesion_rows.append([sid, date, lesion, lesion_clean, course, file_name, sex, height, weight, age])

rt_lesions = pd.DataFrame(lesion_rows, columns=RT_LESION_COLUMNS)




`rt_lesions` contains all the lesions that are present in the RTSTRUCT files, with their original lesion names.

In [4]:
rt_lesions.head()

Unnamed: 0,sid,date,lesion,lesion_clean,course,file_name,sex,height,weight,age
0,103,04-18-2014,R atrium,Ratrium,1,103_1_Ratrium,F,1.65,75,78
1,103,04-18-2014,L temporal,Ltemporal,1,103_1_Ltemporal,F,1.65,75,78
2,103,12-18-2014,Rt Frontal,RtFrontal,2,103_2_RtFrontal,F,-1.0,-1,79
3,103,12-18-2014,Rt Sup Frontal,RtSupFrontal,2,103_2_RtSupFrontal,F,-1.0,-1,79
4,103,12-18-2014,R Inf Cerebellar,RInfCerebellar,2,103_2_RInfCerebellar,F,-1.0,-1,79


In [13]:
# Persist the per-lesion table; a later cell reloads 'rt_lesions.csv'.
rt_lesions.to_csv('rt_lesions.csv', index=False)

Create the clinical data

In [2]:
import pandas as pd
import os

# Read every sheet of the clinical workbook into a dict of DataFrames.
# The context manager closes the workbook's file handle once parsing is done.
with pd.ExcelFile('../data/Brain-TR-GammaKnife Clinical Information.xlsx') as excel_file:
    sheet_names = excel_file.sheet_names
    dfs = {sheet_name: excel_file.parse(sheet_name) for sheet_name in sheet_names}

# Course-level sheet: rename the join keys to 'sid' / 'course'
df_cl = dfs['course_level'].rename(columns={'Course #': 'course', 'unique_pt_id': 'sid'})

# Lesion-level sheet: drop the running number and rename the keys and lesion column
df_ll = (
    dfs['lesion_level']
    .drop('No.', axis=1)
    .rename(columns={
        'Treatment Course': 'course',
        'Lesion Location': 'lesion',
        'unique_pt_id': 'sid',
    })
)

# Right join keeps every lesion-level row and attaches the matching course-level columns
clinical = df_cl.merge(df_ll, on=['sid', 'course'], how='right')
clinical = clinical.drop(
    ['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Lesion Name in NRRD files'],
    axis=1,
)

In [15]:
# Save the merged clinical table, then preview it. The preview must be the last
# expression of the cell: a bare name on an earlier line displays nothing.
clinical.to_csv('clinical_data.csv', index=False)
clinical.head()

Join `clinical` with `rt_lesions` on the lesion name (together with `sid` and `course`).

In [35]:
# Outer join keeps unmatched rows from both tables; the `_merge` indicator column
# records whether a row came from the left table, the right table, or both.
jdf = pd.merge(
    clinical,
    rt_lesions,
    on=['lesion', 'sid', 'course'],
    how='outer',
    indicator=True,
)

**At this point I worked by hand to create the connector.**

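I asked ChatGPT to expand the left/right abbreviations in the lesion names (e.g. `Lt Frontal` → `Left Frontal`); the result is stored in the `chat_gpt` column.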

In [31]:
# c: clinical table, s: RTSTRUCT lesion table. Both files carry an extra `chat_gpt`
# column with the expanded lesion name, which the following merge uses as a key.
# NOTE(review): presumably these CSVs were produced outside this notebook — confirm their origin.
c = pd.read_csv('clinical_data_camel_chat_gpt.csv')
s = pd.read_csv('rt_lesions_camel_chat_gpt.csv')

In [32]:
c.head()

Unnamed: 0,sid,course,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,lesion,mri_type,duration_tx_to_imag (months),Fractions,chat_gpt
0,463,1,Brain Mets-Lung,Adenocarcinoma of the lung,60,Male,1,Lt Frontal,recurrence,10.713112,1,Left Frontal
1,463,2,Brain Mets-Lung,Adenocarcinoma of the lung,60,Male,2,R Motor Cortex,stable,7.952678,1,Right Motor Cortex
2,463,2,Brain Mets-Lung,Adenocarcinoma of the lung,60,Male,3,Lt Post Temporal,stable,7.952678,1,Left Post Temporal
3,463,2,Brain Mets-Lung,Adenocarcinoma of the lung,60,Male,4,Lt Lat Cerebellum,stable,7.952678,1,Left Lat Cerebellum
4,158,1,Brain Mets -Lung,Adenocarcinoma of the lung,58,Male,1,Rt SUP Frontal,stable,1.774565,1,Right Sup Frontal


In [33]:
s.head()

Unnamed: 0,sid,date,lesion,lesion_clean,course,file_name,chat_gpt
0,103,04-18-2014,R atrium,Ratrium,1,103_1_Ratrium,Right Atrium
1,103,04-18-2014,L temporal,Ltemporal,1,103_1_Ltemporal,Left Temporal
2,103,12-18-2014,Rt Frontal,RtFrontal,2,103_2_RtFrontal,Right Frontal
3,103,12-18-2014,Rt Sup Frontal,RtSupFrontal,2,103_2_RtSupFrontal,Right Sup Frontal
4,103,12-18-2014,R Inf Cerebellar,RInfCerebellar,2,103_2_RInfCerebellar,Right Inf Cerebellar


In [36]:
# Outer join on the expanded lesion name plus patient and course;
# `_merge` marks rows found only in `c`, only in `s`, or in both.
jdf = pd.merge(
    c,
    s,
    on=['chat_gpt', 'sid', 'course'],
    how='outer',
    indicator=True,
)

In [37]:
# Write the merged table to its own file. Do not reuse 'rt_lesions_camel_chat_gpt.csv' here:
# that file is the input loaded into `s`, and overwriting it would make a re-run of this
# notebook merge against already-merged data.
# NOTE(review): if another script expects the merged table under the old name, update it accordingly.
jdf.to_csv('clinical_rt_lesions_camel_chat_gpt_merged.csv', index=False)

**The following file holds the final hand-made corrections. You can start from here to create the connector.**

In [3]:
# Load the hand-corrected merge table. Besides the `_merge` indicator it carries a
# `to_change` column, which the cells below use as the lesion join key.
jdf=pd.read_csv('rt_lesions_camel_chat_gpt_corrected.csv')
jdf

Unnamed: 0,sid,course,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,lesion_x,mri_type,duration_tx_to_imag (months),Fractions,chat_gpt,to_change,_merge,date,lesion_y,lesion_clean,file_name
0,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,1.0,1 Lt Inf Cerebellar1,stable,42.458101,1.0,1 Left Inf Cerebellar1,1 Left Inferior Cerebellar 1,left_only,,,,
1,151,1,,,,,,,,,,1 Left Inf Cerebellar 1,1 Left Inferior Cerebellar 1,right_only,10-31-2013,1 Lt Inf Cerebellar 1,LtInfCerebellar,151_1_LtInfCerebellar
2,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,10.0,10 Lt Ant Frontal 1,stable,42.458101,1.0,10 Left Ant Frontal 1,10 Left Anterior Frontal 1,both,10-31-2013,10 Lt ant Frontal 1,LtantFrontal,151_1_LtantFrontal
3,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,11.0,11 Lt Med Ant 1,stable,42.458101,1.0,11 Left Med Ant 1,11 Left Medial Anterior Frontal 1,left_only,,,,
4,151,1,,,,,,,,,,11 Left Med Ant Frontal 1,11 Left Medial Anterior Frontal 1,right_only,10-31-2013,11 Lt med ant frontal 1,Ltmedantfrontal,151_1_Ltmedantfrontal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,492,2,Brain Mets - Lung,Adenocarcinoma of the lung,73.0,Male,13.0,Rt temporal,stable,11.501807,1.0,Right Temporal,Right Temporal,both,12-11-2014,R temporal,Rtemporal,492_2_Rtemporal
340,246,1,Brain Mets-Lung,Adenocarcinoma of the lung,62.0,Female,4.0,Rt Thalmic,stable,0.854420,1.0,Right Thalmic,Right Thalamic,left_only,,,,
341,246,1,,,,,,,,,,4 Right Thalamic 1,Right Thalamic,right_only,07-10-2013,4 Rt Thalamic 1,RtThalamic,246_1_RtThalamic
342,257,3,Brain Mets Renal cell,Renal cell carcinoma,58.0,Female,13.0,Rt Ventricle,stable,41.307920,1.0,Right Ventricle,Right Ventricle,both,09-29-2014,Rt Ventricle,RtVentricle,257_3_RtVentricle


In [4]:
# Split the corrected table back into its clinical side (cc) and RTSTRUCT side (sc).
# `dropna(axis='columns')` removes every column that has at least one missing value in the
# selected rows; this is what strips the other side's columns.
# NOTE(review): a column with a genuinely missing value on its own side is dropped too — confirm that is intended.
clinical_rows = jdf['_merge'].isin(['both', 'left_only'])
cc = (
    jdf.loc[clinical_rows]
    .drop(columns='_merge')
    .dropna(axis='columns')
    .reset_index(drop=True)
    .rename(columns={'lesion_x': 'lesion'})
)

rtstruct_rows = jdf['_merge'].isin(['both', 'right_only'])
sc = (
    jdf.loc[rtstruct_rows]
    .drop(columns='_merge')
    .dropna(axis='columns')
    .reset_index(drop=True)
    .rename(columns={'lesion_y': 'lesion'})
)

In [5]:
# (cc['sid'].value_counts()-jdfc['sid'].value_counts()).sort_values(ascending=False).head(10)

In [6]:
cc

Unnamed: 0,sid,course,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,lesion,mri_type,duration_tx_to_imag (months),Fractions,chat_gpt,to_change
0,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,1.0,1 Lt Inf Cerebellar1,stable,42.458101,1.0,1 Left Inf Cerebellar1,1 Left Inferior Cerebellar 1
1,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,10.0,10 Lt Ant Frontal 1,stable,42.458101,1.0,10 Left Ant Frontal 1,10 Left Anterior Frontal 1
2,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,11.0,11 Lt Med Ant 1,stable,42.458101,1.0,11 Left Med Ant 1,11 Left Medial Anterior Frontal 1
3,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,12.0,12 Rt Ant Frontal 1,stable,42.458101,1.0,12 Right Ant Frontal 1,12 Right Anterior Frontal 1
4,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,13.0,13 Lt Vertex 1,stable,42.458101,1.0,13 Left Vertex 1,13 Left Vertex 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,105,1,Brain Mets Kidney,Squamous cell carcinoma,58.0,Male,1.0,RT Temporal,stable,2.103188,1.0,Right Temporal,Right Temporal
240,492,2,Brain Mets - Lung,Adenocarcinoma of the lung,73.0,Male,13.0,Rt temporal,stable,11.501807,1.0,Right Temporal,Right Temporal
241,246,1,Brain Mets-Lung,Adenocarcinoma of the lung,62.0,Female,4.0,Rt Thalmic,stable,0.854420,1.0,Right Thalmic,Right Thalamic
242,257,3,Brain Mets Renal cell,Renal cell carcinoma,58.0,Female,13.0,Rt Ventricle,stable,41.307920,1.0,Right Ventricle,Right Ventricle


In [7]:
sc

Unnamed: 0,sid,course,chat_gpt,to_change,date,lesion,lesion_clean,file_name
0,151,1,1 Left Inf Cerebellar 1,1 Left Inferior Cerebellar 1,10-31-2013,1 Lt Inf Cerebellar 1,LtInfCerebellar,151_1_LtInfCerebellar
1,151,1,10 Left Ant Frontal 1,10 Left Anterior Frontal 1,10-31-2013,10 Lt ant Frontal 1,LtantFrontal,151_1_LtantFrontal
2,151,1,11 Left Med Ant Frontal 1,11 Left Medial Anterior Frontal 1,10-31-2013,11 Lt med ant frontal 1,Ltmedantfrontal,151_1_Ltmedantfrontal
3,151,1,12 Right Ant Frontal 1,12 Right Anterior Frontal 1,10-31-2013,12 Rt ant frontal 1,Rtantfrontal,151_1_Rtantfrontal
4,151,1,13 Left Vertex,13 Left Vertex 1,10-31-2013,13 Lt vertex,Ltvertex,151_1_Ltvertex
...,...,...,...,...,...,...,...,...
244,105,1,Right Temporal,Right Temporal,09-04-2014,RT Temporal,RTTemporal,105_1_RTTemporal
245,492,2,Right Temporal,Right Temporal,12-11-2014,R temporal,Rtemporal,492_2_Rtemporal
246,246,1,4 Right Thalamic 1,Right Thalamic,07-10-2013,4 Rt Thalamic 1,RtThalamic,246_1_RtThalamic
247,257,3,Right Ventricle,Right Ventricle,09-29-2014,Rt Ventricle,RtVentricle,257_3_RtVentricle


In [8]:
# Inner join: keep only lesions whose corrected name (`to_change`) appears on both sides
# for the same patient and course.
jdfc = pd.merge(
    cc,
    sc,
    on=['to_change', 'sid', 'course'],
    how='inner',
    indicator=True,
)

In [9]:
# Give the lesion columns explicit names and remove the helper columns.
# `errors='ignore'` keeps this cell re-runnable: on a second run the helper columns
# are already gone and the renames are no-ops.
jdfc = (
    jdfc
    .rename(columns={
        'lesion_x': 'lesion_clinical',
        'lesion_y': 'lesion_rtstruct',
        'to_change': 'lesion',
    })
    .drop(columns=['_merge', 'chat_gpt_x', 'chat_gpt_y'], errors='ignore')
    .dropna(axis=1)  # drops any column that still contains a missing value
    .reset_index(drop=True)
)
jdfc

Unnamed: 0,sid,course,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,lesion_clinical,mri_type,duration_tx_to_imag (months),Fractions,lesion,date,lesion_rtstruct,lesion_clean,file_name
0,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,1.0,1 Lt Inf Cerebellar1,stable,42.458101,1.0,1 Left Inferior Cerebellar 1,10-31-2013,1 Lt Inf Cerebellar 1,LtInfCerebellar,151_1_LtInfCerebellar
1,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,10.0,10 Lt Ant Frontal 1,stable,42.458101,1.0,10 Left Anterior Frontal 1,10-31-2013,10 Lt ant Frontal 1,LtantFrontal,151_1_LtantFrontal
2,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,11.0,11 Lt Med Ant 1,stable,42.458101,1.0,11 Left Medial Anterior Frontal 1,10-31-2013,11 Lt med ant frontal 1,Ltmedantfrontal,151_1_Ltmedantfrontal
3,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,12.0,12 Rt Ant Frontal 1,stable,42.458101,1.0,12 Right Anterior Frontal 1,10-31-2013,12 Rt ant frontal 1,Rtantfrontal,151_1_Rtantfrontal
4,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,13.0,13 Lt Vertex 1,stable,42.458101,1.0,13 Left Vertex 1,10-31-2013,13 Lt vertex,Ltvertex,151_1_Ltvertex
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,105,1,Brain Mets Kidney,Squamous cell carcinoma,58.0,Male,1.0,RT Temporal,stable,2.103188,1.0,Right Temporal,09-04-2014,RT Temporal,RTTemporal,105_1_RTTemporal
240,492,2,Brain Mets - Lung,Adenocarcinoma of the lung,73.0,Male,13.0,Rt temporal,stable,11.501807,1.0,Right Temporal,12-11-2014,R temporal,Rtemporal,492_2_Rtemporal
241,246,1,Brain Mets-Lung,Adenocarcinoma of the lung,62.0,Female,4.0,Rt Thalmic,stable,0.854420,1.0,Right Thalamic,07-10-2013,4 Rt Thalamic 1,RtThalamic,246_1_RtThalamic
242,257,3,Brain Mets Renal cell,Renal cell carcinoma,58.0,Female,13.0,Rt Ventricle,stable,41.307920,1.0,Right Ventricle,09-29-2014,Rt Ventricle,RtVentricle,257_3_RtVentricle


In [13]:
# Reload the per-lesion RTSTRUCT table from the CSV written by the earlier
# `rt_lesions.to_csv('rt_lesions.csv', index=False)` cell.
rt_lesion_with_age=pd.read_csv('rt_lesions.csv')

In [15]:
# Rename on the frame just loaded from 'rt_lesions.csv' rather than on the in-memory
# `rt_lesions`, which only exists if the RTSTRUCT extraction cell was run in this session.
rt_lesion_with_age = rt_lesion_with_age.rename(columns={'lesion': 'lesion_rtstruct'})

In [16]:
rt_lesion_with_age

Unnamed: 0,sid,date,lesion_rtstruct,lesion_clean,course,file_name,sex,height,weight,age
0,103,04-18-2014,R atrium,Ratrium,1,103_1_Ratrium,F,1.65,75,78
1,103,04-18-2014,L temporal,Ltemporal,1,103_1_Ltemporal,F,1.65,75,78
2,103,12-18-2014,Rt Frontal,RtFrontal,2,103_2_RtFrontal,F,-1.00,-1,79
3,103,12-18-2014,Rt Sup Frontal,RtSupFrontal,2,103_2_RtSupFrontal,F,-1.00,-1,79
4,103,12-18-2014,R Inf Cerebellar,RInfCerebellar,2,103_2_RInfCerebellar,F,-1.00,-1,79
...,...,...,...,...,...,...,...,...,...,...
252,492,12-11-2014,Rt Post parietal,RtPostparietal,2,492_2_RtPostparietal,M,1.85,92,72
253,492,12-11-2014,Rt frontal,Rtfrontal,2,492_2_Rtfrontal,M,1.85,92,72
254,492,12-11-2014,R motor cortex,Rmotorcortex,2,492_2_Rmotorcortex,M,1.85,92,72
255,492,12-11-2014,R occipital,Roccipital,2,492_2_Roccipital,M,1.85,92,72


In [17]:
# Inner join of the connector with the per-lesion RTSTRUCT table on lesion name, patient and course.
# Columns that exist in both frames but are not join keys come back with _x / _y suffixes.
# NOTE(review): the result is only displayed, not assigned, so `jdfc` stays unchanged and
# the file written from `jdfc` below does not contain sex / height / weight / age — confirm this is intended.
jdfc.merge(rt_lesion_with_age, on=['lesion_rtstruct','sid','course'], how='inner', indicator=True)

Unnamed: 0,sid,course,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,lesion_clinical,mri_type,duration_tx_to_imag (months),...,lesion_clean_x,file_name_x,date_y,lesion_clean_y,file_name_y,sex,height,weight,age,_merge
0,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,1.0,1 Lt Inf Cerebellar1,stable,42.458101,...,LtInfCerebellar,151_1_LtInfCerebellar,10-31-2013,LtInfCerebellar,151_1_LtInfCerebellar,F,1.69,70,78,both
1,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,10.0,10 Lt Ant Frontal 1,stable,42.458101,...,LtantFrontal,151_1_LtantFrontal,10-31-2013,LtantFrontal,151_1_LtantFrontal,F,1.69,70,78,both
2,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,11.0,11 Lt Med Ant 1,stable,42.458101,...,Ltmedantfrontal,151_1_Ltmedantfrontal,10-31-2013,Ltmedantfrontal,151_1_Ltmedantfrontal,F,1.69,70,78,both
3,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,12.0,12 Rt Ant Frontal 1,stable,42.458101,...,Rtantfrontal,151_1_Rtantfrontal,10-31-2013,Rtantfrontal,151_1_Rtantfrontal,F,1.69,70,78,both
4,151,1,Brain Mets -Lung,Adenocarcinoma of the lung,77.0,Female,13.0,13 Lt Vertex 1,stable,42.458101,...,Ltvertex,151_1_Ltvertex,10-31-2013,Ltvertex,151_1_Ltvertex,F,1.69,70,78,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,105,1,Brain Mets Kidney,Squamous cell carcinoma,58.0,Male,1.0,RT Temporal,stable,2.103188,...,RTTemporal,105_1_RTTemporal,09-04-2014,RTTemporal,105_1_RTTemporal,M,1.78,66,57,both
240,492,2,Brain Mets - Lung,Adenocarcinoma of the lung,73.0,Male,13.0,Rt temporal,stable,11.501807,...,Rtemporal,492_2_Rtemporal,12-11-2014,Rtemporal,492_2_Rtemporal,M,1.85,92,72,both
241,246,1,Brain Mets-Lung,Adenocarcinoma of the lung,62.0,Female,4.0,Rt Thalmic,stable,0.854420,...,RtThalamic,246_1_RtThalamic,07-10-2013,RtThalamic,246_1_RtThalamic,F,1.60,44,63,both
242,257,3,Brain Mets Renal cell,Renal cell carcinoma,58.0,Female,13.0,Rt Ventricle,stable,41.307920,...,RtVentricle,257_3_RtVentricle,09-29-2014,RtVentricle,257_3_RtVentricle,F,1.70,58,63,both


In [18]:
# Write the final connector table without the row index.
jdfc.to_csv('connector.csv', index=False)