In [3]:
import pandas as pd
import os

# Read the Excel file. The context manager closes the workbook's file handle
# as soon as all sheets have been parsed (the original left it open).
with pd.ExcelFile('../data/Brain-TR-GammaKnife Clinical Information.xlsx') as excel_file:
    # Get the sheet names
    sheet_names = excel_file.sheet_names

    # Read each sheet into a DataFrame, keyed by its sheet name
    dfs = {}
    for sheet_name in sheet_names:
        dfs[sheet_name] = excel_file.parse(sheet_name)

# Access the DataFrames for the two sheets used below
df_cl = dfs['course_level']
df_ll = dfs['lesion_level']

# Drop the 'No.' column from the lesion-level sheet, then give both sheets the
# same name ('Course') for the treatment-course column so it can serve as a
# shared merge key.
df_ll = df_ll.drop(columns='No.')
df_cl = df_cl.rename(columns={'Course #': 'Course'})
df_ll = df_ll.rename(columns={'Treatment Course': 'Course'})



In [4]:
# Attach the course-level fields to every lesion-level row; the right join
# keeps all rows of df_ll even when no course-level row matches.
merged_df = pd.merge(df_cl, df_ll, how='right', on=['unique_pt_id', 'Course'])

# Remove the columns that are not carried forward and rename the keys.
dropped_columns = ['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Lesion Name in NRRD files']
clinical_a = merged_df.drop(columns=dropped_columns)
clinical_a = clinical_a.rename(columns={'unique_pt_id':'pid','Lesion Location':'lesion_clinical'})
columns = list(clinical_a.columns)

# Keep these columns, in this order, then lower-case every column name.
kept_columns = [
    'pid',
    'Course',
    'lesion_clinical',
    'Diagnosis (Only want Mets)',
    'Primary Diagnosis',
    'Age at Diagnosis',
    'Gender',
    'Lesion #',
    'mri_type',
    'duration_tx_to_imag (months)',
    'Fractions',
]
clinical_a = clinical_a[kept_columns]
clinical_a.columns = [name.lower() for name in kept_columns]
clinical_a.head(1)

Unnamed: 0,pid,course,lesion_clinical,diagnosis (only want mets),primary diagnosis,age at diagnosis,gender,lesion #,mri_type,duration_tx_to_imag (months),fractions
0,463,1,Lt Frontal,Brain Mets-Lung,Adenocarcinoma of the lung,60,Male,1,recurrence,10.713112,1


In [5]:
# Load the connector table and reshape it into the second clinical view.
connector = pd.read_csv('../connector/connector.csv')
clinical_b = (
    connector
    .drop(columns=['date', 'lesion_clean', 'file_name'])
    .rename(columns={'sid': 'pid', 'mri_type': 'label'})
)
columns = list(clinical_b.columns)

# Fix the column selection and order explicitly.
connector_columns = [
    'pid',
    'course',
    'lesion_clinical',
    'lesion_rtstruct',
    'lesion',
    'Diagnosis (Only want Mets)',
    'Primary Diagnosis',
    'Age at Diagnosis',
    'Gender',
    'Lesion #',
    'label',
    'duration_tx_to_imag (months)',
    'Fractions',
]
clinical_b = clinical_b[connector_columns]
clinical_b.head(1)

Unnamed: 0,pid,course,lesion_clinical,lesion_rtstruct,lesion,Diagnosis (Only want Mets),Primary Diagnosis,Age at Diagnosis,Gender,Lesion #,label,duration_tx_to_imag (months),Fractions
0,151,1,1 Lt Inf Cerebellar1,1 Lt Inf Cerebellar 1,1 Left Inferior Cerebellar 1,Brain Mets -Lung,Adenocarcinoma of the lung,77,Female,1,stable,42.458101,1


In [6]:
# Show the (rows, columns) shape of both clinical tables side by side.
clinical_a.shape,clinical_b.shape

((244, 11), (244, 13))

In [7]:
# Inner-join the connector view with the spreadsheet view on the lesion keys.
merge_keys = ['pid', 'course', 'lesion_clinical']
clinical = pd.merge(clinical_b, clinical_a, how='inner', on=merge_keys)

# Drop the lower-cased columns contributed by clinical_a (and its copy of the
# duration column, suffixed '_y'), then restore the un-suffixed duration name
# and shorten the diagnosis column name to 'metastasis'.
columns_from_a = [
    'diagnosis (only want mets)',
    'primary diagnosis',
    'age at diagnosis',
    'gender',
    'lesion #',
    'mri_type',
    'duration_tx_to_imag (months)_y',
    'fractions',
]
clinical = (
    clinical
    .drop(columns=columns_from_a)
    .rename(columns={
        'duration_tx_to_imag (months)_x': 'duration_tx_to_imag (months)',
        'Diagnosis (Only want Mets)': 'metastasis',
    })
)
clinical.columns = clinical.columns.str.lower()
clinical.head(1)

Unnamed: 0,pid,course,lesion_clinical,lesion_rtstruct,lesion,metastasis,primary diagnosis,age at diagnosis,gender,lesion #,label,duration_tx_to_imag (months),fractions
0,151,1,1 Lt Inf Cerebellar1,1 Lt Inf Cerebellar 1,1 Left Inferior Cerebellar 1,Brain Mets -Lung,Adenocarcinoma of the lung,77,Female,1,stable,42.458101,1


In [8]:
# Save the merged table (without the index) before the 'metastasis' labels are cleaned.
clinical.to_csv('clinical_basic.csv', index=False)

In [9]:
# Strip the boilerplate tokens from the diagnosis text, one after another, and
# trim surrounding whitespace so only the primary-site label remains.
metastasis_site = clinical['metastasis']
for token in ('Mets', 'mets', 'Brain', 'brain', '-'):
    metastasis_site = metastasis_site.str.replace(token, '')
clinical['metastasis'] = metastasis_site.str.strip()


In [10]:
# List the distinct 'metastasis' labels to inspect what the cleaning produced.
unique_values = clinical['metastasis'].unique()
unique_values

array(['Lung', 'Uterine', 'Melanoma', 'Met', 'Breast', 'GK   2 lesions',
       'Renal cell', 'Lung/Urothelial', 'MetPost op cavity',
       'Endometrial Ca', 'Ovary', 'RCC', 'Esophageal',
       'Breast with Large Frontal Met', 'Kidney', 'Urothelial'],
      dtype=object)

In [11]:
# Count the distinct 'metastasis' labels (nunique ignores missing values by default).
unique_count = clinical['metastasis'].nunique()
unique_count


16

In [12]:
# Number of rows per 'metastasis' label.
class_counts = clinical.groupby('metastasis').size()
class_counts


metastasis
Breast                           42
Breast with Large Frontal Met     1
Endometrial Ca                    1
Esophageal                        4
GK   2 lesions                    3
Kidney                            2
Lung                             95
Lung/Urothelial                   3
Melanoma                         17
Met                              23
MetPost op cavity                 4
Ovary                             8
RCC                               8
Renal cell                       31
Urothelial                        1
Uterine                           1
dtype: int64

In [None]:
# Function to map the old diagnosis to new categories
def map_diagnosis(diagnosis):
    """Collapse a cleaned primary-site label into a broader diagnosis group.

    Parameters
    ----------
    diagnosis : str or missing value
        A label such as "Lung", "RCC" or "Breast with Large Frontal Met".
        Matching is exact (case-sensitive) except for "Breast", which is
        matched as a substring.

    Returns
    -------
    The group name ("Breast", "Genitourinary", "Gynecological",
    "Gastrointestinal", "Lung"), or the input unchanged when no rule applies.
    Non-string inputs (None, NaN) are returned unchanged instead of raising
    ``TypeError`` on the substring test.
    """
    # Missing values reach this function as None / float NaN; pass them through.
    if not isinstance(diagnosis, str):
        return diagnosis

    if "Breast" in diagnosis:
        return "Breast"
    elif diagnosis in ["Kidney", "RCC", "Renal cell", "Urothelial"]:
        return "Genitourinary"
    # "Uterine" is grouped with "Endometrial Ca" (the endometrium is the lining
    # of the uterus) rather than with the kidney/urothelial sites.
    elif diagnosis in ["Endometrial Ca", "Ovary", "Uterine"]:
        return "Gynecological"
    elif diagnosis == "Esophageal":
        return "Gastrointestinal"
    # Disabled rule, kept for reference:
    # elif diagnosis in ["Met", "MetPost op cavity"]:
    #     return "Metastatic Cancer"
    elif diagnosis in ["Lung", "Lung/Urothelial"]:
        return "Lung"
    else:
        return diagnosis  # For categories like "Melanoma" that remain unchanged

# Apply the mapping function to the "Diagnosis (Only want Mets)" column.
# map_diagnosis compares against cleaned site labels ("Lung", "RCC", ...), so
# the raw text is first stripped of the same tokens that are removed from
# clinical['metastasis']; otherwise only the "Breast" substring rule could
# ever match.
diagnosis_column = "Diagnosis (Only want Mets)"
diagnosis_site = merged_df[diagnosis_column]
for token in ('Mets', 'mets', 'Brain', 'brain', '-'):
    diagnosis_site = diagnosis_site.str.replace(token, '', regex=False)

# na_action='ignore' leaves missing diagnoses as NaN instead of passing them
# to map_diagnosis, whose substring test is not defined for non-strings.
merged_df[diagnosis_column] = diagnosis_site.str.strip().map(map_diagnosis, na_action='ignore')

merged_df.head(1)


In [None]:
# Display merged_df (the course-level / lesion-level join).
merged_df

In [13]:
# Save the clinical table (without the index) to clinical_clean_met_1.csv.
clinical.to_csv('clinical_clean_met_1.csv', index=False)

In [16]:
# Convert every object-dtype column of the clinical table to 'category'.
object_columns = clinical.select_dtypes(include='object').columns
for column_name in object_columns:
    clinical[column_name] = clinical[column_name].astype('category')

In [17]:
# Print column dtypes and non-null counts for the clinical table.
clinical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   pid                           244 non-null    int64   
 1   course                        244 non-null    int64   
 2   lesion_clinical               244 non-null    category
 3   lesion                        244 non-null    category
 4   metastasis                    244 non-null    category
 5   primary diagnosis             244 non-null    category
 6   age at diagnosis              244 non-null    int64   
 7   gender                        244 non-null    category
 8   lesion #                      244 non-null    int64   
 9   label                         244 non-null    category
 10  duration_tx_to_imag (months)  244 non-null    float64 
 11  fractions                     244 non-null    int64   
dtypes: category(6), float64(1), int64(5)
memory usage:

In [None]:
# Save merged_df to output_merged_pd.csv; index=False means no index column is written.
merged_df.to_csv('output_merged_pd.csv', index=False)

In [None]:
# Read the file back to check the round trip. It is written with index=False,
# so there is no index column to restore; passing index_col=0 here would turn
# the first data column into the index.
pd.read_csv('output_merged_pd.csv')

In [19]:
# Preview the first row of the clinical table.
clinical.head(1)

Unnamed: 0,pid,course,lesion_clinical,lesion_rtstruct,lesion,metastasis,primary diagnosis,age at diagnosis,gender,lesion #,label,duration_tx_to_imag (months),fractions
0,151,1,1 Lt Inf Cerebellar1,1 Lt Inf Cerebellar 1,1 Left Inferior Cerebellar 1,Lung,Adenocarcinoma of the lung,77,Female,1,stable,42.458101,1


In [14]:
# Read the metadata table; the next cell groups it by 'pid' and 'Study Date'
# and uses its 'SOP Class Name' / 'File Location' columns to locate the files.
import pandas as pd
df = pd.read_csv("../data/metadata.csv")


In [18]:
import SimpleITK as sitk
import pydicom
import glob
import os
from rt_utils import RTStructBuilder
import re
import numpy as np

# Group the metadata so that each (patient, study date) pair is processed once.
grouped_df = df.groupby(['pid', 'Study Date'])

rt_lesion_columns = ['pid', 'date', 'lesion', 'lesion_clean', 'course', 'file_name', 'sex', 'height', 'weight', 'age']
# Rows are collected in a plain list and converted to a DataFrame once, after
# the loop, instead of enlarging the DataFrame one row at a time.
rt_lesion_rows = []

# Iterate over each group
for group_name, group_data in grouped_df:
    pid, date = group_name

    # Map each 'SOP Class Name' in the group to its 'File Location'
    folders_dic = dict(zip(group_data['SOP Class Name'], group_data['File Location']))

    mri_folder = os.path.join('../data', folders_dic['MR Image Storage'])
    str_folder = os.path.join('../data', folders_dic['RT Structure Set Storage'])

    # STRUCT: open the first .dcm file found in the structure-set folder
    str_file = glob.glob(os.path.join(str_folder, "*.dcm"))[0]
    rtstruct = RTStructBuilder.create_from(dicom_series_path=mri_folder, rt_struct_path=str_file)
    names = rtstruct.get_roi_names()
    # Every ROI whose name does not contain "Skull" is recorded as a lesion
    lesions = [name for name in names if "Skull" not in name]
    if not lesions:
        # Nothing to record for this study; skip the per-study lookups below.
        continue

    # Patient attributes from the structure-set header. A missing attribute, or
    # one that is present but empty/None or unparsable, falls back to a sentinel
    # ('NA', -1.0 or -1) rather than aborting the whole loop.
    try:
        sex = rtstruct.ds.PatientSex
    except AttributeError:
        sex = 'NA'

    try:
        height = np.round(rtstruct.ds.PatientSize, 2)
    except (AttributeError, TypeError):
        height = -1.0

    try:
        weight = int(rtstruct.ds.PatientWeight)
    except (AttributeError, TypeError, ValueError):
        # TypeError covers a None value, matching the height handling above.
        weight = -1

    try:
        # Keep only the digits of the age string.
        # NOTE(review): any unit in the string is discarded; confirm ages are in years.
        age = int(''.join(re.findall(r'\d+', rtstruct.ds.PatientAge)))
    except (AttributeError, TypeError, ValueError):
        age = -1

    # The course number is the last character of the RTSTRUCT series
    # description; it is the same for every lesion of the study, so it is
    # computed once here.
    # NOTE(review): a single character only supports courses 0-9; confirm.
    course = int(group_data[group_data.Modality == 'RTSTRUCT']['Series Description'].values[0][-1])

    for lesion in lesions:
        # Letters-only version of the ROI name, used to build the file name
        lesion_clean = re.sub('[^A-Za-z]+', '', lesion)
        file_name = f"{pid}_{course}_{lesion_clean}"
        rt_lesion_rows.append([pid, date, lesion, lesion_clean, course, file_name, sex, height, weight, age])

# Build the DataFrame in one step
rt_lesions = pd.DataFrame(rt_lesion_rows, columns=rt_lesion_columns)



In [36]:
# Join the clinical table with the RT-struct lesion table: the clinical
# 'lesion_rtstruct' name is matched against the ROI name stored in
# rt_lesions['lesion'], together with patient id and course.
clinical = pd.merge(
    clinical,
    rt_lesions,
    how='inner',
    left_on=['pid', 'course', 'lesion_rtstruct'],
    right_on=['pid', 'course', 'lesion'],
)
clinical.shape

(244, 21)

In [37]:
# Discard the helper columns that came from rt_lesions (including its copy of
# the lesion name, suffixed '_y') and restore the un-suffixed 'lesion' name.
clinical = (
    clinical
    .drop(columns=['file_name', 'lesion_clean', 'date', 'lesion_y'])
    .rename(columns={'lesion_x': 'lesion'})
)
clinical.head(1)

Unnamed: 0,pid,course,lesion_clinical,lesion_rtstruct,lesion,metastasis,primary diagnosis,age at diagnosis,gender,lesion #,label,duration_tx_to_imag (months),fractions,sex,height,weight,age
0,151,1,1 Lt Inf Cerebellar1,1 Lt Inf Cerebellar 1,1 Left Inferior Cerebellar 1,Lung,Adenocarcinoma of the lung,77,Female,1,stable,42.458101,1,F,1.69,70,78


In [38]:
# Save the clinical table, now including sex/height/weight/age, without the index.
clinical.to_csv('clinical_clean_met_1_with_height.csv', index=False)

In [40]:
# Preview the first row of the final clinical table.
clinical.head(1)

Unnamed: 0,pid,course,lesion_clinical,lesion_rtstruct,lesion,metastasis,primary diagnosis,age at diagnosis,gender,lesion #,label,duration_tx_to_imag (months),fractions,sex,height,weight,age
0,151,1,1 Lt Inf Cerebellar1,1 Lt Inf Cerebellar 1,1 Left Inferior Cerebellar 1,Lung,Adenocarcinoma of the lung,77,Female,1,stable,42.458101,1,F,1.69,70,78
