In [5]:
from datetime import date, datetime
from matplotlib import pyplot as plt

import collections
import csv
import os
import pickle
import pydicom

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# Remove pandas' display limits (rows, columns, line width) so frames render in full.
for display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(display_option, None)

In [7]:
# Project details: institution, phenotype, kind of data, and today's date as a string.

INSTITUTION_NAME = "TBP"
PHENOTYPE = "TB"
TYPE_OF_DATA = "CXR_METADATA"
DATE = date.today().isoformat()  # same "YYYY-MM-DD" text that str(date.today()) yields

In [9]:
# Load the CXR metadata table and the two baseline cohort tables, reading every
# column as a string (dtype='str') so identifiers are not coerced to numbers.
# NOTE(review): CXR_DATA_PATH and PROCESSED_DATA_PATH are not defined in any cell
# visible here — confirm they are assigned earlier so Restart & Run All works.
# The file paths are built by plain string concatenation, so both constants
# presumably end with a path separator — TODO confirm.
cxr_metadata_df = pd.read_csv(CXR_DATA_PATH + "TB_Portals_CXRs_August_2023.csv", dtype = 'str')
rif_sensitive_hiv_neg_baseline = pd.read_csv(PROCESSED_DATA_PATH + "new_rifs_baseline_with-imagingstudy_id_10252023update.csv", dtype = 'str')
rif_resistant_hiv_neg_baseline = pd.read_csv(PROCESSED_DATA_PATH + "new_rifr_baseline_with-imagingstudy_id_10252023update.csv", dtype = 'str')

In [15]:
def print_cohort_info(df):
    """Print three summary counts for a cohort DataFrame.

    Reports the number of distinct ``patient_id`` values, the number of rows,
    and the number of distinct ``series_instance_content_url`` values.

    Args:
        df: DataFrame with ``patient_id`` and ``series_instance_content_url``
            columns.

    Returns:
        None. The counts are written to stdout, one per line.
    """
    # Each count is computed lazily, right before its line is printed.
    summary = (
        ("Number of patients", lambda: df.patient_id.nunique()),
        ("Number of records", lambda: len(df)),
        ("Number of cxrs", lambda: df.series_instance_content_url.nunique()),
    )
    for label, count in summary:
        print(f"{label}: {count()}")

In [16]:
print_cohort_info(cxr_metadata_df)

Number of patients: 8844
Number of records: 10953
Number of cxrs: 10953


In [19]:
# keep=False removes *every* row whose (patient_id, series_instance_content_url)
# pair occurs more than once, instead of keeping one representative per pair.
# NOTE(review): confirm that discarding all members of a duplicate group (rather
# than keep='first') is the intended behaviour.
cxr_metadata_df = cxr_metadata_df.drop_duplicates(subset=['patient_id', 'series_instance_content_url'], keep=False)

In [20]:
print_cohort_info(cxr_metadata_df)

Number of patients: 8844
Number of records: 10953
Number of cxrs: 10953


In [22]:
def print_num_cols(df):
    """Print how many columns ``df`` has.

    Args:
        df: DataFrame whose column count is reported.

    Returns:
        None. The count is written to stdout.
    """
    column_count = df.shape[1]
    print("Number of Columns: ", column_count)

In [23]:
print_num_cols(rif_sensitive_hiv_neg_baseline)   # Outcome column removed in the new file

Number of Columns:  12


In [24]:
print_num_cols(rif_resistant_hiv_neg_baseline)

Number of Columns:  12


In [27]:
print(f"Number of patients in rif_sensitive_hiv_neg_baseline: {rif_sensitive_hiv_neg_baseline.patient_id.nunique()}")
print(f"Number of records in rif_sensitive_hiv_neg_baseline: {len(rif_sensitive_hiv_neg_baseline)}")

Number of patients in rif_sensitive_hiv_neg_baseline: 3446
Number of records in rif_sensitive_hiv_neg_baseline: 3446


In [29]:
print(f"Number of patients in rif_resistant_hiv_neg_baseline: {rif_resistant_hiv_neg_baseline.patient_id.nunique()}")
print(f"Number of records in rif_resistant_hiv_neg_baseline: {len(rif_resistant_hiv_neg_baseline)}")

Number of patients in rif_resistant_hiv_neg_baseline: 3767
Number of records in rif_resistant_hiv_neg_baseline: 3767


In [32]:
cxr_metadata_df_copy = cxr_metadata_df.copy()

In [33]:
# Inner-join each baseline cohort to the CXR metadata on (patient_id, imagingstudy_id);
# baseline rows without a matching CXR record are dropped.
rif_resistant_hiv_neg_baseline_with_cxr = rif_resistant_hiv_neg_baseline.merge(
    cxr_metadata_df_copy,
    on=['patient_id', 'imagingstudy_id'],
    how='inner',
)
rif_sensitive_hiv_neg_baseline_with_cxr = rif_sensitive_hiv_neg_baseline.merge(
    cxr_metadata_df_copy,
    on=['patient_id', 'imagingstudy_id'],
    how='inner',
)

In [34]:
print_cohort_info(rif_sensitive_hiv_neg_baseline_with_cxr)

Number of patients: 3446
Number of records: 3446
Number of cxrs: 3446


In [36]:
print_cohort_info(rif_resistant_hiv_neg_baseline_with_cxr)

Number of patients: 3767
Number of records: 3767
Number of cxrs: 3767


In [44]:
# Narrow both cohorts to the same set of columns: identifiers, the three
# measurements converted to numbers later on, the CXR URL and the outlier flag.
cohort_columns = [
    'patient_id', 'imagingstudy_id',
    'timika', 'overallpercentofabnormalvolume', 'large_cavity',
    'series_instance_content_url', 'cxr_outlier',
]
rif_resistant_hiv_neg_baseline_with_cxr = rif_resistant_hiv_neg_baseline_with_cxr[cohort_columns]
rif_sensitive_hiv_neg_baseline_with_cxr = rif_sensitive_hiv_neg_baseline_with_cxr[cohort_columns]

# Extract Pixel and Orientation Info

In [45]:
# TODO: Ask Drew how to get the Patient Orientation info from the DICOM files.

In [46]:
def extract_pixel_and_orientation_info(cohort_df, all_dicoms_folder_path):
    """Read image-size and photometric attributes from each cohort DICOM file.

    For every value in ``cohort_df['series_instance_content_url']`` the file at
    ``all_dicoms_folder_path + url`` is opened with pydicom and its ``Rows``,
    ``Columns`` and ``PhotometricInterpretation`` attributes are recorded.
    ``PatientOrientation`` is not extracted.

    Args:
        cohort_df: DataFrame with a ``series_instance_content_url`` column of
            strings.
        all_dicoms_folder_path: String prefix joined to each URL by plain
            concatenation (no separator is inserted).

    Returns:
        DataFrame with one row per cohort row, in cohort order, and the columns
        ``series_instance_content_url`` (whitespace-stripped),
        ``row_pixel_val``, ``column_pixel_val`` and
        ``photometric_interpretation``. The index is a fresh 0..n-1 range.
        If the file is missing, or any of the three attributes is absent, all
        three values of that row are NaN. An empty cohort yields an empty frame
        that still has the four columns.

    Side effects:
        Prints the path of every file that could not be found.
    """
    output_columns = ['series_instance_content_url', 'row_pixel_val',
                      'column_pixel_val', 'photometric_interpretation']

    records = []
    for relative_url in cohort_df['series_instance_content_url']:
        image_path = all_dicoms_folder_path + relative_url

        # Reset to "unknown" for every image so a failed read can never inherit
        # the values of the previously processed file.
        rows = columns = photometric_interpretation = np.nan

        try:
            # force=True lets pydicom parse files that lack the DICOM preamble.
            dicom_file_dataset = pydicom.dcmread(image_path, force=True)
            # Single tuple assignment: if any attribute is missing nothing is
            # assigned, so the whole record stays NaN.
            rows, columns, photometric_interpretation = (
                dicom_file_dataset.Rows,
                dicom_file_dataset.Columns,
                dicom_file_dataset.PhotometricInterpretation,
            )
        except AttributeError:
            # A required attribute is absent; keep the NaN placeholders.
            pass
        except FileNotFoundError:
            # Report the missing file and keep a NaN row so the output still
            # lines up one-to-one with the cohort.
            print(image_path)

        records.append({
            'series_instance_content_url': relative_url.strip(),
            'row_pixel_val': rows,
            'column_pixel_val': columns,
            'photometric_interpretation': photometric_interpretation,
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # appending inside the loop is quadratic.
    return pd.DataFrame(records, columns=output_columns)

In [47]:
# Read the DICOM attributes for both cohorts; every CXR file is opened once, so
# this cell is I/O-bound.
# NOTE(review): CXR_IMAGE_PATH is not defined in any cell visible here — confirm
# it is assigned earlier. The function concatenates it directly with each URL,
# so it presumably ends with a path separator — TODO confirm.
rif_sensitive_hiv_neg_baseline_dicom_attr = extract_pixel_and_orientation_info(rif_sensitive_hiv_neg_baseline_with_cxr, CXR_IMAGE_PATH)
rif_resistant_hiv_neg_baseline_dicom_attr = extract_pixel_and_orientation_info(rif_resistant_hiv_neg_baseline_with_cxr, CXR_IMAGE_PATH)

In [50]:
rif_sensitive_hiv_neg_baseline_dicom_attr.head()

Unnamed: 0,series_instance_content_url,row_pixel_val,column_pixel_val,photometric_interpretation
0,0036983b-7577-487f-83d1-00522ed03ed5/2.25.1570...,1191,1173,MONOCHROME2
0,0064a99c-422f-4447-b462-cfb10f376446/1.2.410.2...,3268,2796,MONOCHROME2
0,006c7d01-3dd9-4c0c-91d1-2db4fe20594a/2.25.3055...,2935,3128,MONOCHROME2
0,0070e6ec-2727-4e74-80c9-fb7dc8b3430b/2.25.1354...,3032,3032,MONOCHROME2
0,009f6d9b-4ae3-47e3-bb7d-68bb961dd1c4/1.2.826.0...,4640,4210,MONOCHROME1


In [51]:
len(rif_sensitive_hiv_neg_baseline_dicom_attr)

3446

In [52]:
rif_resistant_hiv_neg_baseline_dicom_attr.head()

Unnamed: 0,series_instance_content_url,row_pixel_val,column_pixel_val,photometric_interpretation
0,00069df2-2406-43b6-8c58-5f5e164c7e35/1.3.6.1.4...,2724,2248,MONOCHROME2
0,0045e42a-8d57-4e3a-8127-08a714ea3d4f/2.25.1838...,2227,2229,MONOCHROME2
0,0046f70e-395f-40bb-b3a7-57b3fe3e97d1/2.25.3401...,1024,1024,MONOCHROME2
0,00617d3b-7975-49e1-9b44-cadfcb01d739/1.2.840.1...,2248,2724,MONOCHROME2
0,00755090-59a8-4e4f-bc7d-451e1ec533f0/2.25.3083...,3106,3706,MONOCHROME2


In [53]:
len(rif_resistant_hiv_neg_baseline_dicom_attr)

3767

In [56]:
rif_sensitive_hiv_neg_baseline_dicom_attr.photometric_interpretation.value_counts()

MONOCHROME2    3216
MONOCHROME1     230
Name: photometric_interpretation, dtype: int64

In [57]:
rif_resistant_hiv_neg_baseline_dicom_attr.photometric_interpretation.value_counts()

MONOCHROME2    3635
MONOCHROME1     132
Name: photometric_interpretation, dtype: int64

In [59]:
# Attach the extracted DICOM attributes to each cohort, matching rows on the CXR URL.
rif_resistant = rif_resistant_hiv_neg_baseline_with_cxr.merge(
    rif_resistant_hiv_neg_baseline_dicom_attr,
    on='series_instance_content_url',
    how='inner',
)
rif_sensitive = rif_sensitive_hiv_neg_baseline_with_cxr.merge(
    rif_sensitive_hiv_neg_baseline_dicom_attr,
    on='series_instance_content_url',
    how='inner',
)

In [61]:
len(rif_resistant)

3767

In [65]:
len(rif_sensitive)

3446

In [66]:
columns_to_convert = ['timika', 'overallpercentofabnormalvolume', 'large_cavity', 'row_pixel_val', 'column_pixel_val']

In [67]:
# The CSVs were loaded as strings; cast the listed columns to numbers in both cohorts.
for cohort_frame in (rif_resistant, rif_sensitive):
    cohort_frame[columns_to_convert] = cohort_frame[columns_to_convert].apply(pd.to_numeric)

In [68]:
rif_sensitive[['overallpercentofabnormalvolume','timika', 'large_cavity']].agg(['min','max'])

Unnamed: 0,overallpercentofabnormalvolume,timika,large_cavity
min,0.0,0.0,0.0
max,100.0,140.0,1.0


In [69]:
rif_resistant[['overallpercentofabnormalvolume','timika', 'large_cavity']].agg(['min','max'])

Unnamed: 0,overallpercentofabnormalvolume,timika,large_cavity
min,0.0,0.0,0.0
max,100.0,140.0,1.0


# Outlier removal

In [73]:
# Keep only the rows whose cxr_outlier flag equals 'normal'.
rif_sensitive_ml_ready = rif_sensitive[rif_sensitive['cxr_outlier'].eq('normal')]
rif_resistant_ml_ready = rif_resistant[rif_resistant['cxr_outlier'].eq('normal')]

In [74]:
len(rif_sensitive_ml_ready)

2368

In [75]:
rif_sensitive_ml_ready.patient_id.nunique()

2368

In [76]:
len(rif_resistant_ml_ready)

2893

In [77]:
rif_resistant_ml_ready.patient_id.nunique()

2893