# <center>X-ray Data</center>

#### **1/ Import all necessary libraries and classes to run this notebook**

In [None]:
# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

#### **2/ Set up the MIMIC BigQuery environment**

In [None]:
# Authenticate the current Colab user; this must run before any BigQuery
# query in this notebook is issued.
auth.authenticate_user()

In [None]:
# Set up environment variables
# 'your_project_id' is a placeholder: replace it with your own Google Cloud
# project ID before running the queries below.
project_id = 'your_project_id'
# Also expose the project through the environment for the rest of the session.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

def run_query(query, project_id=project_id):
  """Run a BigQuery query and return its result as a pandas DataFrame.

  Args:
    query: SQL text to execute (standard SQL dialect).
    project_id: Google Cloud project passed to the reader. The default is
      the value the module-level `project_id` had when this cell was run,
      so re-run this cell after changing that variable.

  Returns:
    The DataFrame produced by `read_gbq` for the query.
  """
  reader_options = {'project_id': project_id, 'dialect': 'standard'}
  return pd.io.gbq.read_gbq(query, **reader_options)

# set the dataset
# NOTE(review): the linking query in section 4 hard-codes its fully qualified
# table names and does not read this variable -- confirm whether it is still needed.
dataset = 'mimiciv'


#### **3/ Read and preprocess the data**

In [None]:
# Load the per-image table from processed_data.csv (path is relative to the
# current working directory). Each row is keyed by a `file_name` column.
data = pd.read_csv('processed_data.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,file_name,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a.jpg,-1.920675,-3.472368,3.423497e-39,-4.019975,-2.682965,3.095561e-39,2.35094e-40,-2.235377,-4.163281,-4.5954030000000005e-39,-3.747526,-2.24311e-40,-4.80337e-40,-2.707837e-39,-3.875355,-2.638527,-1.517044,-3.526651
1,1,35469b32-ecaab79e-6029f852-160bd61a-9e39d5bb.jpg,0.088068,-1.032863,2.666148e-39,-3.083943,-0.21408,1.675793e-39,-5.3053e-40,0.470033,-1.843389,-2.014102e-39,-0.644985,-2.280837e-39,5.66623e-40,-1.3074510000000002e-39,-1.202289,-0.816857,0.823061,-1.040545
2,2,15a9cb0e-a198aa68-69378bf4-b2b1c36c-fd8020c2.jpg,-0.521883,-0.844022,1.648815e-39,-2.451156,0.017114,-9.18897e-40,2.053338e-39,-0.68777,-1.1409,1.66672e-39,-2.638187,1.417642e-39,3.50608e-40,-1.324915e-39,-0.342451,-0.532974,0.810412,-2.13141
3,3,884910a8-5d7a8bbd-1d59d71f-a97fa282-f7b9850f.jpg,0.401914,-1.997322,-4.84884e-40,-3.875956,-0.883333,-1.4519990000000002e-39,-8.22674e-40,-0.259665,-1.303978,8.3198e-41,0.441326,4.066523e-39,-1.0162440000000001e-39,-3.090698e-39,-1.631463,-1.706191,0.775801,-1.260036
4,4,ebdae94d-7ace4119-3ca77b0b-bc8b0cd9-77429e52.jpg,-0.574579,-0.706262,2.5765830000000003e-39,-2.607862,-0.230711,-2.66528e-40,1.6375840000000002e-39,-0.094715,-1.650083,1.24854e-40,-2.060443,-2.347249e-39,7.23549e-40,-3.353733e-39,-1.342637,-0.839634,0.467398,-1.756306


In [None]:
# Derive the DICOM identifier by removing the literal '.jpg' extension from the
# file name. regex=False makes '.' a plain dot rather than a regex wildcard,
# so the result is the same on every pandas version and no FutureWarning about
# the changing `regex` default is emitted.
data['dicom'] = data['file_name'].str.replace('.jpg', '', regex=False)

  data['dicom'] = data['file_name'].str.replace('.jpg', '')


In [None]:
# Preview again to confirm the new `dicom` column holds the file name without
# its extension.
data.head()

Unnamed: 0.1,Unnamed: 0,file_name,0,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,dicom
0,0,91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a.jpg,-1.920675,-3.472368,3.423497e-39,-4.019975,-2.682965,3.095561e-39,2.35094e-40,-2.235377,...,-4.5954030000000005e-39,-3.747526,-2.24311e-40,-4.80337e-40,-2.707837e-39,-3.875355,-2.638527,-1.517044,-3.526651,91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a
1,1,35469b32-ecaab79e-6029f852-160bd61a-9e39d5bb.jpg,0.088068,-1.032863,2.666148e-39,-3.083943,-0.21408,1.675793e-39,-5.3053e-40,0.470033,...,-2.014102e-39,-0.644985,-2.280837e-39,5.66623e-40,-1.3074510000000002e-39,-1.202289,-0.816857,0.823061,-1.040545,35469b32-ecaab79e-6029f852-160bd61a-9e39d5bb
2,2,15a9cb0e-a198aa68-69378bf4-b2b1c36c-fd8020c2.jpg,-0.521883,-0.844022,1.648815e-39,-2.451156,0.017114,-9.18897e-40,2.053338e-39,-0.68777,...,1.66672e-39,-2.638187,1.417642e-39,3.50608e-40,-1.324915e-39,-0.342451,-0.532974,0.810412,-2.13141,15a9cb0e-a198aa68-69378bf4-b2b1c36c-fd8020c2
3,3,884910a8-5d7a8bbd-1d59d71f-a97fa282-f7b9850f.jpg,0.401914,-1.997322,-4.84884e-40,-3.875956,-0.883333,-1.4519990000000002e-39,-8.22674e-40,-0.259665,...,8.3198e-41,0.441326,4.066523e-39,-1.0162440000000001e-39,-3.090698e-39,-1.631463,-1.706191,0.775801,-1.260036,884910a8-5d7a8bbd-1d59d71f-a97fa282-f7b9850f
4,4,ebdae94d-7ace4119-3ca77b0b-bc8b0cd9-77429e52.jpg,-0.574579,-0.706262,2.5765830000000003e-39,-2.607862,-0.230711,-2.66528e-40,1.6375840000000002e-39,-0.094715,...,1.24854e-40,-2.060443,-2.347249e-39,7.23549e-40,-3.353733e-39,-1.342637,-0.839634,0.467398,-1.756306,ebdae94d-7ace4119-3ca77b0b-bc8b0cd9-77429e52


#### **4/ Link each x-ray to a specific subject_id and hadm_id according to the StudyDate**

In [None]:
# Pair every X-ray DICOM with every hospital admission of the same subject.
# The admissions join is on subject_id only and the query has no date filter,
# so a DICOM appears once per admission of that subject; selecting the
# admission whose stay contains the StudyDate is done later on the DataFrame.
data2 = run_query(f"""
SELECT r.subject_id, dm.dicom, dm.StudyDate, h.hadm_id, h.admittime, h.dischtime
FROM `physionet-data.mimic_cxr.dicom_metadata_string` AS dm
JOIN `physionet-data.mimic_cxr.record_list` AS r ON r.dicom_id = dm.dicom
JOIN `physionet-data.mimiciv_hosp.admissions` AS h ON h.subject_id = r.subject_id
""")
print(data2)

         subject_id                                         dicom StudyDate  \
0          18415616  75d67482-46fbfcfb-b9d3be10-98f1b1dd-ba9748dc  21640428   
1          18415616  75d67482-46fbfcfb-b9d3be10-98f1b1dd-ba9748dc  21640428   
2          18415616  91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a  21640428   
3          18415616  91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a  21640428   
4          19136768  69ea47d2-8e44c7ea-8fd5dada-9385460a-fd8863d2  21481021   
...             ...                                           ...       ...   
2509603    17956863  4baead16-4c878e7c-fcc6071d-e06ebfb2-61d25c4c  22060814   
2509604    17956863  328fba1d-fb7a2246-f361ecc5-09c96079-48a89f06  22060927   
2509605    17956863  328fba1d-fb7a2246-f361ecc5-09c96079-48a89f06  22060927   
2509606    17956863  59218cf7-f2c291c7-25ed5f1f-1a839972-c93bb3b3  22060814   
2509607    17956863  59218cf7-f2c291c7-25ed5f1f-1a839972-c93bb3b3  22060814   

          hadm_id           admittime           dis

In [None]:
# Parse the YYYYMMDD study date (unparseable values become missing) and the
# admission timestamp, then store both as ISO-8601 text. `time` is midnight of
# the study day; `admittime` is truncated to midnight of the admission day so
# that an X-ray taken on the admission day is not dated before the admission.
study_dt = pd.to_datetime(data2['StudyDate'], format='%Y%m%d', errors='coerce')
admit_dt = pd.to_datetime(data2['admittime'])
data2['time'] = study_dt.dt.strftime('%Y-%m-%dT%H:%M:%S')
data2['admittime'] = admit_dt.dt.strftime('%Y-%m-%dT00:00:00')
data2


Unnamed: 0,subject_id,dicom,StudyDate,hadm_id,admittime,dischtime,time
0,18415616,75d67482-46fbfcfb-b9d3be10-98f1b1dd-ba9748dc,21640428,29138337,2164-04-28T00:00:00,2164-04-30 16:20:00,2164-04-28T00:00:00
1,18415616,75d67482-46fbfcfb-b9d3be10-98f1b1dd-ba9748dc,21640428,21610375,2158-09-09T00:00:00,2158-09-16 16:57:00,2164-04-28T00:00:00
2,18415616,91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a,21640428,29138337,2164-04-28T00:00:00,2164-04-30 16:20:00,2164-04-28T00:00:00
3,18415616,91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a,21640428,21610375,2158-09-09T00:00:00,2158-09-16 16:57:00,2164-04-28T00:00:00
4,19136768,69ea47d2-8e44c7ea-8fd5dada-9385460a-fd8863d2,21481021,27903990,2149-07-22T00:00:00,2149-07-23 19:20:00,2148-10-21T00:00:00
...,...,...,...,...,...,...,...
2509603,17956863,4baead16-4c878e7c-fcc6071d-e06ebfb2-61d25c4c,22060814,27809961,2206-03-20T00:00:00,2206-03-21 15:30:00,2206-08-14T00:00:00
2509604,17956863,328fba1d-fb7a2246-f361ecc5-09c96079-48a89f06,22060927,28747558,2206-09-27T00:00:00,2206-09-29 16:24:00,2206-09-27T00:00:00
2509605,17956863,328fba1d-fb7a2246-f361ecc5-09c96079-48a89f06,22060927,27809961,2206-03-20T00:00:00,2206-03-21 15:30:00,2206-09-27T00:00:00
2509606,17956863,59218cf7-f2c291c7-25ed5f1f-1a839972-c93bb3b3,22060814,28747558,2206-09-27T00:00:00,2206-09-29 16:24:00,2206-08-14T00:00:00


In [None]:
# Keep only the (X-ray, admission) pairs whose study date falls inside the
# admission window. The comparison is done on real timestamps: `time` and
# `admittime` hold ISO-formatted strings, while `dischtime` is still in
# whatever form the query returned, so comparing the raw columns would mix
# representations (str vs datetime, or 'T' vs ' ' separators when compared as
# text) and is not a reliable chronological test.
admit_ts = pd.to_datetime(data2['admittime'])
study_ts = pd.to_datetime(data2['time'])
disch_ts = pd.to_datetime(data2['dischtime'])
# Rows with a missing timestamp compare as False on either side and are dropped.
data2 = data2[(admit_ts <= study_ts) & (study_ts <= disch_ts)]


In [None]:
# Drop duplicate 'dicom' rows, keeping only the first occurrence, so that each
# X-ray is linked to at most one admission.
data2 = data2.drop_duplicates(subset='dicom', keep='first')

print(data2)

         subject_id                                         dicom StudyDate  \
0          18415616  75d67482-46fbfcfb-b9d3be10-98f1b1dd-ba9748dc  21640428   
2          18415616  91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a  21640428   
13         19136768  c2822cd9-785880dd-b21df2f4-feae6873-a3dbcc34  21501224   
27         19136768  35469b32-ecaab79e-6029f852-160bd61a-9e39d5bb  21510408   
33         19136768  aad41f8e-432a7f56-264d490c-740a1fa6-94f4f537  21510408   
...             ...                                           ...       ...   
2509593    15007487  81f647b5-856ae069-e0941845-c6ddb0f6-e9a9aaec  21421214   
2509594    15007487  2acfc474-14dcc92e-4c225ec7-4451198e-601be5b0  21421213   
2509595    15007487  4690a275-0ffc1451-af9921ce-cebf8df2-24a0236a  21421215   
2509596    17956863  aaf308c6-4bdb0059-affc2594-f26cb1f6-702c5e9d  22060927   
2509604    17956863  328fba1d-fb7a2246-f361ecc5-09c96079-48a89f06  22060927   

          hadm_id            admittime           di

In [None]:

# Attach the admission information to each image row by matching on 'dicom'.
# The inner join keeps only images that were linked to an admission.
data_f = pd.merge(data, data2, on='dicom', how='inner')
print(data_f)

        Unnamed: 0                                         file_name  \
0                0  91ea24c1-ddf8f918-0c579885-c0bf36ed-3a2b306a.jpg   
1                1  35469b32-ecaab79e-6029f852-160bd61a-9e39d5bb.jpg   
2                2  15a9cb0e-a198aa68-69378bf4-b2b1c36c-fd8020c2.jpg   
3                3  884910a8-5d7a8bbd-1d59d71f-a97fa282-f7b9850f.jpg   
4                4  ebdae94d-7ace4119-3ca77b0b-bc8b0cd9-77429e52.jpg   
...            ...                                               ...   
112026      145590  68d34ebb-d6a26d81-95dc7596-cf6b9b52-837fa967.jpg   
112027      145592  d549956e-a4caae3e-978d2446-f5280ec9-ffc321ee.jpg   
112028      145593  95fd7365-ef0d4f84-d108a31f-d847bc31-a28109d0.jpg   
112029      145594  304b0b21-0a8d3feb-313fac0e-432a4904-efb3dc6b.jpg   
112030      145595  32eb07cd-6dba43b7-858fb880-1a9bc182-6360bd42.jpg   

               0         1             2         3         4             5  \
0      -1.920675 -3.472368  3.423497e-39 -4.019975 -2.682

In [None]:
# Drop duplicate 'hadm_id' rows, keeping only the first occurrence: the result
# has a single X-ray row per admission, chosen by row order in the merged frame.
data_f = data_f.drop_duplicates(subset='hadm_id', keep='first')


In [None]:
# Write the linked table to x_ray_data.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
data_f.to_csv('x_ray_data.csv', index=False)