In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Connection settings are read from the environment (standard libpq variable
# names) so the notebook runs on a fresh kernel and no secret lives in the file.
# A missing variable raises KeyError naming the variable.
credentials = {
    'user': os.environ['PGUSER'],
    'password': os.environ['PGPASSWORD'],
    'host': os.environ['PGHOST'],
    'database': os.environ['PGDATABASE'],
}

# User and password are percent-encoded: characters such as '@', ':' or '/'
# would otherwise be read as URL delimiters and corrupt the connection string.
connection_str = "postgresql://{}:{}@{}/{}".format(quote_plus(credentials['user']),
                                                        quote_plus(credentials['password']),
                                                        credentials['host'],
                                                        credentials['database'])
conn = create_engine(connection_str)

In [None]:
# The project root is the parent of the current working directory; the
# metadata CSV sits under <root>/data/02_intermediate/.
projectdir = os.path.split(os.getcwd())[0]
datadir = os.path.join(projectdir, 'data')
metadata_path = os.path.join(
    datadir,
    '02_intermediate',
    'dicom_metadata.csv',
)

Select the DICOM tags of interest.

In [None]:
# Readable name -> (group, element) DICOM tag, each half a 4-character hex string.
# The trailing "in metadata" notes are the original author's annotations.
# NOTE(review): hex-digit casing is mixed ('601A' vs '002a') — confirm which
# form the metadata CSV uses, since tuple comparison is case-sensitive.
dicom_tags = {
    'sop_instance_uid': ('0008', '0018'),
    'rows': ('0028', '0010'),  # in metadata
    'columns': ('0028', '0011'),  # in metadata
    'number_of_frames': ('0028', '0008'),  # in metadata
    'cine_rate': ('0018', '0040'),  # in metadata
    'sequence_of_ultrasound_regions': ('0018', '6011'),  # in metadata
    'region_location_min_x0': ('0018', '6018'),  # in metadata
    'region_location_min_y0': ('0018', '601A'),
    'region_location_max_x1': ('0018', '601C'),
    'region_location_max_y1': ('0018', '601E'),
    'physical_delta_x': ('0018', '602C'),
    'physical_delta_y': ('0018', '602E'),
    'physical_units_x_direction': ('0018', '6024'),  # in metadata
    'physical_units_y_direction': ('0018', '6026'),  # in metadata
    'region_spatial_format': ('0018', '6012'),  # in metadata
    'heart_rate': ('0018', '1088'),  # in metadata
    'pixel_data': ('7FE0', '0010'),
    'transfer_syntax_uid': ('0002', '0010'),  # in metadata
    'photometric_interpretation': ('0028', '0004'),  # in metadata
    'ultrasound_color_data_present': ('0028', '0014'),  # in metadata
    'acquisition_datetime': ('0008', '002a'),  # in metadata
    'institution_name': ('0008', '0080'),
    'manufacturers_model_name': ('0008', '1090'),  # in metadata
}

In [None]:
# Preview the (group, element) tag pairs stored in dicom_tags.
dicom_tags.values()

Read the metadata in chunks so that the kernel does not crash on the large data volume.

In [None]:
# Keep only the metadata rows whose (tag1, tag2) pair is one of dicom_tags.
# Tags are hex numbers, so they are compared case-insensitively: dicom_tags
# itself mixes upper- and lower-case digits ('601A' vs '002a').
wanted_tags = {(group.lower(), element.lower()) for group, element in dicom_tags.values()}

datalist = []
# dtype=str keeps the tags as text. With type inference pandas may parse an
# all-digit tag such as '0008' as the integer 8, which never equals the string
# tuples in dicom_tags.
for chunk in pd.read_csv(metadata_path, chunksize=1000000, dtype={'tag1': str, 'tag2': str}):
    # 'tags' holds the pairs exactly as they appear in the file.
    chunk['tags'] = list(zip(chunk['tag1'], chunk['tag2']))
    # Lower-cased copy used only for matching; str() also tolerates missing values.
    normalized_tags = chunk['tags'].map(lambda tag: (str(tag[0]).lower(), str(tag[1]).lower()))
    filtered_chunk = chunk[normalized_tags.isin(wanted_tags)]
    datalist.append(filtered_chunk)

In [None]:
# Pivot the first filtered chunk from long to wide: one row per
# (dirname, filename) and one column per (tag1, tag2) pair. A bare .unstack()
# on a frame without a MultiIndex only stacks the columns into one long Series.
# aggfunc='first' keeps the first value when a tag occurs more than once for a
# file, where plain .pivot() would raise on the duplicate entries.
# NOTE(review): assumes the CSV has a 'value' column, as in the author's
# original pivot sketch — confirm.
dp = datalist[0].pivot_table(index=['dirname', 'filename'],
                             columns=['tag1', 'tag2'],
                             values='value',
                             aggfunc='first')

In [None]:
# Distinct (tag1, tag2) pairs that survived the filter in the first chunk.
datalist[0]['tags'].unique()

In [None]:
# Rows whose group number (tag1) equals '0010'. `chunk` here is whatever the
# most recent chunk loop left bound to that name.
is_group_0010 = chunk['tag1'] == '0010'
chunk[is_group_0010]

In [None]:
# Index the first filtered chunk by file, attach the (tag1, tag2) pair as one
# column, then show the rows for tag (0008,0018), which dicom_tags names
# sop_instance_uid.
stackdf = (
    datalist[0]
    .set_index(['dirname', 'filename'])
    .assign(tag_tuple=lambda frame: list(zip(frame['tag1'], frame['tag2'])))
)
is_sop_uid = stackdf['tag_tuple'] == ('0008', '0018')
stackdf[is_sop_uid]

In [None]:
# Open the metadata CSV again as a chunked reader (100,000 rows per chunk),
# reading the four key columns as pandas 'category' dtype.
category_columns = ['dirname', 'filename', 'tag1', 'tag2']
datachunks = pd.read_csv(
    metadata_path,
    chunksize=100000,
    dtype=dict.fromkeys(category_columns, 'category'),
)

In [None]:
# Drain the chunked reader, indexing each chunk by (dirname, filename).
# The reader is consumed by this loop, so re-running the cell without
# re-creating `datachunks` yields an empty list.
chunklist = []
for chunk in datachunks:
    chunk = chunk.set_index(['dirname', 'filename'])
    chunklist.append(chunk)

In [None]:
# Memory footprint in bytes of the third chunk, per column plus the index.
# Raises IndexError if fewer than three chunks were read.
chunklist[2].memory_usage()

In [None]:
# Peek at the first rows of the third chunk.
chunklist[2].head()

In [None]:
# Stack all indexed chunks into a single frame, in read order.
# NOTE(review): categorical columns whose categories differ between chunks may
# come back as object dtype after the concat — check df.dtypes.
df = pd.concat(chunklist)

In [None]:
# Frame indexed by (dirname, filename). The chunks are already indexed by
# these two keys before they are concatenated, so calling set_index again
# raises KeyError because the names are no longer columns. Only re-index when
# the keys are still ordinary columns; otherwise take a copy, so `dindexed`
# stays independent of later changes to `df`, as a set_index result would be.
index_cols = ['dirname', 'filename']
if list(df.index.names) == index_cols:
    dindexed = df.copy()
else:
    dindexed = df.set_index(index_cols)

In [None]:
# Cast every column except the last one to 'category' dtype.
categorical_dtypes = {name: 'category' for name in df.columns[:-1]}
df = df.astype(categorical_dtypes)

In [None]:
# Column dtypes of the first chunk (tag1 and tag2 were requested as 'category'
# when the chunked reader was created).
chunklist[0].dtypes