# Processing Raw Dicom Metadata
*author: Wiebke Toussaint*  

This notebook performs the steps required to add the relevant DICOM tags from the raw metadata CSV to a PostgreSQL database.

In [None]:
import os
import sys
import pandas as pd
from sqlalchemy import create_engine
from json import load
import psycopg2

#### Specify paths and import project modules

In [None]:
# Use the parent of the current working directory as the project root.
# NOTE(review): this presumably assumes the notebook is started from a
# sub-directory of the project (e.g. a notebooks folder) - confirm.
projectdir = os.path.dirname(os.getcwd())
sys.path.append(projectdir) # to use project modules in notebook
datadir = os.path.join(projectdir, 'data')
# CSV that get_meta_lite (defined below) reads in chunks.
metadata_path = os.path.join(datadir, '02_intermediate','dicom_metadata.csv')

# These imports rely on projectdir having been appended to sys.path above,
# so they must stay after the sys.path.append call.
from src.d01_data import dicom_metadata
from src.d00_utils.s3_utils import get_matching_s3_keys
from src.d00_utils.db_utils import save_to_db

#### Functions to add metadata table to database

In [None]:
def dbtable_to_df(db_table, credentials_file):
    """Read every row of a database table through pandas, in chunks.

    Args:
        db_table: Name of the table to read. It is wrapped in double quotes
            and concatenated into the query string as-is.
        credentials_file: Passed to ``_load_db_credentials``; the result must
            provide the keys 'user', 'psswd', 'host' and 'database'.

    Returns:
        The result of ``pd.read_sql_query`` called with ``chunksize=100000``.
        With ``chunksize`` set, pandas yields the rows as an iterator of
        DataFrame chunks rather than one DataFrame, so callers have to
        iterate (or concatenate) the result.

    NOTE(review): ``_load_db_credentials`` is neither defined nor imported in
    the visible part of this notebook - confirm it is in scope before use.
    """
    creds = _load_db_credentials(credentials_file)
    url_fields = [creds[field] for field in ('user', 'psswd', 'host', 'database')]
    engine = create_engine("postgresql://{}:{}@{}/{}".format(*url_fields))

    query = 'select * from "' + db_table + '"'
    return pd.read_sql_query(query, con=engine, chunksize=100000)

In [None]:
def get_meta_lite(dicom_tags, save_to_db=False, credentials_file=None, db_table=None):
    """Reduce the raw DICOM metadata CSV to the rows for the tags of interest.

    The CSV at the module-level ``metadata_path`` is read in chunks to avoid
    the kernel crashing due to the large data volume. Each chunk gets a
    ``tags`` column holding the ``(tag1, tag2)`` pair of every row, and only
    rows whose pair appears among the values of ``dicom_tags`` are kept.

    Args:
        dicom_tags: Dict whose values are ``(tag1, tag2)`` tuples to keep.
        save_to_db: When exactly ``True``, every filtered chunk is also passed
            to the project's ``save_to_db`` helper before being collected.
        credentials_file: Forwarded to the database helper (only used when
            ``save_to_db`` is ``True``).
        db_table: Forwarded to the database helper (only used when
            ``save_to_db`` is ``True``).

    Returns:
        pandas.DataFrame: The filtered chunks concatenated, including the
        added ``tags`` column.

    Raises:
        Whatever the database helper raises; errors are not suppressed.
    """
    # The boolean parameter ``save_to_db`` shadows the project helper of the
    # same name inside this function, so calling ``save_to_db(...)`` here would
    # try to call a bool. Import the helper under a different local name; the
    # import is only attempted when saving was actually requested.
    write_chunk = None
    if save_to_db is True:
        from src.d00_utils.db_utils import save_to_db as write_chunk

    # Loop-invariant: the tag pairs to keep.
    wanted_tags = list(dicom_tags.values())
    column_types = {'dirname': 'category', 'filename': 'category',
                    'tag1': 'category', 'tag2': 'category'}

    filtered_chunks = []
    for chunk in pd.read_csv(metadata_path, chunksize=1000000, dtype=column_types):
        chunk['tags'] = list(zip(chunk['tag1'], chunk['tag2']))
        filtered_chunk = chunk[chunk['tags'].isin(wanted_tags)]
        if write_chunk is not None:
            write_chunk(filtered_chunk, db_table, credentials_file)
            print('saved chunks to db')
        filtered_chunks.append(filtered_chunk)

    return pd.concat(filtered_chunks)

### Generate sample metadata tags

In [None]:
# Collect the S3 keys that get_matching_s3_keys yields for these arguments.
test_files_keys = get_matching_s3_keys('cibercv','99966','.dcm')
test_files = list(test_files_keys)

# Both sample CSVs live in the intermediate data directory.
intermediate_dir = os.path.join(datadir, '02_intermediate')
sample_csv = os.path.join(intermediate_dir, 'dicom_metadata_sample.csv')
descriptions_csv = os.path.join(intermediate_dir, 'dicom_metadata_tag_descriptions.csv')

# Only generate the sample metadata when the CSV is not already on disk.
# NOTE(review): presumably write_dicom_metadata(df, 'sample') writes to
# dicom_metadata_sample.csv - confirm against src.d01_data.dicom_metadata.
if not os.path.exists(sample_csv):
    for s3_key in test_files:
        df_sample = dicom_metadata.get_dicom_metadata('cibercv', s3_key)
        dicom_metadata.write_dicom_metadata(df_sample, 'sample')
sample = pd.read_csv(sample_csv)

# Same pattern for the tag descriptions (metadata fetched with description=True).
if not os.path.exists(descriptions_csv):
    for s3_key in test_files:
        df_desc = dicom_metadata.get_dicom_metadata('cibercv', s3_key, description=True)
        dicom_metadata.write_dicom_metadata(df_desc, 'tag_descriptions')
tag_descriptions = pd.read_csv(descriptions_csv)

In [None]:
# In the description table the 'value' column holds the tag description;
# rename it (in place) so it does not collide with the sample's 'value' column.
tag_descriptions.rename(columns={'value':'description'}, inplace=True)

# Left-join the descriptions onto the sample (no `on` given, so pandas joins on
# the columns both frames share), keep one row per (tag1, tag2) pair and drop
# rows containing missing values.
all_tags = (
    sample.merge(tag_descriptions, how='left')
    .drop_duplicates(subset=['tag1','tag2'])
    .dropna()
)

#### Search sample metadata tag descriptions

In [None]:
# Show the tags whose description contains 'UID'.
all_tags.loc[all_tags['description'].str.contains('UID')]

In [None]:
# Show the tags whose description matches the pattern 'Time|Date'.
all_tags.loc[all_tags['description'].str.contains('Time|Date')]

### Get dicom tags that are of interest

In [None]:
# Tags of interest: readable name -> (tag1, tag2) pair, written as the strings
# that are matched against the tag1/tag2 columns of the metadata.
dicom_tags = {
    'sop_class_uid': ('0008','0016'),
    'rows': ('0028','0010'),
    'columns': ('0028','0011'),
    'number_of_frames': ('0028','0008'),
    'cine_rate': ('0018','0040'),
    'sequence_of_ultrasound_regions': ('0018','6011'),
    'region_location_min_x0': ('0018','6018'),
    'physical_units_x_direction': ('0018','6024'),
    'physical_units_y_direction': ('0018','6026'),
    'region_spatial_format': ('0018','6012'),
    'heart_rate': ('0018', '1088'),
    'transfer_syntax_uid': ('0002','0010'),
    'photometric_interpretation': ('0028','0004'),
    # Left out on purpose: ultrasound_color_data_present=('0028','0014')
    # seems to be an incorrect tag.
    'acquisition_datetime': ('0008', '002a'),
    'manufacturers_model_name': ('0008', '1090'),
    'instance_creation_date': ('0008','0012'),
}

In [None]:
# Database credentials are expected in a JSON file in the user's home directory.
credentials_file = os.path.expanduser(os.path.join('~', '.psql_credentials.json'))

Building the reduced ("lite") metadata table takes several minutes.

In [None]:
# This calls get_meta_lite from the src.d01_data.dicom_metadata module, not the
# notebook-level get_meta_lite defined earlier in this file.
# NOTE(review): only dicom_tags is passed; if the module version has the same
# defaults as the notebook one, nothing is written to the database - confirm.
meta = dicom_metadata.get_meta_lite(dicom_tags)

In [None]:
# Report the number of rows, then preview the first rows.
print(meta.shape[0])
meta.head()

#### Get frequency of unique tag values

In [None]:
# Frequency of each photometric_interpretation value, i.e. rows tagged ('0028','0004').
is_photometric_interpretation = meta['tags']==('0028','0004')
meta.loc[is_photometric_interpretation,'value'].value_counts()

In [None]:
# Frequency of each transfer_syntax_uid value, i.e. rows tagged ('0002','0010').
is_transfer_syntax_uid = meta['tags']==('0002','0010')
meta.loc[is_transfer_syntax_uid,'value'].value_counts()

In [None]:
# number_of_frames: take the values of rows tagged ('0028','0008'), cast them
# to int, and count them in 20 bins.
is_number_of_frames = meta['tags']==('0028','0008')
number_of_frames = meta.loc[is_number_of_frames,'value'].astype('int')
number_of_frames.value_counts(bins=20)

In [None]:
# Frequency of each region_spatial_format value, i.e. rows tagged ('0018','6012').
is_region_spatial_format = meta['tags']==('0018','6012')
meta.loc[is_region_spatial_format,'value'].value_counts()