# Notebook to test downloading DICOM images
*author: Wiebke Toussaint*

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import boto3
import tempfile
from time import time

# The project root is taken to be the parent of the current working directory;
# its ``src`` folder is appended to the import path so the project packages
# imported below can be resolved.
projectdir = os.path.split(os.getcwd())[0]
src_dir = os.path.join(projectdir, 'src')
sys.path.append(src_dir)

from d01_data.ingestion_xtdb import ingest_xtdb
from d02_intermediate.clean_xtdb import *
from d00_utils.db_utils import *
from d00_utils.s3_utils import *

# Name of the database table this notebook refers to.
db_table = 'metadata'

# The credentials file is looked up in the current user's home directory.
home_dir = os.path.expanduser('~')
credentials_file = os.path.join(home_dir, '.psql_credentials.json')

In [None]:
# Create the database helper used by the cells below to read tables.
# NOTE(review): dbReadWriteViews is not defined in this notebook; it presumably
# comes from one of the star imports above (d00_utils.db_utils?) - confirm.
io_views = dbReadWriteViews()
# Left as a bare expression so the notebook displays the result, which is
# presumably the list of available tables.
io_views.list_tables()

In [None]:
def _local_dcm_name(key, suffix='.dcm'):
    """
    Build the flat local file name for a downloaded S3 object.

    The suffix is stripped from the key, every '/a' separator is removed and the
    result is wrapped as 'a_<...>.dcm', e.g. '123/a_456.dcm' -> 'a_123_456.dcm'.

    :param key: S3 object key.
    :param suffix: Key suffix to strip before building the name.
    :return: file name (without any directory part prepended).
    """
    stem = key[:-len(suffix)] if suffix and key.endswith(suffix) else key
    return 'a_' + ''.join(stem.split('/a')) + '.dcm'


def extract_imgs_from_dicom(bucket, prefix='', suffix='.dcm', outdir=None):
    """
    Download every object with a given prefix and suffix from an S3 bucket.

    The bucket listing is paginated; each matching object is saved into
    ``outdir`` under a flat name derived from its own key (see
    ``_local_dcm_name``). Downloading is best effort: a failed download is
    reported on stdout and the remaining objects are still attempted.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with this prefix (optional).
    :param suffix: Only fetch objects whose keys end with this suffix, default='.dcm'
    :param outdir: Directory the files are written to; it is created if missing.
        Defaults to the current working directory when None.
    :return: list of all matching keys, including keys whose download failed
    """
    s3 = boto3.client('s3')

    keys = []
    kwargs = {'Bucket': bucket}

    if outdir is None:
        outdir = os.getcwd()
    os.makedirs(outdir, exist_ok=True)

    # Let S3 pre-filter by prefix so only relevant keys are listed.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        resp = s3.list_objects_v2(**kwargs)

        contents = resp.get('Contents')
        if contents is None:
            # A page without 'Contents' means nothing matched the prefix; keep
            # reporting it so prefixes that yield no files stay visible.
            print(prefix, 'DOWNLOAD ERROR')
            contents = []

        for obj in contents:
            key = obj['Key']
            if not (key.startswith(prefix) and key.endswith(suffix)):
                continue
            keys.append(key)
            # Name the file after the object's own key, not after any variable
            # of the calling scope, so several matches never overwrite each other.
            target = os.path.join(outdir, _local_dcm_name(key, suffix))
            try:
                s3.download_file(bucket, key, target)
            except Exception as err:
                # Best effort: report the failure and carry on with the next object.
                print(prefix, 'DOWNLOAD ERROR', key, repr(err))

        # Follow the pagination token until the listing is exhausted.
        token = resp.get('NextContinuationToken')
        if token is None:
            break
        kwargs['ContinuationToken'] = token

    return keys

### Download Sample DICOM Images

In [None]:
# Read the two sample instance tables through the database helper.
instances_with_labels_scrappy100 = io_views.get_table(
    'instances_with_labels_scrappy100'
)
instances_labeled_other_scrappy100 = io_views.get_table(
    'instances_labeled_other_scrappy100'
)

# One S3 key prefix per row: '<studyidk>/a_<instancefilename>'.
labelled_study = instances_with_labels_scrappy100['studyidk'].astype(str)
labelled_file = instances_with_labels_scrappy100['instancefilename'].astype(str)
prefix_labelled = labelled_study + '/a_' + labelled_file

other_study = instances_labeled_other_scrappy100['studyidk'].astype(str)
other_file = instances_labeled_other_scrappy100['instancefilename'].astype(str)
prefix_other = other_study + '/a_' + other_file

# Local target directories for the two samples.
lablelled_datadir = os.path.expanduser('/home/ubuntu/data/01_raw/dcm_sample_labelled')
other_datadir = os.path.expanduser('/home/ubuntu/data/01_raw/dcm_sample_other')

In [None]:
# Download the labelled sample first, then the "other" sample, each into its
# own directory.
# NOTE(review): keep the loop variable named `p` unless extract_imgs_from_dicom
# is confirmed not to read a module-level `p`.
download_jobs = (
    (prefix_labelled, lablelled_datadir),
    (prefix_other, other_datadir),
)
for prefixes, target_dir in download_jobs:
    for p in prefixes.values:
        extract_imgs_from_dicom('cibercv', prefix=p, suffix='.dcm', outdir=target_dir)