# Experimental notebook for testing data retrieval from S3
*author: Wiebke Toussaint*

The functions tested in this notebook have been integrated into `usal_echo.d00_utils.xxx` and `usal_echo.d01_data.dicom_processing`.

### Useful links  
[boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucket)  
[s3fs](https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem.metadata)  
[gdcmdump](http://gdcm.sourceforge.net/wiki/index.php/Gdcmdump)

In [None]:
import os
import sys
import time
import pandas as pd

import boto3
import s3fs

import tempfile
import io

# The project root is taken to be the parent of the current working
# directory; its src/ folder is appended to the import search path.
projectdir = os.path.split(os.getcwd())[0]
sys.path.append(os.path.join(projectdir, 'src'))

## Getting started with boto3

In [None]:
# Low-level client: collect every object in the bucket.
# list_objects_v2 returns at most 1000 keys per call, so a single call only
# sees the first page. Walk all pages with a paginator instead; a page without
# any objects has no 'Contents' entry, hence the .get() default.
s3_client = boto3.client('s3')
s3_paginator = s3_client.get_paginator('list_objects_v2')
all_objects = [
    obj
    for page in s3_paginator.paginate(Bucket='cibercv')
    for obj in page.get('Contents', [])
]  # list of per-object metadata dicts gathered from all pages
all_files = [d['Key'] for d in all_objects]  # get a list of all files in bucket

# High-level resource handle for the same bucket.
s3_resource = boto3.resource('s3')
s3_bucket = s3_resource.Bucket('cibercv')

In [None]:
# Keep only the keys ending in '.dcm', then report how many keys there are
# in total versus how many of them are DICOM files.
dicom_files = [key for key in all_files if key.endswith('.dcm')]
for label, names in (('all_files', all_files), ('dicom_files', dicom_files)):
    print(label, len(names))
# Display the last DICOM key as the cell output.
dicom_files[-1]

In [None]:
def _parse_gdcmdump_line(one_line, description=False):
    """Extract ``(tag1, tag2, value)`` from a single line of gdcmdump output.

    :param one_line: One raw line of the dump.
    :param description: If falsy, ``value`` is the text between column 15 and
        the first '#'; if truthy, ``value`` is the text following the first '#'.
    :return: A ``(tag1, tag2, value)`` tuple, or ``None`` for blank lines and
        lines starting with '#'.
    """
    clean_line = one_line.replace(']', '').strip()
    if not clean_line or clean_line.startswith('#'):
        return None

    # Fixed-width slices; assumes the line starts with a "(gggg,eeee)" tag
    # -- TODO confirm against the gdcmdump output format.
    tag1 = clean_line[1:5]
    tag2 = clean_line[6:10]

    hash_pos = clean_line.find('#')
    if description:
        # Without a '#' there is no description; str.find's -1 must not be
        # used as an offset, it would return most of the line instead.
        value = clean_line[hash_pos + 2:].strip() if hash_pos != -1 else ''
    else:
        # Without a '#' the value runs to the end of the line; slicing up to
        # -1 would silently drop its last character.
        value_end = hash_pos if hash_pos != -1 else len(clean_line)
        value = clean_line[15:value_end].strip().replace('[', '')

    return tag1, tag2, value


def get_dicom_metadata(bucket, file_path, description=False):
    """Download one file from S3 and return its gdcmdump output as a table.

    The file is downloaded to a temporary file, dumped with the ``gdcmdump``
    command line tool, and the dump is parsed line by line.

    :param bucket: Name of the S3 bucket.
    :param file_path: Key of the file in the bucket. The part before the first
        '/' becomes ``dirname``; the second '/'-separated part, up to its first
        '.', becomes ``filename``.
    :param description: If falsy (default) the ``value`` column holds the tag
        values; if truthy it holds the text after the '#' on each line.
    :return: DataFrame with columns ``dirname, filename, tag1, tag2, value``;
        duplicate rows are dropped, as are rows whose value contains
        'no value', is empty, or is 50 or more characters long.
    :raises IndexError: If ``file_path`` contains no '/'.
    :raises FileNotFoundError: If the ``gdcmdump`` executable cannot be found.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    s3 = boto3.client('s3')

    # The context manager guarantees the temporary download is removed.
    with tempfile.NamedTemporaryFile() as tmp:
        s3.download_file(bucket, file_path, tmp.name)
        # Argument list (no shell) and in-memory capture: no quoting problems
        # and no shared 'temp.txt' that repeated calls would overwrite.
        # The exit status is deliberately not checked; whatever was dumped
        # is parsed.
        dump = subprocess.run(
            ['gdcmdump', tmp.name],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )

    dir_name = file_path.split('/')[0]
    file_name = file_path.split('/')[1].split('.')[0]

    meta = []
    for one_line in dump.stdout.split('\n'):
        parsed = _parse_gdcmdump_line(one_line, description)
        if parsed is not None:
            meta.append([dir_name, file_name] + list(parsed))

    df = pd.DataFrame.from_records(meta, columns=['dirname','filename','tag1','tag2','value'])
    df_dedup = df.drop_duplicates(keep='first')
    df_dedup_goodvals = df_dedup[~df_dedup.value.str.contains('no value')]
    value_len = df_dedup_goodvals['value'].str.len()
    df_dedup_goodvals_short = df_dedup_goodvals[(value_len > 0) & (value_len < 50)]

    return df_dedup_goodvals_short

In [None]:
# code from https://alexwlchan.net/2018/01/listing-s3-keys-redux/

def get_matching_s3_objects(bucket, prefix='', suffix=''):
    """
    Lazily yield the object records of an S3 bucket that pass a key filter.

    :param bucket: Name of the S3 bucket.
    :param prefix: Yield only objects whose key starts with this prefix
        (optional). A plain string is also sent to S3 as the listing prefix.
    :param suffix: Yield only objects whose key ends with this suffix
        (optional).
    """
    client = boto3.client('s3')
    request = {'Bucket': bucket}

    # Only a single string prefix can be handed to the S3 API; anything else
    # is filtered on the client side below.
    if isinstance(prefix, str):
        request['Prefix'] = prefix

    more_pages = True
    while more_pages:
        page = client.list_objects_v2(**request)

        # A response without 'Contents' lists no objects: stop generating.
        if 'Contents' not in page:
            return

        for record in page['Contents']:
            name = record['Key']
            if name.startswith(prefix) and name.endswith(suffix):
                yield record

        # Listings are paginated; carry the continuation token into the next
        # request until the response no longer provides one.
        if 'NextContinuationToken' in page:
            request['ContinuationToken'] = page['NextContinuationToken']
        else:
            more_pages = False


def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Lazily yield the keys of the matching objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Yield only keys starting with this prefix (optional).
    :param suffix: Yield only keys ending with this suffix (optional).
    """
    matching = get_matching_s3_objects(bucket, prefix, suffix)
    yield from (record['Key'] for record in matching)

In [None]:
def write_dicom_metadata(df, metadata_file_name=None):
    """Append a metadata table to a csv file under ~/data_usal/02_intermediate.

    The target file is ``dicom_metadata.csv``, or
    ``dicom_metadata_<metadata_file_name>.csv`` when a name is given. The file
    is created with a header row if it does not exist yet; otherwise the rows
    are appended without a header.

    :param df: DataFrame to save. The first two cells of its first row are
        echoed in the confirmation message.
    :param metadata_file_name: Optional suffix for the csv file name.
    :return: None.
    """
    # An empty table has no first row to report on; skip it rather than fail
    # with an IndexError in the confirmation message below.
    if df.empty:
        print('no dicom metadata to save')
        return

    # Save metadata as csv file
    data_path = os.path.join(os.path.expanduser('~'), 'data_usal', '02_intermediate')
    os.makedirs(data_path, exist_ok=True)

    if metadata_file_name is None:
        csv_name = 'dicom_metadata.csv'
    else:
        csv_name = 'dicom_metadata_' + str(metadata_file_name) + '.csv'
    dicom_meta_path = os.path.join(data_path, csv_name)

    # Write the header only when the file is created; later calls append.
    is_new_file = not os.path.isfile(dicom_meta_path)
    if is_new_file:
        print('Creating new metadata file')
    df.to_csv(
        dicom_meta_path,
        mode='w' if is_new_file else 'a',
        index=False,
        header=is_new_file,
    )

    print('dicom metadata saved for study {}, directory {}'.format(df.iloc[0,0], df.iloc[0,1]))

In [None]:
# Build the (lazy) key generator for '.dcm' keys under prefix '100155';
# nothing is listed until it is iterated.
s3keys = get_matching_s3_keys(bucket='cibercv', prefix='100155', suffix='.dcm')

In [None]:
# Time the full pipeline for every '.dcm' key under prefix '100000':
# extract the metadata of each file and append it to the 'test' csv.
t1 = time.time()
for key in get_matching_s3_keys(bucket='cibercv', prefix='100000', suffix='.dcm'):
    df = get_dicom_metadata(bucket='cibercv', file_path=key)
    write_dicom_metadata(df, metadata_file_name='test')
#os.remove('temp.txt')
t2 = time.time()
# Elapsed wall-clock time in seconds.
print(t2 - t1)

In [None]:
# Load the saved test metadata.
# The tag columns are read as strings: with type inference, a chunk whose tags
# happen to be all digits can be parsed as integers, which drops leading zeros
# ('0008' -> 8) and breaks comparisons against string tags such as 'fffe'.
# NOTE(review): write_dicom_metadata saves under ~/data_usal/02_intermediate,
# while this reads from <projectdir>/data/02_intermediate -- confirm that both
# refer to the same location.
tf = pd.read_csv(
    os.path.join(projectdir,'data','02_intermediate','dicom_metadata_test.csv'),
    dtype={'tag1': str, 'tag2': str},
)

In [None]:
# Pair the two tag columns row by row into a single (tag1, tag2) tuple column.
tag_pairs = zip(tf['tag1'], tf['tag2'])
tf['tags'] = list(tag_pairs)

In [None]:
# Display the rows whose tag pair equals ('fffe','e000').
tag_mask = tf['tags'] == ('fffe','e000')
tf[tag_mask]

## Getting started with s3fs

In [None]:
# Open an s3fs filesystem handle and list the top level of the bucket.
s3 = s3fs.S3FileSystem()
# Note: this rebinds `s3_bucket`, which the boto3 cell earlier in this
# notebook assigned to a boto3 Bucket resource; from here on it is a string.
s3_bucket = 's3://cibercv/' 
s3_studies = s3.ls(s3_bucket)
# Display the first 15 entries of the listing as the cell output.
s3_studies[:15]

In [None]:
# Look at the fourth entry of the bucket listing: get a URL for it and list
# what it contains.
# The listing is bound to `s3_studies`; `s3_study_paths` is not defined
# anywhere in this notebook and raised a NameError here.
# Note: `all_files` is rebound here (the boto3 cell earlier also assigns it).
dirpath = s3.url(s3_studies[3])
all_files = s3.ls(s3_studies[3])
all_files

In [None]:
# NOTE(review): `s3_study_paths` is not assigned in any visible cell (the
# bucket listing is bound to `s3_studies`) -- confirm which name is intended.
# NOTE(review): get_dicom_metadata hands its first argument to
# s3.download_file as the bucket and splits its second argument on '/';
# here a URL and a listing entry are passed instead -- presumably an
# exploratory call, verify before reuse.
meta = get_dicom_metadata(dirpath, s3_study_paths[3])

In [None]:
# Run gdcmdump on `dirpath` and store its output in temp.txt.
# `dirpath` comes from s3.url(...); a URL may contain characters such as '&'
# and '?' that a shell would interpret, so pass it as a single argument
# without a shell and redirect stdout from Python instead.
import subprocess

with open('temp.txt', 'w') as dump_file:
    exit_code = subprocess.call(['gdcmdump', dirpath], stdout=dump_file)
# Show the exit status of gdcmdump as the cell output.
exit_code