##### Copyright (C) Microsoft Corporation.  
see license file for details 


## Copies data from blob to local host


Downloads data from Azure Data Storage containers to local disk (the cell below shows the structure of both the local directories and the Azure storage containers).

In [None]:
# Local data structure:
#     /data_dir/chestxray/ChestX-ray8/  
#         BBox_List_2017.csv         
#         Data_Entry_2017.csv  
#         blacklist.csv
#         ChestXray-NIHCC/  
#             /tmp/  
#                 images_01.tar.gz    
#                 images_03.tar.gz    
#                 images_05.tar.gz   
#                 images_07.tar.gz   
#                 images_09.tar.gz   
#                 images_11.tar.gz  
#                 images_02.tar.gz   
#                 images_04.tar.gz   
#                 images_06.tar.gz   
#                 images_08.tar.gz   
#                 images_10.tar.gz   
#                 images_12.tar.gz
#             /images/
#                   00000001_000.png
#                   00000001_001.png
#                   ... 112120 png images
# 
#    Azure Data Storage container structure:
#       container nihchestxrayimages
#                   00000001_000.png
#                   00000001_001.png
#                   ... 112120 png image files, total 41.96 GiB
#       container nihchestxraydatacompressed:
#                   images_01.tar.gz
#                   ... 12 files, total 41.98 GiB
#       container nihchestxraydata
#                   BBox_List_2017.csv
#                   blacklist.csv
#                   Data_Entry_2017.csv

In [1]:
# Allow multiple displays per cell: with ast_node_interactivity set to "all",
# IPython displays the value of every expression statement in a cell instead of
# only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
# Load the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# (Re)load the dotenv IPython extension and record the location of the private
# .env file; the file itself is read later by the %dotenv magic.
%reload_ext dotenv
dotenv_file_path='./../not_shared/general.env'

In [4]:
import os, platform
import sys
import urllib.request

In [5]:
# Display the platform string and the current working directory of the kernel.
platform.platform()
os.getcwd()

'Linux-4.15.0-1063-azure-x86_64-with-debian-stretch-sid'

'/datadrive01/prj/AzureChestXRayNoAML/code/01_DataPrep'

In [6]:
# import utility functions

# Directories to add to sys.path: the 'src' directory next to the current working directory.
paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['..','src'])))]
paths_to_append
def add_path_to_sys_path(path_to_append):
    """Append ``path_to_append`` to ``sys.path`` unless it is already listed.

    Membership is checked against whole ``sys.path`` entries. A substring
    check would wrongly treat ``/a/src`` as present when only ``/a/src_old``
    is on the path, and the directory would then never be added.

    Args:
        path_to_append: Directory path (str) to make importable.

    Returns:
        None. ``sys.path`` is modified in place.
    """
    if path_to_append not in sys.path:
        sys.path.append(path_to_append)
# Register every helper directory on sys.path. A plain loop is used because the
# calls are made only for their side effect; a list comprehension would build
# (and display) a throwaway list of None values.
for crt_path in paths_to_append:
    add_path_to_sys_path(crt_path)

import azure_chestxray_utils

['/datadrive01/prj/AzureChestXRayNoAML/code/01_DataPrep/../src']

[None]

  return f(*args, **kwds)
  return f(*args, **kwds)


In [7]:
# create the file path variables 
# create nih_chest_xray_data_dir (in container dir mapped to a host dir for data persistence), 
# where data will be copied from blob

prj_consts = azure_chestxray_utils.chestxray_consts()

nih_chest_xray_data_dir=os.path.join(os.path.join(*([os.sep]+prj_consts.BASE_INPUT_DIR_list)), 
                                     os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
os.makedirs(nih_chest_xray_data_dir, exist_ok=True)

nih_chest_xray_data_dir
!ls -l {nih_chest_xray_data_dir}

'/data_dir/chestxray/ChestX-ray8/ChestXray-NIHCC'

total 3900
drwxrwxr-x 2 loginvm022 loginvm022 3989504 Sep  4  2017 images
drwxrwxrwx 2 root       root          4096 Nov 16 05:48 tmp


In [8]:
# Check that azcopy is available: run it without arguments, capture its output
# in `response`, and display lines 1-9 of that output.

response = !azcopy
response[1:10]

['azcopy 7.2.0-netcore Copyright (c) 2018 Microsoft Corp. All Rights Reserved.',
 '------------------------------------------------------------------------------',
 '# azcopy is designed for high-performance uploading, downloading, and copying',
 'data to and from Microsoft Azure Blob, and File storage.',
 '',
 '# Command Line Usage:',
 '    azcopy --source <source> --destination <destination> [options]',
 '',
 '# Options:']

In [9]:
# Load the variables defined in the .env file at dotenv_file_path, then read the
# storage account key from the 'sa_key' environment variable.
%dotenv $dotenv_file_path

storage_account_key = os.getenv('sa_key')

###### Copy meta-data  
 - NIH Chest Xray: BBox_List_2017.csv, Data_Entry_2017.csv  
 - Azure Chest Xray: blacklist.csv   

In [10]:
# copy data to local host dir
source_container = 'https://'+os.getenv('sa_name')+'.blob.core.windows.net/'+os.getenv('sa_container_name')
dest_local_dir = os.path.join(nih_chest_xray_data_dir, '..')

!azcopy --quiet \
    --source {source_container}  \
    --destination {dest_local_dir} \
    --source-key {storage_account_key} \
    --verbose \
    --include "BBox_List_2017.csv"

!azcopy --quiet \
    --source {source_container}  \
    --destination {dest_local_dir} \
    --source-key {storage_account_key} \
    --verbose \
    --include "Data_Entry_2017.csv"

!azcopy --quiet \
    --source {source_container}  \
    --destination {dest_local_dir} \
    --source-key {storage_account_key} \
    --verbose \
    --include "blacklist.csv"

[?1h=[2019/11/17 21:12:37.185+00:00] >>>>>>>>>>>>>>>>
[2019/11/17 21:12:37.194+00:00][VERBOSE] Finished: 0 file(s), 0 B; Average Speed:0 B/s.
[2019/11/17 21:12:37.197+00:00][VERBOSE] 7.2.0 : azcopy --quiet --source https://ghiordanazrchestxraysa.blob.core.windows.net/nihchestxraydata --destination /data_dir/chestxray/ChestX-ray8/ChestXray-NIHCC/.. --source-key ****** --verbose --include BBox_List_2017.csv
[2019/11/17 21:12:37.271+00:00][VERBOSE] Attempt to parse address 'https://ghiordanazrchestxraysa.blob.core.windows.net/nihchestxraydata' to a directory as a candidate location succeeded.
[2019/11/17 21:12:37.273+00:00][VERBOSE] Source is interpreted as a Cloud blob directory: https://ghiordanazrchestxraysa.blob.core.windows.net/nihchestxraydata.
[2019/11/17 21:12:37.275+00:00][VERBOSE] Attempt to parse address '/data_dir/chestxray/ChestX-ray8/ChestXray-NIHCC/..' to a directory as a candidate location succeeded.
[2019/11/17 21:12:37.277+00:00][VERBOSE] Destination is interpreted as 

###### Copy NIH Chest Xray  compressed image data
Make:
```
copy_NIH_compressed_data_FLAG = False
``` 
to skip the next 2 cells and instead use the cells after them to download the uncompressed images.

Make:
```
copy_NIH_compressed_data_FLAG = True
```
if you want to download the compressed data and uncompress it locally in the next 2 cells. 

In [11]:
copy_NIH_compressed_data_FLAG = False

if copy_NIH_compressed_data_FLAG:
    tmp_dir = os.path.join(nih_chest_xray_data_dir, 'tmp')
    tmp_dir
    os.makedirs(tmp_dir, exist_ok=True)

    compressed_data_container = 'https://'+os.getenv('sa_name')+'.blob.core.windows.net/'+os.getenv('sa_container_name_compressed_data')

    answer = !azcopy --quiet \
    --source {compressed_data_container}  \
    --destination {tmp_dir} \
    --source-key {crt_key} \
    --recursive \
    --verbose \
    --include "images_"
    
    print(answer[-7:])
    

The azcopy download process above will run for several minutes. 

Make sure you have these file sizes when it finishes:  
total __44023372__  
-rw-r--r-- 1 1000 users __2008470987__ Sep 26 17:19 images_001.tar.gz  
-rw-r--r-- 1 1000 users __3952623504__ Sep 26 19:05 images_002.tar.gz  
-rw-r--r-- 1 1000 users __3929234850__ Sep 26 17:22 images_003.tar.gz  
-rw-r--r-- 1 1000 users __3838903983__ Sep 26 17:25 images_004.tar.gz  
-rw-r--r-- 1 1000 users __3935496531__ Sep 26 17:22 images_005.tar.gz  
-rw-r--r-- 1 1000 users __3986301172__ Sep 26 17:25 images_006.tar.gz  
-rw-r--r-- 1 1000 users __4016328426__ Sep 26 19:05 images_007.tar.gz  
-rw-r--r-- 1 1000 users __4018347353__ Sep 26 19:05 images_008.tar.gz  
-rw-r--r-- 1 1000 users __4111327929__ Sep 26 19:05 images_009.tar.gz  
-rw-r--r-- 1 1000 users __4181556296__ Sep 26 19:05 images_010.tar.gz  
-rw-r--r-- 1 1000 users __4187084020__ Sep 26 19:05 images_011.tar.gz  
-rw-r--r-- 1 1000 users __2914187733__ Sep 26 17:21 images_012.tar.gz  
  
If the unzipping process below does not end up with __112120__ .png images, you may need to repeat the download process for the above 12 .tar.gz files.

The uncompressing process below will run for several minutes.

While it runs, you can monitor the progress by ssh-ing 
into your compute context machine and running:   
find /data_dir/chestxray/ChestX-ray8/ChestXray-NIHCC/images  -type f | wc -l  

The final number of images is:  
__112120__  



In [12]:
if copy_NIH_compressed_data_FLAG:
    print(nih_chest_xray_data_dir)
    print(tmp_dir)
    !ls -l $tmp_dir
    # !find $nih_chest_xray_data_dir -type f | wc -l

    old_crt_dir = os.getcwd()
    os.chdir(nih_chest_xray_data_dir)
    !pwd    
    !ls -l $tmp_dir/*.tar.gz 
    allImages = ! cat $tmp_dir/*.tar.gz | tar -zxf - -i
    # !for file in $tmp_dir/*.tar.gz; do tar -zxf $file; done
    os.chdir(old_crt_dir)
    !pwd

##### Download uncompressed data (112120 .png images like 00000001_000.png) to local host dir

In [13]:
if not copy_NIH_compressed_data_FLAG:
    source_container = 'https://'+os.getenv('sa_name')+'.blob.core.windows.net/'+os.getenv('sa_container_name_images')
    dest_local_dir = os.path.join(nih_chest_xray_data_dir, 'images')

    !azcopy --quiet \
        --source {source_container}  \
        --destination {dest_local_dir} \
        --source-key {storage_account_key} \
        --recursive

[?1h=[6nFinished: 0 file(s), 0 B; Average Speed:0 B/s.                                 [6n[1;1H[6nFinished: 0 file(s), 0 B; Average Speed:0 B/s.                                 [6n[1;1H[6nFinished: 0 file(s), 0 B; Average Speed:0 B/s.                                 [6n[1;1H[6nFinished: 0 file(s), 0 B; Average Speed:0 B/s.                                 [6n[1;1H[6nFinished: 0 file(s), 361.92 KB; Average Speed:39.03 KB/s.                      [6n[1;1H[6nFinished: 1932 file(s), 748.54 MB; Average Speed:66.06 MB/s.                   [6n[1;1H[6nFinished: 3926 file(s), 1.476 GB; Average Speed:113 MB/s.                      [6n[1;1H[6nFinished: 5990 file(s), 2.239 GB; Average Speed:148.64 MB/s.                   [6n[1;1H[6nFinished: 8147 file(s), 3.027 GB; Average Speed:177.39 MB/s.                   [6n[1;1H[6nFinished: 10207 file(s), 3.788 GB; Average Speed:198.66 MB/s.                  [6n[1;1H[6nFinished: 12362 file(s), 4.585 GB; Average Speed:217.74 MB/

Finished: 66719 file(s), 24.539 GB; Average Speed:138.59 MB/s.                 [6n[1;1H[6nFinished: 67350 file(s), 24.776 GB; Average Speed:138.37 MB/s.                 [6n[1;1H[6nFinished: 67920 file(s), 24.988 GB; Average Speed:138.01 MB/s.                 [6n[1;1H[6nFinished: 68276 file(s), 25.121 GB; Average Speed:137.23 MB/s.                 [6n[1;1H[6nFinished: 68609 file(s), 25.246 GB; Average Speed:136.42 MB/s.                 [6n[1;1H[6nFinished: 69037 file(s), 25.407 GB; Average Speed:135.83 MB/s.                 [6n[1;1H[6nFinished: 69286 file(s), 25.503 GB; Average Speed:134.89 MB/s.                 [6n[1;1H[6nFinished: 69819 file(s), 25.702 GB; Average Speed:134.52 MB/s.                 [6n[1;1H[6nFinished: 70318 file(s), 25.89 GB; Average Speed:134.1 MB/s.                   [6n[1;1H[6nFinished: 70690 file(s), 26.028 GB; Average Speed:133.43 MB/s.                 [6n[1;1H[6nFinished: 71106 file(s), 26.18 GB; Average Speed:132.85 MB/s.         

[1;1H[6nFinished: 101175 file(s), 37.77 GB; Average Speed:106.38 MB/s.                 [6n[1;1H[6nFinished: 101504 file(s), 37.899 GB; Average Speed:106.14 MB/s.                [6n[1;1H[6nFinished: 101907 file(s), 38.054 GB; Average Speed:105.98 MB/s.                [6n[1;1H[6nFinished: 102304 file(s), 38.208 GB; Average Speed:105.82 MB/s.                [6n[1;1H[6nFinished: 102676 file(s), 38.354 GB; Average Speed:105.64 MB/s.                [6n[1;1H[6nFinished: 103084 file(s), 38.508 GB; Average Speed:105.48 MB/s.                [6n[1;1H[6nFinished: 103486 file(s), 38.664 GB; Average Speed:105.34 MB/s.                [6n[1;1H[6nFinished: 103662 file(s), 38.732 GB; Average Speed:104.95 MB/s.                [6n[1;1H[6nFinished: 104206 file(s), 38.943 GB; Average Speed:104.95 MB/s.                [6n[1;1H[6nFinished: 104684 file(s), 39.129 GB; Average Speed:104.89 MB/s.                [6n[1;1H[6nFinished: 105165 file(s), 39.315 GB; Average Speed:104.82 MB

In [14]:
 
# Convert the notebook file 001_get_data_B.ipynb to HTML.
!jupyter nbconvert --to html 001_get_data_B.ipynb

[NbConvertApp] Converting notebook 001_get_data_B.ipynb to html
[NbConvertApp] Writing 291207 bytes to 001_get_data_B.html
