# Data Download and Pre-processing

In [None]:
# Reload the autoreload extension and re-import modules before each cell runs,
# so edits to the package are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

import yaml
import os

In [None]:
from usal_echo import bucket, dcm_dir, img_dir, segmentation_dir, model_dir, classification_model
from usal_echo.d00_utils.db_utils import *
from usal_echo.d01_data.ingestion_dcm import ingest_dcm
from usal_echo.d01_data.ingestion_xtdb import ingest_xtdb
from usal_echo.d02_intermediate.clean_dcm import clean_dcm_meta
from usal_echo.d02_intermediate.clean_xtdb import clean_tables
from usal_echo.d02_intermediate.filter_instances import filter_all
from usal_echo.d02_intermediate.download_dcm import _downsample_train_test, s3_download_decomp_dcm, dcmdir_to_jpgs_for_classification

## Ingest dicom metadata and Xcelera csv files
Retrieve data from s3 bucket. These functions write to database schema `.raw`.

In [None]:
# Dicom metadata ingestion is disabled by default because it takes ~3 days to run.
# Uncomment the next line to run it.
#ingest_dcm(bucket)
# Ingest the Xcelera csv files from the s3 bucket.
ingest_xtdb(bucket)

## Clean dicom metadata and Xcelera database tables
These functions write to database schema `.clean`.

In [None]:
# Clean the ingested dicom metadata.
clean_dcm_meta()
# Clean the ingested Xcelera database tables.
clean_tables()

## Filter study instances
This function writes to database schema `.views`.

The following tables are created:  
* **views.machines_all_bmi**: list of all studies in db; columns: studyidk, machine type and bmi  
* **views.machines_new_bmi**: same as machines_all_bmi, but only includes studies with new machines (_i.e. machine types ECOEPIQ2, EPIQ7-1, ECOIE33, AFFINITI_1, AFFINITI_2_) 
* **views.instances_unique_master_list**: a list of unique instances in the database (_unique means that instances with naming conflicts (e.g. duplicate instanceidk's) have been removed_)
* **views.frames_w_labels**: all frames with labels plax, a4c, a2c
* **views.frames_sorted_by_views_temp**: intermediate table; used by other scripts
* **views.instances_w_conflicts**: instances to avoid
* **views.instances_w_labels**: all instances which are labeled plax, a4c, a2c   
    Assumption: if a frame has a view label, other frames within that instance correspond to the same view. This excludes instances which have more than one frame with conflicting labels
    
<font color='red'>All subsequent processes use **views.instances_w_labels**, which is the ground truth for classification.</font> 

In [None]:
# Build the filtered study-instance tables in the `views` schema, including
# views.instances_w_labels (the ground truth for classification).
filter_all()

## Check that tables have been created

In [None]:
# List the tables reachable through the raw database handle to confirm that
# ingestion created them.
# NOTE(review): presumably dbReadWriteRaw targets the `raw` schema - confirm in db_utils.
io_raw = dbReadWriteRaw()
io_raw.list_tables()

In [None]:
# List the tables reachable through the clean database handle to confirm that
# the cleaning step created them.
# NOTE(review): presumably dbReadWriteClean targets the `clean` schema - confirm in db_utils.
io_clean = dbReadWriteClean()
io_clean.list_tables()

In [None]:
# Spot check: load one cleaned table and display its first two entries.
dm_spain_view_study_summary = io_clean.get_table('dm_spain_view_study_summary')
dm_spain_view_study_summary[:2]

In [None]:
# List the tables reachable through the views database handle to confirm that
# the filtering step created them.
# NOTE(review): presumably dbReadWriteViews targets the `views` schema - confirm in db_utils.
io_views = dbReadWriteViews()
io_views.list_tables()

In [None]:
# Load the instances_w_labels table (the classification ground truth) and
# preview its first rows.
groundtruth_classification = io_views.get_table('instances_w_labels')
groundtruth_classification.head()

## Download and decompress dicom files

In [None]:
# Download dicom files from the s3 bucket and decompress them into dcm_dir.
# NOTE(review): downsample_ratio=0.0001 presumably keeps only a tiny sample of
# the instances and train_test_ratio=0.5 an even split - confirm in download_dcm.
s3_download_decomp_dcm(
    train_test_ratio=0.5,
    downsample_ratio=0.0001,
    dcm_dir=dcm_dir,
    bucket=bucket,
)