# Chequeo de Descarga

In [1]:
#!/usr/bin/env python3

"""Check the md5sum of the zipped SARFish dataset products.

! ./check_SARFish_md5sum.py
"""

from hashlib import md5
from pathlib import Path
from typing import Union, Tuple, List
import yaml

import pandas as pd

def get_md5sum(
        scene_id: str, file_path: Path, chunk_size: int = 1 << 20
    ) -> Tuple[str, str]:
    """Compute the md5 hex digest of a file and report it on stdout.

    The file is hashed incrementally in blocks of ``chunk_size`` bytes so
    that memory use stays bounded regardless of the size of the file.

    Args:
        scene_id: Identifier echoed in the progress line and returned
            unchanged as the first element of the result.
        file_path: Path of the file to hash; opened in binary mode.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        A ``(scene_id, file_md5sum)`` tuple where ``file_md5sum`` is the
        hexadecimal md5 digest of the file contents.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    digest = md5()
    with open(file_path, "rb") as f:
        # read() returns b"" at end of file, which stops the iterator.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    file_md5sum = digest.hexdigest()
    print(f"scene_id: {scene_id}".ljust(30) + f"file_md5sum: {file_md5sum}")
    return scene_id, file_md5sum

def map_SARFish_md5sum(
        df: pd.DataFrame, SARFish_root_directory: Path
    ) -> pd.DataFrame:
    """Annotate ``df`` with archive paths, existence flags and md5sums.

    For each product type ("GRD", then "SLC") three columns are written
    onto ``df`` itself:

    * ``<type>_file_path``: ``SARFish_root_directory/<type>/<DATA_PARTITION>/
      <<type>_product_identifier>.SAFE.zip`` for every row.
    * ``does_<type>_exist``: whether that path is an existing file.
    * ``computed_<type>_md5sum``: the digest returned by ``get_md5sum`` for
      rows whose file exists, looked up by ``scene_id``; left empty for the
      other rows, and ``None`` everywhere when no file of that type exists.

    Args:
        df: Table providing the ``scene_id``, ``DATA_PARTITION`` and
            ``<type>_product_identifier`` columns. It is modified in place.
        SARFish_root_directory: Directory under which the per-type product
            folders are looked up.

    Returns:
        The same ``df`` object, with the columns described above added.
    """
    print("checking md5sums:")
    for product_type in ["GRD", "SLC"]:
        print(f"\nproduct type: {product_type}")
        path_column = f'{product_type}_file_path'
        exists_column = f'does_{product_type}_exist'
        md5sum_column = f'computed_{product_type}_md5sum'
        identifier_column = f'{product_type}_product_identifier'

        def archive_path(row) -> Path:
            # Location where the zipped product for this row is expected.
            return Path(
                SARFish_root_directory, product_type, row['DATA_PARTITION'],
                f"{row[identifier_column]}.SAFE.zip"
            )

        df.loc[:, path_column] = df.apply(archive_path, axis = 1)
        df.loc[:, exists_column] = df[path_column].apply(
            lambda path: path.is_file()
        )
        df.loc[:, md5sum_column] = None
        if not df[exists_column].any():
            # Nothing of this product type is on disk, so nothing to hash.
            continue

        downloaded = df.loc[df[exists_column]]
        # get_md5sum yields (scene_id, digest) pairs, which dict() turns
        # into a scene_id -> digest lookup table.
        md5sum_by_scene = dict(
            get_md5sum(scene_id, file_path)
            for scene_id, file_path in zip(
                downloaded['scene_id'], downloaded[path_column]
            )
        )
        df.loc[:, md5sum_column] = df['scene_id'].map(md5sum_by_scene)

    return df

def check_SARFish_md5sum(
        df: pd.DataFrame, SARFish_root_directory: Path
    ):
    """Print, per product type, which archives match their expected md5sum.

    ``df`` is first passed through ``map_SARFish_md5sum``. Then, for "GRD"
    and "SLC" in turn, three lists of file paths are printed: products
    whose computed md5sum equals the ``<type>_md5sum`` column, products
    that exist but do not match, and products whose file does not exist.

    Args:
        df: Table with the columns ``map_SARFish_md5sum`` reads plus the
            expected digests in ``GRD_md5sum`` and ``SLC_md5sum``.
        SARFish_root_directory: Directory under which the product archives
            are looked up.

    Returns:
        None. The report is written to stdout.
    """
    df = map_SARFish_md5sum(df, SARFish_root_directory)
    for product_type in ["GRD", "SLC"]:
        file_paths = df[f'{product_type}_file_path']
        exists = df[f'does_{product_type}_exist']
        md5sum_matches = (
            df[f'computed_{product_type}_md5sum'] ==
            df[f'{product_type}_md5sum']
        )
        report_sections = [
            (f"\n{product_type} products matching md5sum:",
             exists & md5sum_matches),
            (f"\n{product_type} products NOT matching md5sum:",
             exists & ~md5sum_matches),
            (f"\n{product_type} products NOT downloaded:\n",
             ~exists),
        ]
        for heading, selected in report_sections:
            print(heading)
            # Iterate over the selected paths directly instead of calling
            # DataFrame.apply for its side effects: apply builds a result
            # that is thrown away and, on an empty selection, may still
            # invoke the callback on a placeholder row.
            for file_path in file_paths[selected]:
                print(file_path)

In [9]:
# Dataset location as a plain string, relative to the directory the notebook
# runs in; used below to build the path of the correspondences CSV.
data_path =  "../../data/SARFish"

In [None]:
# `config` is never defined in this notebook, so looking up
# config["SARFish_root_directory"] raised NameError on a fresh kernel.
# Derive the root directory from `data_path` instead.
SARFish_root_directory = Path(data_path)

In [7]:
# Root directory handed to check_SARFish_md5sum; the GRD/ and SLC/ product
# folders are looked up underneath it.
SARFish_root_directory = Path("../../data/SARFish")

In [11]:
# Load the correspondences table from the labels/ folder of the dataset.
xView3_SLC_GRD_correspondences = pd.read_csv(
    Path(data_path, "labels", "xView3_SLC_GRD_correspondences.csv")
)

In [12]:
# Run the check on a deep copy (the default for DataFrame.copy) so the loaded
# table is not modified by the columns the check adds.
check_SARFish_md5sum(
    df = xView3_SLC_GRD_correspondences.copy(),
    SARFish_root_directory = SARFish_root_directory,
)

checking md5sums:

product type: GRD
scene_id: b5857d9d4719c304p   file_md5sum: 6241265105398ae752cf7f0017c67e23
scene_id: 0d30f9dfc2891b6bp   file_md5sum: ac57a8cc17699c9e5d5773a17247fbf4
scene_id: dfa80e10f1e1e51cp   file_md5sum: 7c1030067010dafb31e7df0acc4b9222
scene_id: b0b4729bdfa985b4p   file_md5sum: 58043bfc5a947cd4071a65cabec5eed2
scene_id: 66a20d213523e230p   file_md5sum: 04c7e3deea98d0fc94a1f999f92483b3
scene_id: 570f8bf76cac1d94p   file_md5sum: 9f399810405f95d57f813ed3e658c695
scene_id: d1318f6812c769d9p   file_md5sum: 0070f84093af2d9bbf3f1a3596a45fbb
scene_id: 5b44fe761873df4dp   file_md5sum: 0708b2628cfd1761c905f2f216816e48
scene_id: 2afbfbcc8499a514p   file_md5sum: 5b7f852926291732959896e71fca1135
scene_id: 52f79458b164cd8dp   file_md5sum: 1f4959b8ab73a217d6ec530e823bc377
scene_id: f83a117333641141p   file_md5sum: 02aa82fcee2ce12ff4230789058c058f
scene_id: 9ff6ca8c41ef83d6p   file_md5sum: ea72351fd8ebae633fec2254fb196830
scene_id: 84bc065b1b9de04ep   file_md5sum: 48e17a75

In [13]:
!du -sh data_path


du: cannot access ‘data_path’: No such file or directory
