<a href="https://colab.research.google.com/github/deftorch/alexnet-ifood2019/blob/main/notebooks/00_download_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# üì• Download Dataset iFood 2019 - Otomatis

Notebook ini akan **otomatis mendownload dan mengekstrak** dataset iFood 2019 ke Google Drive.

### Dataset Info:
| File | Size | Isi |
|------|------|-----|
| Annotations | 3 MB | Labels & class list |
| Train Images | 2.3 GB | 120,216 gambar |
| Val Images | 231 MB | 12,170 gambar |
| Test Images | 548 MB | 28,399 gambar |

**Total: ~3.1 GB**

---

In [7]:
# ============================================================
# STEP 1: Mount Google Drive
# ============================================================

from google.colab import drive
drive.mount('/content/drive')

import os

# Buat folder struktur
PROJECT_PATH = '/content/drive/MyDrive/AlexNet_iFood2019'
DATASET_PATH = os.path.join(PROJECT_PATH, 'dataset')

os.makedirs(DATASET_PATH, exist_ok=True)
os.makedirs(os.path.join(PROJECT_PATH, 'checkpoints'), exist_ok=True)
os.makedirs(os.path.join(PROJECT_PATH, 'evaluation_results'), exist_ok=True)
os.makedirs(os.path.join(PROJECT_PATH, 'analysis_results'), exist_ok=True)

print(f"‚úÖ Google Drive mounted")
print(f"üìÅ Dataset akan disimpan di: {DATASET_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Google Drive mounted
üìÅ Dataset akan disimpan di: /content/drive/MyDrive/AlexNet_iFood2019/dataset


In [None]:
import os
import urllib.request
import tarfile
import time
import hashlib
from tqdm import tqdm

# Dataset URLs dari iFood 2019 Official
DATASET_URLS = {
    'annotations': {
        'url': 'https://food-x.s3.amazonaws.com/annot.tar',
        'filename': 'annot.tar',
        'size': '3 MB',
        'md5': '0c632c543ceed0e70f0eb2db58eda3ab'
    },
    'train': {
        'url': 'https://food-x.s3.amazonaws.com/train.tar',
        'filename': 'train.tar',
        'size': '2.3 GB',
        'md5': '8e56440e365ee852dcb0953a9307e27f'
    },
    'val': {
        'url': 'https://food-x.s3.amazonaws.com/val.tar',
        'filename': 'val.tar',
        'size': '231 MB',
        'md5': 'fa9a4c1eb929835a0fe68734f4868d3b'
    },
    'test': {
        'url': 'https://food-x.s3.amazonaws.com/test.tar',
        'filename': 'test.tar',
        'size': '548 MB',
        'md5': '32479146dd081d38895e46bb93fed58f'
    }
}

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)

def calculate_md5(filepath, chunk_size=8192):
    """Menghitung MD5 checksum dari file."""
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()

def download_file_with_retries(url, output_path, desc, expected_md5, max_retries=5, initial_delay=5):
    """Download file dengan progress bar, retries, dan verifikasi MD5."""
    for attempt in range(max_retries):
        try:
            print(f"  Attempt {attempt + 1}/{max_retries} for {desc}...")
            with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
                urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

            # Verifikasi MD5
            actual_md5 = calculate_md5(output_path)
            if actual_md5 == expected_md5:
                return True # Sukses download dan verifikasi
            else:
                print(f"  ‚ùå MD5 checksum mismatch for {desc}. Expected {expected_md5}, got {actual_md5}.")
                os.remove(output_path) # Hapus file yang rusak
                raise Exception("MD5 mismatch")

        except (urllib.error.URLError, Exception) as e:
            print(f"  ‚ö†Ô∏è  Error downloading {desc} (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                delay = initial_delay * (2 ** attempt) # Exponential backoff
                print(f"  Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"  ‚ùå Failed to download {desc} after {max_retries} attempts.")
                return False # Gagal setelah semua percobaan
    return False

# Download semua file
print("üì• Mulai download dataset iFood 2019...")
print("="*60)

download_dir = '/content/downloads'
os.makedirs(download_dir, exist_ok=True)

all_downloads_successful = True

for name, info in DATASET_URLS.items():
    output_path = os.path.join(download_dir, info['filename'])

    # Skip jika sudah ada dan MD5 cocok
    if os.path.exists(output_path):
        print(f"‚è≠Ô∏è  {name}: sudah ada. Memverifikasi MD5...")
        try:
            actual_md5 = calculate_md5(output_path)
            if actual_md5 == info['md5']:
                print(f"‚úÖ MD5 cocok. {name} sudah siap.")
                continue
            else:
                print(f"‚ö†Ô∏è  MD5 mismatch for existing {name}. Redownloading... (Expected {info['md5']}, got {actual_md5})")
                os.remove(output_path) # Hapus file yang rusak
        except Exception as e:
            print(f"‚ö†Ô∏è  Error checking existing file {name}: {e}. Redownloading...")
            if os.path.exists(output_path): os.remove(output_path)

    print(f"\nüì• Downloading {name} ({info['size']})...")
    start_time = time.time()

    success = download_file_with_retries(
        info['url'], output_path, info['filename'], info['md5']
    )

    if success:
        elapsed = time.time() - start_time
        print(f"‚úÖ {name} selesai dalam {elapsed:.1f} detik")
    else:
        all_downloads_successful = False
        print(f"‚ùå Gagal mendownload {name}. Proses mungkin tidak lengkap.")

print("\n" + "="*60)
if all_downloads_successful:
    print("‚úÖ Semua file berhasil didownload dan diverifikasi!")
else:
    print("‚ùå Ada beberapa file yang gagal didownload atau diverifikasi. Mohon periksa log di atas.")
    print("Disarankan untuk mengulang kembali proses download.")



In [None]:
# ============================================================
# STEP 3: Extract ke Google Drive (FIXED - handle existing files)
# ============================================================

import tarfile
import shutil
import os
from tqdm import tqdm

download_dir = '/content/downloads'
DATASET_PATH = '/content/drive/MyDrive/AlexNet_iFood2019/dataset'

def safe_extract(tar_path, extract_to, filter_arg='data'):
    """Extract tar file dengan handling untuk Python 3.12+"""
    with tarfile.open(tar_path, 'r') as tar:
        # Gunakan filter untuk Python 3.12+ (menghilangkan warning)
        try:
            tar.extractall(extract_to, filter=filter_arg)
        except TypeError:
            # Fallback untuk Python versi lama
            tar.extractall(extract_to)

def move_files_safe(src_dir, dest_dir):
    """Move files dengan skip jika sudah ada dan laporkan status."""
    if not os.path.exists(src_dir):
        return 0, 0, 0 # moved, skipped, failed

    files = os.listdir(src_dir)
    moved = 0
    skipped = 0
    failed_moves = 0

    for f in tqdm(files, desc=f"Moving files to {os.path.basename(dest_dir)}"):
        src_path = os.path.join(src_dir, f)
        dest_path = os.path.join(dest_dir, f)

        if os.path.exists(dest_path):
            skipped += 1
            continue

        try:
            shutil.move(src_path, dest_path)
            moved += 1
        except Exception as e:
            print(f"Warning: Could not move {f} to {dest_dir}: {e}")
            failed_moves += 1

    if skipped > 0:
        print(f"   ‚ÑπÔ∏è  Skipped {skipped} existing files.")
    if failed_moves > 0:
        print(f"   ‚ùå Failed to move {failed_moves} files.")

    return moved, skipped, failed_moves

print("üì¶ Mulai ekstraksi ke Google Drive...")
print("="*60)

# 1. Extract annotations
print("\nüì¶ Extracting annotations...")
annot_tar = os.path.join(download_dir, 'annot.tar')
if os.path.exists(annot_tar):
    safe_extract(annot_tar, DATASET_PATH)
    print("‚úÖ Annotations extracted")
else:
    print("‚è≠Ô∏è  Annotations tar not found, skipping")

# 2. Extract train images
print("\nüì¶ Extracting train images (ini akan memakan waktu ~10-15 menit)...")
train_tar = os.path.join(download_dir, 'train.tar')
train_dir = os.path.join(DATASET_PATH, 'train_images')
temp_train = '/content/temp_train'

if os.path.exists(train_tar):
    os.makedirs(train_dir, exist_ok=True)

    # Clean temp folder if exists
    if os.path.exists(temp_train):
        shutil.rmtree(temp_train)

    print("   Extracting to temp...")
    safe_extract(train_tar, temp_train)

    # Find the extracted folder
    src_dir = os.path.join(temp_train, 'train_set')
    if not os.path.exists(src_dir):
        # Try alternative path
        subdirs = os.listdir(temp_train)
        if subdirs:
            src_dir = os.path.join(temp_train, subdirs[0])

    if os.path.exists(src_dir):
        print("   Moving to Drive...")
        moved, skipped, failed = move_files_safe(src_dir, train_dir)
        shutil.rmtree(temp_train, ignore_errors=True)
        current_file_count = len(os.listdir(train_dir)) if os.path.exists(train_dir) else 0
        print(f"‚úÖ Train images: {moved} files moved, {skipped} skipped, {failed} failed. Total in target: {current_file_count} files.")
    else:
        print("‚ùå Could not find train_set folder in temp extraction.")
else:
    current_file_count = len(os.listdir(train_dir)) if os.path.exists(train_dir) else 0
    print(f"‚è≠Ô∏è  Train tar not found. Checking existing: {current_file_count} files.")

# 3. Extract val images
print("\nüì¶ Extracting validation images...")
val_tar = os.path.join(download_dir, 'val.tar')
val_dir = os.path.join(DATASET_PATH, 'val_images')
temp_val = '/content/temp_val'

if os.path.exists(val_tar):
    os.makedirs(val_dir, exist_ok=True)

    if os.path.exists(temp_val):
        shutil.rmtree(temp_val)

    print("   Extracting to temp...")
    safe_extract(val_tar, temp_val)

    src_dir = os.path.join(temp_val, 'val_set')
    if not os.path.exists(src_dir):
        subdirs = os.listdir(temp_val)
        if subdirs:
            src_dir = os.path.join(temp_val, subdirs[0])

    if os.path.exists(src_dir):
        print("   Moving to Drive...")
        moved, skipped, failed = move_files_safe(src_dir, val_dir)
        shutil.rmtree(temp_val, ignore_errors=True)
        current_file_count = len(os.listdir(val_dir)) if os.path.exists(val_dir) else 0
        print(f"‚úÖ Val images: {moved} files moved, {skipped} skipped, {failed} failed. Total in target: {current_file_count} files.")
    else:
        print("‚ùå Could not find val_set folder in temp extraction.")
else:
    current_file_count = len(os.listdir(val_dir)) if os.path.exists(val_dir) else 0
    print(f"‚è≠Ô∏è  Val tar not found. Checking existing: {current_file_count} files.")

# 4. Extract test images
print("\nüì¶ Extracting test images...")
test_tar = os.path.join(download_dir, 'test.tar')
test_dir = os.path.join(DATASET_PATH, 'test_images')
temp_test = '/content/temp_test'

if os.path.exists(test_tar):
    os.makedirs(test_dir, exist_ok=True)

    if os.path.exists(temp_test):
        shutil.rmtree(temp_test)

    print("   Extracting to temp...")
    safe_extract(test_tar, temp_test)

    src_dir = os.path.join(temp_test, 'test_set')
    if not os.path.exists(src_dir):
        subdirs = os.listdir(temp_test)
        if subdirs:
            src_dir = os.path.join(temp_test, subdirs[0])

    if os.path.exists(src_dir):
        print("   Moving to Drive...")
        moved, skipped, failed = move_files_safe(src_dir, test_dir)
        shutil.rmtree(temp_test, ignore_errors=True)
        current_file_count = len(os.listdir(test_dir)) if os.path.exists(test_dir) else 0
        print(f"‚úÖ Test images: {moved} files moved, {skipped} skipped, {failed} failed. Total in target: {current_file_count} files.")
    else:
        print("‚ùå Could not find test_set folder in temp extraction.")
else:
    current_file_count = len(os.listdir(test_dir)) if os.path.exists(test_dir) else 0
    print(f"‚è≠Ô∏è  Test tar not found. Checking existing: {current_file_count} files.")

print("\n" + "="*60)
print("‚úÖ Ekstraksi selesai!")


In [3]:
import os
import pandas as pd

DATASET_PATH = '/content/drive/MyDrive/AlexNet_iFood2019/dataset'

print("üîç Verifikasi dataset...")
print("="*60)

# Check files
required_items = {
    'class_list.txt': 'file',
    'train_info.csv': 'file',
    'val_info.csv': 'file',
    'test_info.csv': 'file',
    'train_images': 'dir',
    'val_images': 'dir',
    'test_images': 'dir'
}

all_ok = True
for item, item_type in required_items.items():
    path = os.path.join(DATASET_PATH, item)

    if item_type == 'file':
        exists = os.path.isfile(path)
    else:
        exists = os.path.isdir(path)

    if exists:
        if item_type == 'dir':
            count = len(os.listdir(path))
            print(f"‚úÖ {item}: {count:,} files")
        else:
            print(f"‚úÖ {item}")
    else:
        print(f"‚ùå {item}: TIDAK DITEMUKAN")
        all_ok = False

print("\n" + "="*60)

# --- New: Verify image files based on info CSVs ---
print("üîç Verifikasi file gambar berdasarkan info CSV...")

def verify_image_paths(info_csv_path, images_dir_path, dataset_name):
    global all_ok
    try:
        df = pd.read_csv(info_csv_path)
        missing_count = 0
        print(f"  Mengecek {len(df)} gambar di {dataset_name}...")
        for index, row in df.iterrows():
            image_filename = row['image_id'] + '.jpg'
            image_path = os.path.join(images_dir_path, image_filename)
            if not os.path.isfile(image_path):
                print(f"    ‚ùå File {image_filename} tidak ditemukan di {dataset_name}")
                missing_count += 1
                all_ok = False
        if missing_count == 0:
            print(f"  ‚úÖ Semua {len(df)} gambar di {dataset_name} ditemukan.")
        else:
            print(f"  ‚ö†Ô∏è  Total {missing_count} gambar hilang di {dataset_name}.")
    except FileNotFoundError:
        print(f"  ‚ùå Info CSV untuk {dataset_name} tidak ditemukan: {info_csv_path}")
        all_ok = False
    except Exception as e:
        print(f"  ‚ùå Error saat memverifikasi {dataset_name}: {e}")
        all_ok = False

if os.path.isfile(os.path.join(DATASET_PATH, 'train_info.csv')) and os.path.isdir(os.path.join(DATASET_PATH, 'train_images')):
    verify_image_paths(os.path.join(DATASET_PATH, 'train_info.csv'), os.path.join(DATASET_PATH, 'train_images'), 'train_images')

if os.path.isfile(os.path.join(DATASET_PATH, 'val_info.csv')) and os.path.isdir(os.path.join(DATASET_PATH, 'val_images')):
    verify_image_paths(os.path.join(DATASET_PATH, 'val_info.csv'), os.path.join(DATASET_PATH, 'val_images'), 'val_images')

if os.path.isfile(os.path.join(DATASET_PATH, 'test_info.csv')) and os.path.isdir(os.path.join(DATASET_PATH, 'test_images')):
    verify_image_paths(os.path.join(DATASET_PATH, 'test_info.csv'), os.path.join(DATASET_PATH, 'test_images'), 'test_images')

print("\n" + "="*60)
if all_ok:
    print("üéâ DATASET SIAP DIGUNAKAN!")
    print(f"\nLokasi: {DATASET_PATH}")
alternate_suggestion = ""
if not os.path.exists(os.path.join(DATASET_PATH, 'train_images')) or len(os.listdir(os.path.join(DATASET_PATH, 'train_images'))) < 100000: # Heuristic for 'incomplete' train_images
    alternate_suggestion = "\n  * Pastikan folder `train_images` terisi penuh. Terkadang proses ekstraksi dapat terganggu.\n  * Periksa kembali ukuran `train.tar` di Google Drive Anda jika proses download tidak berhasil."

if not all_ok:
    print("‚ö†Ô∏è  Ada file yang hilang atau bermasalah.\nSilakan coba langkah-langkah berikut:")
    print("  * Jalankan ulang dari Step 2: Download Dataset.")
    print("  * Jika masalah berlanjut, periksa koneksi internet Anda atau coba di waktu lain.")
    print("  * Periksa log ekstraksi di Step 3 untuk memastikan tidak ada error.")
    print(alternate_suggestion)


üîç Verifikasi dataset...
‚úÖ class_list.txt
‚úÖ train_info.csv
‚úÖ val_info.csv
‚úÖ test_info.csv


OSError: [Errno 5] Input/output error: '/content/drive/MyDrive/AlexNet_iFood2019/dataset/train_images'

In [None]:
# ============================================================
# STEP 5: Cleanup (Hapus file tar untuk hemat storage)
# ============================================================

import shutil
import os

download_dir = '/content/downloads'

if os.path.exists(download_dir):
    size_before = sum(os.path.getsize(os.path.join(download_dir, f))
                      for f in os.listdir(download_dir)
                      if os.path.isfile(os.path.join(download_dir, f)))

    shutil.rmtree(download_dir)
    print(f"üóëÔ∏è  Deleted download cache: {size_before / 1e9:.2f} GB freed")
else:
    print("‚úÖ No cache to clean")

# Cleanup temp folders juga
for temp in ['/content/temp_train', '/content/temp_val', '/content/temp_test']:
    if os.path.exists(temp):
        shutil.rmtree(temp, ignore_errors=True)

print("\n" + "="*60)
print("üéâ SELESAI!")
print("="*60)
print("\nDataset sudah tersimpan di Google Drive.")
print("Anda bisa menutup notebook ini dan lanjut ke training.")
print("\nNotebook selanjutnya:")
print("  - train_member1_baseline.ipynb (Member 1)")
print("  - train_member2_mod1.ipynb (Member 2)")
print("  - train_member3_mod2.ipynb (Member 3)")
print("  - train_member4_combined.ipynb (Member 4)")