# Download, Upload, and Unzip Large Datasets to Google Cloud Storage

This notebook demonstrates how to:

1. Download large .zip files from URLs and upload them directly to Google Cloud Storage (GCS) without filling up local disk.
2. Unzip the files inside a Google Cloud VM (or Vertex AI Workbench) and upload the extracted files back to GCS.

## Steps

- **Step 1:** Download and upload .zip files to GCS.
- **Step 2:** Unzip files inside a GCP VM and upload extracted files back to GCS.
- **Step 3:** (Optional) Automate the unzip and upload process in Python.

> **Note:** Do not unzip in Colab if your dataset is very large. Use a VM or Vertex AI Workbench for unzipping and uploading extracted files.

In [30]:
import os
import requests
from google.oauth2 import service_account
from google.cloud import storage

key_json_path = "../service-account.json"

credentials = service_account.Credentials.from_service_account_file(key_json_path)

storage_client = storage.Client(credentials=credentials)

In [None]:
import os
import requests
from google.cloud import storage

# Initialize GCS client
bucket_name = "test-video-retrieval"
bucket = storage_client.bucket(bucket_name)

NameError: name 'storage_client' is not defined

In [None]:
import requests

# List of files and URLs
files = {
    "Keyframes_L21.zip": "https://aic-data.ledo.io.vn/Keyframes_L21.zip",
    "Keyframes_L22.zip": "https://aic-data.ledo.io.vn/Keyframes_L22.zip",
    "Keyframes_L23.zip": "https://aic-data.ledo.io.vn/Keyframes_L23.zip",
    "Keyframes_L24.zip": "https://aic-data.ledo.io.vn/Keyframes_L24.zip",
    "Keyframes_L25.zip": "https://aic-data.ledo.io.vn/Keyframes_L25.zip",
    "Keyframes_L26_a.zip": "https://aic-data.ledo.io.vn/Keyframes_L26_a.zip",
    "Keyframes_L26_b.zip": "https://aic-data.ledo.io.vn/Keyframes_L26_b.zip",
    "Keyframes_L26_c.zip": "https://aic-data.ledo.io.vn/Keyframes_L26_c.zip",
    "Keyframes_L26_d.zip": "https://aic-data.ledo.io.vn/Keyframes_L26_d.zip",
    "Keyframes_L26_e.zip": "https://aic-data.ledo.io.vn/Keyframes_L26_e.zip",
    "Keyframes_L27.zip": "https://aic-data.ledo.io.vn/Keyframes_L27.zip",
    "Keyframes_L28.zip": "https://aic-data.ledo.io.vn/Keyframes_L28.zip",
    "Keyframes_L29.zip": "https://aic-data.ledo.io.vn/Keyframes_L29.zip",
    "Keyframes_L30.zip": "https://aic-data.ledo.io.vn/Keyframes_L30.zip",
    "Videos_L21_a.zip": "https://aic-data.ledo.io.vn/Videos_L21_a.zip",
    "Videos_L22_a.zip": "https://aic-data.ledo.io.vn/Videos_L22_a.zip",
    "Videos_L23_a.zip": "https://aic-data.ledo.io.vn/Videos_L23_a.zip",
    "Videos_L24_a.zip": "https://aic-data.ledo.io.vn/Videos_L24_a.zip",
    "Videos_L25_a.zip": "https://aic-data.ledo.io.vn/Videos_L25_a.zip",
    "Videos_L25_a1.zip": "https://aic-data.ledo.io.vn/Videos_L25_a1.zip",
    "Videos_L25_b.zip": "https://aic-data.ledo.io.vn/Videos_L25_b.zip",
    "Videos_L26_a.zip": "https://aic-data.ledo.io.vn/Videos_L26_a.zip",
    "Videos_L26_b.zip": "https://aic-data.ledo.io.vn/Videos_L26_b.zip",
    "Videos_L26_c.zip": "https://aic-data.ledo.io.vn/Videos_L26_c.zip",
    "Videos_L26_d.zip": "https://aic-data.ledo.io.vn/Videos_L26_d.zip",
    "Videos_L26_e.zip": "https://aic-data.ledo.io.vn/Videos_L26_e.zip",
    "Videos_L27_a.zip": "https://aic-data.ledo.io.vn/Videos_L27_a.zip",
    "Videos_L28_a.zip": "https://aic-data.ledo.io.vn/Videos_L28_a.zip",
    "Videos_L29_a.zip": "https://aic-data.ledo.io.vn/Videos_L29_a.zip",
    "Videos_L30_a.zip": "https://aic-data.ledo.io.vn/Videos_L30_a.zip",
    "clip-features-32-aic25-b1.zip": "https://aic-data.ledo.io.vn/clip-features-32-aic25-b1.zip",
    "map-keyframes-aic25-b1.zip": "https://aic-data.ledo.io.vn/map-keyframes-aic25-b1.zip",
    "media-info-aic25-b1.zip": "https://aic-data.ledo.io.vn/media-info-aic25-b1.zip",
    "objects-aic25-b1.zip": "https://aic-data.ledo.io.vn/objects-aic25-b1.zip"
}

def download_and_upload(filename, url):
    local_path = f"{filename}"
    # Stream download to avoid memory issues
    with requests.get(url, stream=True, timeout=None) as r:
        r.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024*1024):  # 1 MB
                if chunk:
                    f.write(chunk)
    print(f"Downloaded {filename} to {local_path}")

    # Upload to GCS
    blob = bucket.blob(f"dataset/raw_zips/{filename}")
    blob.chunk_size = 5 * 1024 * 1024
    blob.upload_from_filename(local_path)
    print(f"Uploaded {filename} to gs://{bucket_name}/raw_zips/")
    os.remove(local_path)

In [5]:
total_size_bytes = 0

for name, url in files.items():
    response = requests.head(url)
    size = int(response.headers.get('Content-Length', 0))
    total_size_bytes += size
    print(f"{name}: {size / (1024**3):.2f} GB")  # size in GB

print(f"Total storage: {total_size_bytes / (1024**3):.2f} GB")

Keyframes_L21.zip: 1.35 GB
Keyframes_L22.zip: 1.60 GB
Keyframes_L23.zip: 0.47 GB
Keyframes_L24.zip: 1.61 GB
Keyframes_L25.zip: 5.67 GB
Keyframes_L26_a.zip: 2.14 GB
Keyframes_L26_b.zip: 2.25 GB
Keyframes_L26_c.zip: 2.31 GB
Keyframes_L26_d.zip: 2.30 GB
Keyframes_L26_e.zip: 2.30 GB
Keyframes_L27.zip: 1.02 GB
Keyframes_L28.zip: 2.02 GB
Keyframes_L29.zip: 2.34 GB
Keyframes_L30.zip: 1.31 GB
Videos_L21_a.zip: 3.15 GB
Videos_L22_a.zip: 3.87 GB
Videos_L23_a.zip: 1.90 GB
Videos_L24_a.zip: 5.40 GB
Videos_L25_a.zip: 11.97 GB
Videos_L25_a1.zip: 6.72 GB
Videos_L25_b.zip: 5.13 GB
Videos_L26_a.zip: 6.13 GB
Videos_L26_b.zip: 6.37 GB
Videos_L26_c.zip: 6.43 GB
Videos_L26_d.zip: 6.31 GB
Videos_L26_e.zip: 6.46 GB
Videos_L27_a.zip: 2.37 GB
Videos_L28_a.zip: 6.77 GB
Videos_L29_a.zip: 6.30 GB
Videos_L30_a.zip: 3.85 GB
clip-features-32-aic25-b1.zip: 0.16 GB
map-keyframes-aic25-b1.zip: 0.00 GB
media-info-aic25-b1.zip: 0.00 GB
objects-aic25-b1.zip: 0.60 GB
Total storage: 118.58 GB


In [35]:
for name, url in files.items():
    download_and_upload(name, url)

ChunkedEncodingError: ('Connection broken: IncompleteRead(5194799680 bytes read, 897715842 more expected)', IncompleteRead(5194799680 bytes read, 897715842 more expected))

## Step 2: Unzip and Upload Extracted Files Back to GCS on a GCP VM

After uploading the .zip files to GCS, use a Compute Engine VM (or Vertex AI Workbench) to unzip and upload the extracted files:

1. **Install required tools:**
   ```sh
   sudo apt-get update && sudo apt-get install unzip -y
   ```
2. **Copy a zip file from GCS:**
   ```sh
   gsutil cp gs://your-bucket/raw_zips/Keyframes_L21.zip .
   ```
3. **Unzip locally:**
   ```sh
   unzip Keyframes_L21.zip -d Keyframes_L21/
   ```
4. **Upload extracted files back to GCS:**
   ```sh
   gsutil -m cp -r Keyframes_L21/ gs://your-bucket/unzipped/
   ```

Repeat for each zip file as needed.

In [None]:
import zipfile

# This cell should be run on a GCP VM or Vertex AI Workbench with access to the bucket.
# It will download each zip from GCS, unzip it, and upload the extracted files back to GCS.

# Initialize GCS client
bucket_name = "test-video-retrieval"
bucket = storage_client.bucket(bucket_name)

# List all blobs in the raw_zips folder
blobs = bucket.list_blobs(prefix="dataset/raw_zips/")

for blob in blobs:
    filename = os.path.basename(blob.name)
    local_zip = f"/{filename}"
    print(f"Processing {filename}")

    # Download zip from GCS
    blob.download_to_filename(local_zip)
    print(f"Downloaded {filename}")

    # Unzip
    extract_dir = f"/{filename}"
    with zipfile.ZipFile(local_zip, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Unzipped {filename}")

    # Upload extracted files back to GCS
    for root, dirs, files in os.walk(extract_dir):
        for f in files:
            local_path = os.path.join(root, f)
            rel_path = os.path.relpath(local_path, extract_dir)
            new_blob = bucket.blob(f"unzipped/{filename}/{rel_path}")
            new_blob.upload_from_filename(local_path)
    print(f"Uploaded unzipped {filename} to GCS")

    os.remove(local_zip)

## Tips and Troubleshooting

- For very large files, consider increasing the VM disk size or using a VM with SSD.
- Use `gsutil -m` for parallel uploads to speed up transfers.
- If you have many files, you can batch process them by modifying the Python code to process a subset at a time.
- If you run into permission errors, make sure your VM service account has Storage Object Admin permissions.
- Clean up `/tmp` after processing to avoid running out of disk space.

---

You can now efficiently move and extract large datasets between public URLs and Google Cloud Storage!