This Colab notebook explores the contents of an S3 bucket named deepdrug-dpeb (https://registry.opendata.aws/deepdrug-dpeb/)

Install and Import Required Libraries

In [None]:
!pip install boto3

import boto3
from botocore import UNSIGNED
from botocore.config import Config


Collecting boto3
  Downloading boto3-1.40.39-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.41.0,>=1.40.39 (from boto3)
  Downloading botocore-1.40.39-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3)
  Downloading s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.40.39-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.40.39-py3-none-any.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.14.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.5 MB/s[0m eta [36m0:0

Connect to S3 (No Credentials Needed)

In [None]:
# Name of the bucket explored throughout this notebook.
bucket_name = 'deepdrug-dpeb'

# Anonymous client: signature_version=UNSIGNED tells botocore not to sign
# requests, so no AWS credentials are needed.
s3 = boto3.client(
    's3',
    config=Config(signature_version=UNSIGNED),
)


List Top-Level Folders

In [None]:
# With Delimiter='/', S3 reports the first path segment of every key under
# 'CommonPrefixes' instead of listing each object individually.
response = s3.list_objects_v2(Bucket=bucket_name, Delimiter='/')
folders = [
    entry['Prefix']
    for entry in response.get('CommonPrefixes', [])  # key is absent when there are no prefixes
]

print("Top-level folders in DPEB:")
for folder in folders:
    print(folder)


Top-level folders in DPEB:
Alphafold-2/
BioEmbedding/
ESM-2/
ProtVec/


### deepdrug-dpeb S3 bucket structure





In [None]:
from collections import defaultdict

# Step 1: Build tree from S3 keys
def build_tree(paths):
    """Turn an iterable of '/'-separated paths into a nested mapping.

    Each path is stripped of leading/trailing '/' and split on '/'; every
    segment becomes a key whose value is the (possibly empty) mapping of its
    children. Nodes are auto-creating ``defaultdict`` instances, so a leaf is
    simply an empty mapping.

    Args:
        paths: Iterable of path strings (e.g. S3 object keys).

    Returns:
        The root ``defaultdict`` of the tree.
    """
    def new_node():
        # Every missing key materialises as another auto-creating node.
        return defaultdict(new_node)

    root = new_node()
    for path in paths:
        node = root
        for segment in path.strip("/").split("/"):
            # Indexing creates the child on first access, then descends into it.
            node = node[segment]
    return root

# Step 2: Print tree recursively
def print_tree(d, prefix=""):
    """Print a nested mapping as a box-drawing tree, one entry per line.

    Args:
        d: Mapping of name -> child mapping (as produced by ``build_tree``).
        prefix: Text prepended to every line at this depth; carries the
            vertical guide lines of the ancestors.
    """
    entries = list(d.items())
    last_index = len(entries) - 1
    for index, (name, children) in enumerate(entries):
        if index == last_index:
            # Last sibling: close the branch and stop drawing the guide below it.
            branch, indent = "└── ", "    "
        else:
            branch, indent = "├── ", "│   "
        print(prefix + branch + name)
        print_tree(children, prefix + indent)

# Step 3: Collect every object key in the bucket and render the tree.
# `all_keys` is built here so the cell works on a fresh kernel (Restart & Run All);
# the paginator follows continuation tokens so the listing is not cut off after
# a single response page.
paginator = s3.get_paginator('list_objects_v2')
all_keys = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name)
    for obj in page.get('Contents', [])  # 'Contents' is absent on an empty page
]

tree = build_tree(all_keys)
print("deepdrug-dpeb/")
print_tree(tree)


deepdrug-dpeb/
├── Alphafold-2
│   ├── All_ePPI_Alphafold2_Embeddings_np_v1.3.rar
│   └── eppi_alphafold_aggregated_embeddings.csv
├── BioEmbedding
│   ├── All_ePPI_Bio_Embeddings_np.rar
│   └── bio_embeddings_ePPI.csv
├── ESM-2
│   ├── ProteinID_proteinSEQ_ESM_emb.csv
│   └── esm2_dict_embeddings.rar
├── ProtVec
│   ├── protvec_aggregated_embeddings.csv
│   └── protvec_dict_embeddings.rar
└── dpeb_aggreagated_embeddings_all_in_one.csv


List Files in a Specific Folder (e.g., Alphafold-2/)

In [None]:
# Folder (key prefix) to inspect; swap in ESM-2/, ProtVec/, etc. as needed.
prefix = 'Alphafold-2/'

# Restrict the listing to keys that start with the chosen prefix.
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

print(f"\nFiles under {prefix}:")
for obj in response.get('Contents', []):  # 'Contents' is missing when nothing matches
    print(obj['Key'])



Files under Alphafold-2/:
Alphafold-2/
Alphafold-2/All_ePPI_Alphafold2_Embeddings_np_v1.3.rar
Alphafold-2/eppi_alphafold_aggregated_embeddings.csv


In [None]:
import boto3
from botocore.config import Config
from botocore import UNSIGNED

# Initialize anonymous S3 client
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = 'deepdrug-dpeb'


def format_size(size_bytes):
    """Return a human-readable size string for a byte count.

    Sizes of at least 1024**3 bytes are reported in GB (rounded to 4 decimals),
    everything smaller in MB (rounded to 2 decimals). The units are binary
    multiples (1024-based) even though they are labelled GB/MB.
    """
    size_gb = size_bytes / (1024 ** 3)
    if size_gb >= 1:
        return f"{round(size_gb, 4)} GB"
    size_mb = size_bytes / (1024 ** 2)
    return f"{round(size_mb, 2)} MB"


# Use a paginator so every object is listed, not just the first response page.
# No RequestPayer argument is passed: the requests are unsigned (anonymous),
# matching the other listing cells in this notebook.
paginator = s3.get_paginator('list_objects_v2')

# Print file path and size in GB or MB
for page in paginator.paginate(Bucket=bucket_name):
    for obj in page.get('Contents', []):
        print(f"{obj['Key']} — {format_size(obj['Size'])}")


Alphafold-2/ — 0.0 MB
Alphafold-2/All_ePPI_Alphafold2_Embeddings_np_v1.3.rar — 14.857 GB
Alphafold-2/eppi_alphafold_aggregated_embeddings.csv — 171.96 MB
BioEmbedding/ — 0.0 MB
BioEmbedding/All_ePPI_Bio_Embeddings_np.rar — 45.6364 GB
BioEmbedding/bio_embeddings_ePPI.csv — 501.52 MB
ESM-2/ — 0.0 MB
ESM-2/ProteinID_proteinSEQ_ESM_emb.csv — 622.86 MB
ESM-2/esm2_dict_embeddings.rar — 49.3481 GB
ProtVec/ — 0.0 MB
ProtVec/protvec_aggregated_embeddings.csv — 90.46 MB
ProtVec/protvec_dict_embeddings.rar — 3.8175 GB
dpeb_aggreagated_embeddings_all_in_one.csv — 1.2749 GB


Download a File from the Bucket

In [None]:
import boto3
from botocore.config import Config
from botocore import UNSIGNED

# Initialize anonymous S3 client
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = 'deepdrug-dpeb'

# Object to fetch and the local path it is written to (relative to the working directory).
file_key = 'Alphafold-2/eppi_alphafold_aggregated_embeddings.csv'
local_filename = 'eppi_alphafold_aggregated_embeddings.csv'

# Download the file. No RequestPayer extra argument is passed: the request is
# unsigned (anonymous), consistent with the listing cells in this notebook.
s3.download_file(bucket_name, file_key, local_filename)
print(f"Downloaded: {local_filename}")


Downloaded: eppi_alphafold_aggregated_embeddings.csv
