In [None]:
import time
import sys
from chembl_webresource_client.new_client import new_client
from datetime import datetime

def estimate_chembl_assays():
    """
    Run the three size-estimation strategies and print a combined summary.

    The ChEMBL assay resource is filtered to type 'A' assays whose organism
    matches 'Homo sapiens' (case-insensitive). The binary-search and
    progressive-sampling results feed the summary; the offset-sampling
    result is only reported by its own printed output.

    Returns:
        list: the non-None estimates, binary-search result first, then the
        progressive-sampling result.
    """
    divider = "-" * 40
    target_filter = {
        'assay_type': 'A',
        'assay_organism__iexact': 'Homo sapiens'
    }

    print("=== ChEMBL Assay Size Estimator ===")
    print(f"Started at: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print()

    assay = new_client.assay

    # Method 1: binary search for the last populated position
    print("Method 1: Binary Search Estimation")
    print(divider)
    binary_estimate = binary_search_size(assay, target_filter)
    print(f"Estimated Human Type A assays: ~{binary_estimate:,} records")
    print()

    # Method 2: probe a fixed list of offsets (result is not used in the summary)
    print("Method 2: Offset Sampling")
    print(divider)
    sample_at_offsets(assay, target_filter)
    print()

    # Method 3: walk forward until batches come back empty
    print("Method 3: Progressive Sampling")
    print(divider)
    walk_estimate = progressive_sampling(assay, target_filter)
    print()

    # Summary and recommendations
    print("=== SUMMARY ===")
    estimates = [
        value for value in (binary_estimate, walk_estimate)
        if value is not None
    ]

    if not estimates:
        print("Could not get reliable estimates")
        return estimates

    smallest = min(estimates)
    largest = max(estimates)
    mean_estimate = sum(estimates) / len(estimates)

    print(f"Size estimates range: {smallest:,} - {largest:,} records")
    print(f"Average estimate: {mean_estimate:,.0f} records")

    # File size and time estimates are derived from the mean
    estimate_download_specs(mean_estimate)

    return estimates

def binary_search_size(assay, filter_params, delay=0.2, initial_upper_bound=1000000):
    """
    Estimate the number of records matching a filter via binary search.

    A position p (1-based) is considered populated when the one-element
    slice [p-1:p] of the filtered query yields at least one record. The
    function first doubles an upper bound until it hits an unpopulated
    position, then binary-searches for the last populated position.

    Args:
        assay: query endpoint exposing filter(**kwargs).only(fields)[slice].
        filter_params: dict of keyword filters forwarded to assay.filter().
        delay: seconds to sleep after each successful probe.
        initial_upper_bound: first position probed as a candidate upper bound.

    Returns:
        int: highest position at which a record was found (0 if none).

    Notes:
        A probe that raises is reported and treated as "no record", so
        transient errors can bias the estimate downwards.
    """
    print("Using binary search to estimate total size...")

    def record_exists(position):
        # position is 1-based; the slice requests exactly that one record
        window = assay.filter(**filter_params).only(['assay_chembl_id'])[position - 1:position]
        return len(list(window)) > 0

    # Positions are 1-based. Starting at 1 (not 0) guarantees position 1 is
    # probed and avoids a negative slice start ([-1:0]) when mid would be 0.
    low = 1
    high = max(1, initial_upper_bound)
    last_valid = 0

    # First, find an upper bound
    print("Finding upper bound...")
    while True:
        try:
            if not record_exists(high):
                print(f"  Upper bound found: {high:,}")
                break

            print(f"  {high:,} records exist, trying higher...")
            # Everything up to `high` is known to exist: keep that knowledge
            # so the binary search does not re-scan the lower range.
            last_valid = high
            low = high + 1
            high *= 2

            time.sleep(delay)

        except Exception as e:
            print(f"  Error at {high:,}: {e}")
            break

    # Now binary search for exact size
    print("Binary searching for exact size...")

    while low <= high:
        mid = (low + high) // 2

        try:
            if record_exists(mid):
                last_valid = mid
                low = mid + 1
                print(f"  Record exists at {mid:,}, searching higher...")
            else:
                high = mid - 1
                print(f"  No record at {mid:,}, searching lower...")

            time.sleep(delay)

        except Exception as e:
            print(f"  Error at {mid:,}: {e}")
            high = mid - 1

    return last_valid

def sample_at_offsets(assay, filter_params):
    """
    Probe a fixed ladder of offsets and derive a rough size estimate.

    For each offset (0, 10k, 50k, 100k, 500k) a 100-record slice is
    requested. Probing stops at the first empty slice or the first error.

    Returns:
        int | None: the highest offset that returned records plus a
        50,000-record buffer, or None when no offset returned records.
    """
    print("Sampling at different offsets...")

    window = 100
    highest_hit = None

    for start in (0, 10000, 50000, 100000, 500000):
        try:
            print(f"  Testing offset {start:,}...")

            query = assay.filter(**filter_params).only(['assay_chembl_id'])
            rows = list(query[start:start + window])

            if not rows:
                print(f"    ❌ No records found")
                break

            # Offsets are visited in ascending order, so the latest hit is the max
            highest_hit = start
            print(f"    ✓ Got {len(rows)} records")

            time.sleep(0.5)

        except Exception as e:
            print(f"    Error: {e}")
            break

    if highest_hit is None:
        return None

    # The true end lies somewhere past the last hit; pad with a fixed buffer
    estimated = highest_hit + 50000
    print(f"  Estimate based on sampling: >{highest_hit:,}, probably ~{estimated:,}")
    return estimated

def progressive_sampling(assay, filter_params):
    """
    Walk forward through the result set until the data appears to end.

    Requests 1,000-record slices. After a full slice the offset jumps ten
    slices ahead; after a partial or empty slice (or an error) it advances
    by one slice. Stops after three consecutive empty/failed slices or once
    the offset passes 2,000,000.

    Returns:
        int: end position (offset + records received) of the last
        non-empty slice, or 0 if no slice returned records.
    """
    print("Progressive sampling to find end...")

    chunk = 1000
    position = 0
    furthest_seen = 0
    empty_streak = 0

    while empty_streak < 3:  # three misses in a row ends the walk
        try:
            print(f"  Sampling at offset {position:,}...")

            query = assay.filter(**filter_params).only(['assay_chembl_id'])
            rows = list(query[position:position + chunk])
            received = len(rows)

            if received == 0:
                empty_streak += 1
                print(f"    ❌ Empty batch ({empty_streak}/3)")
                position += chunk
            else:
                furthest_seen = position + received
                empty_streak = 0
                print(f"    ✓ Got {received} records")

                # A full slice means more data ahead, so take a big stride;
                # a short slice is treated as a sign that the end is near.
                position += chunk * 10 if received == chunk else chunk

            time.sleep(0.3)

        except Exception as e:
            print(f"    Error: {e}")
            empty_streak += 1
            position += chunk

        # Safety limit
        if position > 2000000:
            print("  Reached safety limit of 2M")
            break

    print(f"  Progressive estimate: ~{furthest_seen:,} records")
    return furthest_seen

def estimate_download_specs(record_count):
    """
    Print file-size, download-time and batch-count estimates.

    Args:
        record_count: expected number of records; may be an int or a float
            (for example an average of several estimates).

    Returns:
        None. All results are printed.
    """
    print(f"\n=== DOWNLOAD ESTIMATES FOR {record_count:,.0f} RECORDS ===")

    # File size estimates (bytes per record)
    bytes_per_record_low = 150   # Minimal record
    bytes_per_record_high = 600  # Detailed record

    size_mb_low = (record_count * bytes_per_record_low) / (1024 * 1024)
    size_mb_high = (record_count * bytes_per_record_high) / (1024 * 1024)

    print(f"Estimated file size: {size_mb_low:.1f} - {size_mb_high:.1f} MB")
    print(f"                    ({size_mb_low/1024:.2f} - {size_mb_high/1024:.2f} GB)")

    # Time estimates for different speeds (records per second)
    speeds = [
        (500, "Conservative"),
        (1000, "Typical"),
        (2000, "Optimistic")
    ]

    print(f"\nEstimated download times:")
    for speed, label in speeds:
        time_seconds = record_count / speed
        time_minutes = time_seconds / 60
        time_hours = time_minutes / 60

        if time_hours >= 1:
            print(f"  {label} ({speed} rec/sec): {time_hours:.1f} hours")
        elif time_minutes >= 1:
            print(f"  {label} ({speed} rec/sec): {time_minutes:.0f} minutes")
        else:
            # Sub-minute durations would otherwise be reported as "0 minutes"
            print(f"  {label} ({speed} rec/sec): {time_seconds:.0f} seconds")

    # Batch size recommendations
    print(f"\nRecommended batch sizes:")
    for batch_size in [1000, 5000, 10000]:
        # Negated floor division is a ceiling that is also correct for float
        # inputs; int() keeps the count from printing as e.g. "13.0".
        num_batches = int(-(-record_count // batch_size))
        print(f"  {batch_size:,} records/batch: {num_batches:,} API calls")

def quick_sample_check():
    """
    Fetch five records with the standard filter and print a short preview.

    Requests type 'A' assays whose organism matches 'Homo sapiens'
    (case-insensitive), restricted to a few fields, and prints each
    record's ChEMBL id and a description truncated to 80 characters.
    Any exception is reported with a message instead of being raised.

    Returns:
        None. All results are printed.
    """
    print("\n=== QUICK DATA SAMPLE ===")

    assay = new_client.assay

    try:
        print("Getting small sample to check data quality...")

        sample = assay.filter(
            assay_type='A',
            assay_organism__iexact='Homo sapiens'
        ).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[:5]

        sample_list = list(sample)

        print(f"Sample size: {len(sample_list)} records")

        if sample_list:
            print("\nSample records:")
            for i, record in enumerate(sample_list, 1):
                print(f"  {i}. {record.get('assay_chembl_id', 'N/A')}")
                # The key can be present with a null value; `or` covers both
                # a missing key and None, so len() below never sees None.
                desc = record.get('description') or 'No description'
                if len(desc) > 80:
                    desc = desc[:80] + "..."
                print(f"     {desc}")
                print()
        else:
            print("❌ No records found with your filter criteria!")

    except Exception as e:
        print(f"Error getting sample: {e}")

# Script entry point: preview the data, run the estimators, then print
# advice keyed on the average of the returned estimates.
if __name__ == "__main__":
    print("ChEMBL Assay Size Estimator")
    print("Since direct counting isn't available, we'll use estimation methods")
    print()

    # Preview a handful of records before the slower estimation run
    quick_sample_check()

    # Main estimation
    estimates = estimate_chembl_assays()

    print("\n=== NEXT STEPS ===")
    if estimates:
        avg_estimate = sum(estimates) / len(estimates)
        # Tiers are checked largest-first: >= 200k, >= 50k, everything below
        if avg_estimate >= 200000:
            print("🔥 Large dataset - plan accordingly")
            print("💡 Definitely use checkpointing and consider running overnight")
        elif avg_estimate >= 50000:
            print("⚠️  Medium-sized dataset - will take some time")
            print("💡 Suggest using streaming approach with checkpoints")
        else:
            print("✅ Dataset seems manageable - you can download it relatively quickly")
            print("💡 Suggest starting with your 100K test, then full dataset")

    print("\nRun your 100K test first to validate these estimates!")

In [None]:
import pandas as pd
import json
import os
import time
from datetime import datetime
from chembl_webresource_client.new_client import new_client

def conservative_chembl_query():
    """
    Download up to 10,000 human type-'A' assays to CSV with throttle handling.

    Records are fetched in slices and appended to the output CSV (any
    existing file is deleted first). If two consecutive slices return fewer
    than half the requested records while in normal mode, the function
    switches to a smaller batch size and a longer delay for the rest of the
    run. A failed request is retried after a pause; the run stops after
    several consecutive failures instead of retrying forever.

    Returns:
        int: number of records written to the CSV file.
    """
    # Ultra-conservative parameters
    initial_batch_size = 1000   # Batch size while not throttled
    throttled_batch_size = 20   # Batch size once throttling is detected
    max_records = 10000         # Upper limit on records to fetch
    rawfile = "../data/raw/chembl_assays_human_A_conservative_10k.csv"

    # Delays between requests, in seconds
    normal_delay = 0.1          # Pause between requests in normal mode
    throttled_delay = 1.0       # Pause between requests in throttled mode

    # Stop instead of looping forever when the service keeps failing
    max_consecutive_errors = 5

    assay = new_client.assay
    os.makedirs(os.path.dirname(rawfile), exist_ok=True)

    if os.path.exists(rawfile):
        os.remove(rawfile)

    total_records = 0
    offset = 0
    first_write = True
    start_time = time.time()

    # Track throttling and failures
    is_throttled = False
    consecutive_small_batches = 0
    consecutive_errors = 0

    print("=== ChEMBL Conservative Query (10K records) ===")
    print(f"Target: {max_records:,} records")
    print(f"Initial batch size: {initial_batch_size:,}")
    print(f"Normal delay: {normal_delay}s, Throttled delay: {throttled_delay}s")
    print(f"Output file: {rawfile}")
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    while offset < max_records:
        # Never request past the target, in either mode
        remaining = max_records - offset
        if is_throttled:
            current_batch_size = min(throttled_batch_size, remaining)
            delay = throttled_delay
        else:
            current_batch_size = min(initial_batch_size, remaining)
            delay = normal_delay

        print(f"Fetching batch {offset:,} to {offset + current_batch_size:,}...")
        print(f"  Mode: {'THROTTLED' if is_throttled else 'NORMAL'}")

        batch_start_time = time.time()

        try:
            batch = assay.filter(
                assay_type='A',
                assay_organism__iexact='Homo sapiens'
            ).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[offset:offset + current_batch_size]

            batch_list = list(batch)
            batch_size_actual = len(batch_list)

            if batch_size_actual == 0:
                print("  No more records - stopping")
                break

            # Detect throttling
            if not is_throttled and batch_size_actual < current_batch_size * 0.5:
                # Got significantly fewer records than requested
                consecutive_small_batches += 1
                print(f"  ⚠️  Small batch detected ({batch_size_actual}/{current_batch_size}) - count: {consecutive_small_batches}")

                if consecutive_small_batches >= 2:
                    print("  🚨 THROTTLING DETECTED - switching to conservative mode")
                    is_throttled = True
                    consecutive_small_batches = 0
            else:
                consecutive_small_batches = 0

            # Write data (header only on the first batch)
            df_batch = pd.DataFrame(batch_list)
            df_batch.to_csv(rawfile, mode='a', header=first_write, index=False)
            first_write = False

            total_records += batch_size_actual
            batch_time = time.time() - batch_start_time

            # Progress info
            elapsed_time = time.time() - start_time
            records_per_second = total_records / elapsed_time if elapsed_time > 0 else 0
            progress_percent = (total_records / max_records) * 100

            print(f"  ✓ Got {batch_size_actual:,} records in {batch_time:.1f}s")
            print(f"  Progress: {total_records:,}/{max_records:,} ({progress_percent:.1f}%)")
            print(f"  Overall rate: {records_per_second:.0f} records/sec")

            # ETA calculation
            if records_per_second > 0:
                remaining_records = max_records - total_records
                eta_seconds = remaining_records / records_per_second
                eta_minutes = eta_seconds / 60
                print(f"  ETA: ~{eta_minutes:.1f} minutes")

            print()

        except Exception as e:
            consecutive_errors += 1
            print(f"  ❌ Error: {e}")
            if consecutive_errors >= max_consecutive_errors:
                print(f"  Giving up after {consecutive_errors} consecutive errors")
                break
            print(f"  Retrying in {delay * 2} seconds...")
            time.sleep(delay * 2)
            continue

        consecutive_errors = 0
        offset += batch_size_actual

        # Respectful delay
        print(f"  Waiting {delay}s...")
        time.sleep(delay)

    # Final summary
    end_time = time.time()
    total_time = end_time - start_time

    print("=== COMPLETED ===")
    print(f"Records retrieved: {total_records:,}")
    print(f"Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
    if total_time > 0:
        print(f"Average rate: {total_records/total_time:.0f} records/second")
    print(f"Throttling detected: {'Yes' if is_throttled else 'No'}")
    print(f"File: {rawfile}")

    # Quick analysis
    if total_records > 0:
        analyze_data_quick(rawfile, total_records)

    return total_records

def analyze_data_quick(filepath, record_count, elapsed_seconds=None):
    """
    Print a quick size analysis of a downloaded CSV and extrapolate.

    Args:
        filepath: path of the CSV file to inspect.
        record_count: number of records the file contains.
        elapsed_seconds: how long the download took, in seconds. When
            omitted, a module-level ``start_time`` (a time.time() stamp) is
            used if one exists; otherwise the time extrapolation is skipped.

    Returns:
        None. All results are printed; any exception is reported with an
        "Analysis error" message instead of being raised.
    """
    print(f"\n=== QUICK ANALYSIS ===")

    try:
        df = pd.read_csv(filepath)
        file_size_mb = os.path.getsize(filepath) / 1024 / 1024

        if record_count <= 0 or file_size_mb <= 0:
            # Avoid dividing by zero in the ratios below
            print("Nothing to analyse (no records or empty file)")
            return

        if elapsed_seconds is None:
            # Backward compatibility: earlier callers relied on a global
            # `start_time` being set before the download started.
            legacy_start = globals().get("start_time")
            if legacy_start is not None:
                elapsed_seconds = time.time() - legacy_start

        records_per_mb = record_count / file_size_mb

        print(f"File size: {file_size_mb:.2f} MB")
        print(f"Records per MB: ~{records_per_mb:.0f}")

        # Extrapolate for larger datasets
        print(f"\nExtrapolation:")
        for target_size in [50000, 100000, 500000]:
            estimated_mb = target_size / records_per_mb
            if elapsed_seconds is None:
                print(f"  {target_size:,} records → ~{estimated_mb:.0f} MB")
            else:
                estimated_time_hours = (target_size / record_count) * elapsed_seconds / 3600
                print(f"  {target_size:,} records → ~{estimated_mb:.0f} MB, ~{estimated_time_hours:.1f} hours")

        print(f"\nSample data:")
        print(df.head(2))

    except Exception as e:
        print(f"Analysis error: {e}")

def estimate_full_dataset_time():
    """
    Print rough download-time estimates for several dataset sizes.

    The model assumes the first 5,000 records arrive at the fast rate
    (5000 records per 0.3 s) and everything beyond that at the throttled
    rate (20 records per 0.2 s). Results are printed in minutes, or in
    hours once the estimate reaches one hour.

    Returns:
        None. All results are printed.
    """
    print("\n=== FULL DATASET TIME ESTIMATES ===")
    print("Based on your throttling experience:")
    print()

    fast_rate = 5000 / 0.3   # records/sec before throttling
    slow_rate = 20 / 0.2     # records/sec once throttled
    cutoff = 5000            # records served before throttling starts

    for size in (10000, 50000, 100000, 500000):
        if size > cutoff:
            # Fast portion up to the cutoff, throttled portion after it
            seconds = cutoff / fast_rate + (size - cutoff) / slow_rate
        else:
            seconds = size / fast_rate

        minutes = seconds / 60
        hours = minutes / 60

        duration = f"~{hours:.1f} hours" if hours >= 1 else f"~{minutes:.0f} minutes"
        print(f"  {size:,} records: {duration}")

    print()
    print("💡 Recommendations:")
    for tip in (
        "  - Start with 10K records to validate approach",
        "  - For larger datasets, consider running overnight",
        "  - Monitor for throttling and adjust delays accordingly",
    ):
        print(tip)

In [None]:
# Script entry point: show the time model, then run the 10K download only
# after the user confirms with 'y'.
if __name__ == "__main__":
    print("ChEMBL Conservative Query")
    print("Designed to work with rate limiting based on your experience")
    print()

    # Show time estimates first
    estimate_full_dataset_time()

    print()
    choice = input("Proceed with 10K record test? (y/n): ")

    if choice.lower() != 'y':
        print("Cancelled.")
    else:
        # Module-level timestamp; analyze_data_quick reads a global named
        # start_time for its time extrapolation.
        start_time = time.time()
        total = conservative_chembl_query()

        print(
            f"\n✅ Successfully retrieved {total:,} records!"
            if total > 0
            else "\n❌ No data retrieved"
        )

In [8]:
import pandas as pd
import os
import time
from datetime import datetime
from chembl_webresource_client.new_client import new_client

def chembl_query_throttle_safe():
    """
    Download up to 10,000 human type-'A' assays using small fixed batches.

    Records are requested in slices of at most 50 and appended to the
    output CSV (any existing file is deleted first), with a fixed pause
    between requests. The offset advances by the number of records actually
    received, so short batches are handled. A failed request is retried
    after a longer pause; the run stops after several consecutive failures
    instead of retrying forever.

    Returns:
        int: number of records written to the CSV file.
    """
    batch_size = 50
    max_records = 10000
    delay = 0.5  # seconds between requests
    max_consecutive_errors = 5  # give up rather than retry indefinitely
    rawfile = "../data/raw/chembl_assays_human_A_throttled_10k.csv"

    assay = new_client.assay
    os.makedirs(os.path.dirname(rawfile), exist_ok=True)
    if os.path.exists(rawfile):
        os.remove(rawfile)

    offset = 0
    total_records = 0
    consecutive_errors = 0
    first_write = True
    start_time = time.time()

    print(f"=== ChEMBL Fixed Throttle Query ===")
    print(f"Records target: {max_records}")
    print(f"Batch size: {batch_size}")
    print(f"Delay per batch: {delay}s")
    print(f"Output file: {rawfile}")
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    while total_records < max_records:
        # Cap the request so the file never exceeds the record target
        request_size = min(batch_size, max_records - total_records)
        print(f"Fetching records {offset} to {offset + request_size}...")
        try:
            batch = assay.filter(
                assay_type='A',
                assay_organism__iexact='Homo sapiens'
            ).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[offset:offset + request_size]

            data = list(batch)
            if not data:
                print("No more data returned.")
                break

            df = pd.DataFrame(data)
            df.to_csv(rawfile, mode='a', header=first_write, index=False)
            first_write = False

            # Advance by what was actually received, not what was requested
            total_records += len(df)
            offset += len(df)
            consecutive_errors = 0

            print(f"  ✓ Got {len(df)} records. Total: {total_records}")
        except Exception as e:
            consecutive_errors += 1
            print(f"  ❌ Error: {e}")
            if consecutive_errors >= max_consecutive_errors:
                print(f"  Giving up after {consecutive_errors} consecutive errors")
                break
            print(f"  Waiting extra {delay*2}s before retrying...")
            time.sleep(delay * 2)
            continue

        print(f"  Sleeping {delay}s to avoid throttling...\n")
        time.sleep(delay)

    elapsed = time.time() - start_time
    print(f"=== DONE ===")
    print(f"Total records: {total_records}")
    print(f"Total time: {elapsed/60:.1f} minutes")
    return total_records

# Run the fixed-batch download when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    chembl_query_throttle_safe()


=== ChEMBL Fixed Throttle Query ===
Records target: 10000
Batch size: 50
Delay per batch: 0.5s
Output file: ../data/raw/chembl_assays_human_A_throttled_10k.csv
Started at: 2025-06-17 13:13:42

Fetching records 0 to 50...
  ✓ Got 50 records. Total: 50
  Sleeping 0.5s to avoid throttling...

Fetching records 50 to 100...
  ✓ Got 20 records. Total: 70
  Sleeping 0.5s to avoid throttling...

Fetching records 70 to 120...
  ✓ Got 20 records. Total: 90
  Sleeping 0.5s to avoid throttling...

Fetching records 90 to 140...
  ✓ Got 20 records. Total: 110
  Sleeping 0.5s to avoid throttling...

Fetching records 110 to 160...
  ✓ Got 20 records. Total: 130
  Sleeping 0.5s to avoid throttling...

Fetching records 130 to 180...
  ✓ Got 20 records. Total: 150
  Sleeping 0.5s to avoid throttling...

Fetching records 150 to 200...
  ✓ Got 20 records. Total: 170
  Sleeping 0.5s to avoid throttling...

Fetching records 170 to 220...
  ✓ Got 20 records. Total: 190
  Sleeping 0.5s to avoid throttling...



KeyboardInterrupt: 