In [1]:
import requests
import pandas as pd
import json
import os
import re
import time
import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from chembl_webresource_client.new_client import new_client

  # Record the installed chembl_webresource_client version for provenance.
  # importlib.metadata is the standard-library API for this; pkg_resources
  # (setuptools) is deprecated and is kept only as a fallback for
  # interpreters older than Python 3.8, where importlib.metadata is missing.
  try:
      from importlib.metadata import version as _dist_version
  except ImportError:  # Python < 3.8
      def _dist_version(name):
          """Return the installed version string of distribution ``name``."""
          return __import__('pkg_resources').get_distribution(name).version

  __version__ = _dist_version('chembl_webresource_client')


In [2]:
# Handle to the `assay` resource of the ChEMBL web-resource client; the
# download cell below builds its filtered, paginated queries from it.
assay = new_client.assay

In [None]:
# Page through ChEMBL assays (assay_type 'A', organism 'Homo sapiens') and
# stream them to a CSV file in chunks: batches are buffered in `results`,
# written every `write_frequency` batches, and the buffer is cleared after
# each write so the full result never has to sit in memory at once.

batch_size = 20
offset = 0
max_records = 1000000  # or None for full
rawfile = "../data/raw/test_1M.csv"

# Batch collection settings
write_frequency = 100  # Write every 100 batches
results = []  # Temporary storage for batches
first_write = True
total_records = 0
batch_count = 0
csv_columns = None  # Column order of the first written chunk (i.e. of the CSV header)


def _write_chunk(frames, path, is_first, columns=None):
    """Concatenate buffered batches and write them to the CSV at ``path``.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Buffered batches to write; must be non-empty.
    path : str
        Destination CSV file.
    is_first : bool
        True creates/truncates the file and writes the header row;
        False appends the rows without a header.
    columns : list of str, optional
        Column order of the header already in the file. When given, the
        chunk is reindexed to it so appended rows line up with the header
        (missing columns become empty, unexpected ones are dropped).

    Returns
    -------
    pandas.DataFrame
        The chunk exactly as it was written.
    """
    chunk = pd.concat(frames, ignore_index=True)
    if columns is not None:
        chunk = chunk.reindex(columns=columns)
    chunk.to_csv(path, mode='w' if is_first else 'a', header=is_first, index=False)
    return chunk


# Create the output directory up front: otherwise a missing directory is only
# discovered at the first write, after `write_frequency` batches were fetched.
out_dir = os.path.dirname(rawfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

try:
    while True:
        # Clamp the slice so no more than max_records rows are ever requested,
        # even when max_records is not a multiple of batch_size.
        end = offset + batch_size
        if max_records:
            end = min(end, max_records)
        print(f"Fetching records {offset} to {end}...")

        batch = assay.filter(
            assay_type='A',
            assay_organism__iexact='Homo sapiens'
        ).only(['assay_type', 'description', 'assay_chembl_id', 'assay_organism'])[offset:end]

        # An empty page means the query is exhausted.
        if not batch:
            break

        # Convert batch to DataFrame and collect
        df_batch = pd.DataFrame(batch)
        results.append(df_batch)
        batch_count += 1

        # Write every `write_frequency` batches or when we reach max_records
        reached_limit = bool(max_records) and end >= max_records
        should_write = (batch_count % write_frequency == 0) or reached_limit

        if should_write:
            chunk_df = _write_chunk(results, rawfile, first_write, csv_columns)

            if first_write:
                # Remember the header order so later appends stay aligned with it.
                csv_columns = list(chunk_df.columns)
                print(f"Created {rawfile} and wrote first chunk ({len(chunk_df)} records)")
                first_write = False
            else:
                print(f"Appended chunk ({len(chunk_df)} records)")

            total_records += len(chunk_df)
            print(f"Total records written so far: {total_records}")

            # Clear memory
            results.clear()

        offset = end
        if reached_limit:
            break
        time.sleep(0.5)  # adjust delay if needed
finally:
    # Write any remaining batches. Running this in `finally` also saves the
    # batches already fetched if the loop is interrupted or a request fails.
    if results:
        final_chunk = _write_chunk(results, rawfile, first_write, csv_columns)
        total_records += len(final_chunk)
        print(f"Wrote final chunk ({len(final_chunk)} records)")

if total_records > 0:
    print(f"\n✅ Successfully wrote {total_records} records to {rawfile}")
else:
    print("\n⚠️ No data retrieved.")

Fetching records 0 to 20...
Fetching records 20 to 40...
Fetching records 40 to 60...
Fetching records 60 to 80...
Fetching records 80 to 100...
Fetching records 100 to 120...
Fetching records 120 to 140...
Fetching records 140 to 160...
Fetching records 160 to 180...
Fetching records 180 to 200...
Fetching records 200 to 220...
Fetching records 220 to 240...
Fetching records 240 to 260...
Fetching records 260 to 280...
Fetching records 280 to 300...
Fetching records 300 to 320...
Fetching records 320 to 340...
Fetching records 340 to 360...
Fetching records 360 to 380...
Fetching records 380 to 400...
Fetching records 400 to 420...
Fetching records 420 to 440...
Fetching records 440 to 460...
Fetching records 460 to 480...
Fetching records 480 to 500...
Fetching records 500 to 520...
Fetching records 520 to 540...
Fetching records 540 to 560...
Fetching records 560 to 580...
Fetching records 580 to 600...
Fetching records 600 to 620...
Fetching records 620 to 640...
Fetching records 6