Rate Limit Testing 

The purpose of this notebook is to explore techniques for working within OpenAI's rate limits.

In [2]:
#import src
#from src.components.promptrunner import ResponseClient
#from src.prj_logger import ProjectLogger, get_logs
#from src import utils

"""
Async Support: Fully compatible with AsyncOpenAI and async/await patterns

Proactive Rate Limiting: Instead of just reacting to rate limit errors, this implementation:

- Tracks request timestamps
- Proactively waits when approaching the limit
- Avoids most rate limit errors entirely

Concurrent Request Handling: Supports making multiple requests concurrently with asyncio.gather

Clean API: Wraps the complexity in a simple RateLimitOpenAIClient class

Type Hints: Added for better IDE support and code clarity

Usage Notes
The max_requests_per_minute is set to 490 by default (slightly below the 500 limit) to provide a safety buffer
The rate limiter uses a rolling window approach, tracking timestamps of requests in the last minute
The async backoff function properly awaits between retries
This implementation should handle high-volume API usage efficiently while minimizing rate limit errors.
"""

In [2]:
import asyncio
import random
import time
from typing import Callable, TypeVar, Any, Awaitable, List, Dict, Union
from openai import OpenAI, AsyncOpenAI, RateLimitError
from openai.types.chat import ChatCompletion
import nest_asyncio

In [3]:
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop when asyncio.run() is called in later cells — confirm.
nest_asyncio.apply()

In [4]:
# Type variable for generic function return type: lets a wrapper declare that
# it returns whatever the awaitable it wraps returns.
T = TypeVar('T')

In [5]:
import sys

# On Windows only, create a ProactorEventLoop and install it as the current
# event loop; other platforms keep their default loop.
# NOTE(review): the reason for forcing the proactor loop is not stated here — confirm.
if sys.platform == 'win32':
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)

In [None]:
# asyncio.ProactorEventLoop only exists on Windows; referencing it
# unconditionally raises AttributeError on other platforms, so install it
# only where it is available and keep the default loop everywhere else.
if sys.platform == 'win32':
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)

class OpenAIRateLimiter:
    """Rolling-window rate limiter for OpenAI API to stay under 500 requests per minute.

    Records when each request was admitted and makes callers wait so that no
    more than ``max_requests_per_minute`` requests are admitted within any
    ``window_seconds`` span.

    Args:
        max_requests_per_minute: Maximum requests admitted per window. The
            default of 490 leaves a buffer of 10 under a 500/minute limit.
        window_seconds: Length of the rolling window in seconds (60 by default).

    Raises:
        ValueError: If ``max_requests_per_minute`` is below 1 or
            ``window_seconds`` is not positive.
    """

    def __init__(self, max_requests_per_minute: int = 490, window_seconds: float = 60.0):
        # A non-positive limit can never admit a request; reject it up front
        # instead of failing later on an empty timestamp list.
        if max_requests_per_minute < 1:
            raise ValueError("max_requests_per_minute must be at least 1")
        if window_seconds <= 0:
            raise ValueError("window_seconds must be positive")
        self.max_requests = max_requests_per_minute
        self.window_seconds = window_seconds
        # time.monotonic() readings of admitted requests, oldest first.
        self.request_timestamps = []
        self.lock = asyncio.Lock()

    def _drop_expired(self, now: float) -> None:
        """Forget timestamps that are no longer inside the rolling window."""
        cutoff = now - self.window_seconds
        self.request_timestamps = [ts for ts in self.request_timestamps if ts > cutoff]

    async def wait_if_needed(self):
        """Wait if we're at the rate limit, then record this request.

        The lock is held while sleeping, so concurrent callers queue behind the
        waiting one and are admitted one at a time.
        """
        async with self.lock:
            # Monotonic time is used so wall-clock adjustments cannot distort
            # the measured age of a request.
            self._drop_expired(time.monotonic())

            # Loop (rather than check once) so a request is only admitted when
            # there is really room in the window.
            while len(self.request_timestamps) >= self.max_requests:
                # Timestamps are appended in increasing order, so the first
                # entry is the oldest and the next one to leave the window.
                oldest_timestamp = self.request_timestamps[0]
                wait_time = (
                    oldest_timestamp + self.window_seconds - time.monotonic() + 0.1
                )  # Add a small buffer
                print(f"Rate limit approaching, waiting {wait_time:.2f}s...")
                await asyncio.sleep(wait_time)
                self._drop_expired(time.monotonic())

            # Record this request
            self.request_timestamps.append(time.monotonic())


async def async_retry_with_backoff(
    func: Callable[..., Awaitable[T]],
    *args: Any,
    initial_delay: float = 1,
    factor: float = 2,
    jitter: bool = True,
    max_retries: int = 5,
    **kwargs: Any
) -> T:
    """
    Call an async function, retrying with exponential backoff on RateLimitError.

    This is a plain helper, not a decorator: pass the coroutine function and
    its arguments, and await the result.

    Args:
        func: Coroutine function to call.
        *args: Positional arguments forwarded to ``func``.
        initial_delay: Base delay in seconds before the first retry.
        factor: Multiplier applied to the base delay after each failed attempt.
        jitter: When true, each sleep is stretched by a random 0-100%.
        max_retries: Total number of attempts (including the first one).
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is below 1 (``func`` would never run).
        Exception: If every attempt hit a RateLimitError; the last
            RateLimitError is attached as ``__cause__``.
    """
    # Without at least one attempt the loop below would be skipped entirely and
    # the caller would get a misleading "retries exceeded" error.
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    delay = initial_delay

    for i in range(max_retries):
        try:
            return await func(*args, **kwargs)
        except RateLimitError as e:
            if i == max_retries - 1:
                raise Exception(f"Max retries exceeded: {e}") from e

            sleep_time = delay * (1 + (random.random() if jitter else 0))
            print(f"Rate limited, retrying in {sleep_time:.1f}s... (Attempt {i+1}/{max_retries})")
            await asyncio.sleep(sleep_time)
            delay *= factor

    # Not reachable when max_retries >= 1; kept as a defensive fallback.
    raise Exception("Max retries exceeded.")


class RateLimitOpenAIClient:
    """Wrapper for AsyncOpenAI client with rate limiting and retries.

    Args:
        api_key: OpenAI API key. When omitted (``None``) the value is passed
            through to ``AsyncOpenAI`` unchanged, which lets the client library
            resolve the key itself (it documents reading ``OPENAI_API_KEY``
            from the environment) instead of sending the literal text "None".
        max_requests_per_minute: Limit handed to the internal OpenAIRateLimiter.
    """

    def __init__(self, api_key: Union[str, None] = None, max_requests_per_minute: int = 490):
        self.client = AsyncOpenAI(api_key=api_key)
        self.rate_limiter = OpenAIRateLimiter(max_requests_per_minute)

    async def chat_completion(self, model: str, messages: list, **kwargs) -> ChatCompletion:
        """Make a chat completion request with rate limiting and retries.

        Args:
            model: Model name forwarded to ``chat.completions.create``.
            messages: Chat messages forwarded to ``chat.completions.create``.
            **kwargs: Extra keyword arguments. As before, they pass through
                ``async_retry_with_backoff``, so its own options
                (``initial_delay``, ``factor``, ``jitter``, ``max_retries``)
                are consumed there and the rest reach the API call.

        Returns:
            The ChatCompletion returned by the API.
        """

        async def limited_create(**create_kwargs):
            # Every attempt, including retries, is a request that counts toward
            # the limit, so each one has to pass through the rate limiter.
            await self.rate_limiter.wait_if_needed()
            return await self.client.chat.completions.create(**create_kwargs)

        return await async_retry_with_backoff(
            limited_create,
            model=model,
            messages=messages,
            **kwargs
        )


from dotenv import dotenv_values
from pydantic import BaseModel, SecretStr, Field

# Example usage
async def main():
    """Fire 1000 rate-limited chat completion requests and print each outcome.

    Reads ``OPENAI_API_KEY`` from ``../.env``, sends the requests concurrently
    through RateLimitOpenAIClient, and prints either the reply text or the
    error for every request.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is missing or empty in ``../.env``.
    """
    # Load config settings
    DOT_ENV = dotenv_values("../.env")
    openai_api_key = DOT_ENV.get('OPENAI_API_KEY')
    # Fail early with a clear message; str() on a missing value would otherwise
    # produce the bogus key "None".
    if not openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is missing or empty in ../.env")
    openai_api_key_secret = SecretStr(openai_api_key)

    # Initialize the client
    openai_client = RateLimitOpenAIClient(api_key=openai_api_key_secret.get_secret_value())

    # Example of making multiple concurrent requests
    tasks = [
        openai_client.chat_completion(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": f"Explain Kubernetes concept #{i+1} briefly."}]
        )
        for i in range(1000)
    ]

    # Wait for all requests to complete. Exceptions are collected rather than
    # raised so one failed request does not throw away the other results.
    responses = await asyncio.gather(*tasks, return_exceptions=True)

    # Process responses
    for i, response in enumerate(responses):
        print(f"\nResponse {i+1}:")
        if isinstance(response, BaseException):
            print(f"Request failed: {response}")
        else:
            print(response.choices[0].message.content)

asyncio.run(main())


Rate limit approaching, waiting 59.76s...
Rate limit approaching, waiting 55.64s...

Response 1:
Kubernetes is an open-source container orchestration platform that automates the deployment, scaling, and management of containerized applications. The first key concept in Kubernetes is **"Pod."**

A **Pod** is the smallest and simplest unit in Kubernetes. It represents a single instance of a running process in your cluster and can contain one or more containers that share the same network namespace and storage. Pods are used to host applications, and they enable containers to communicate easily and share resources like volumes. Each Pod is assigned a unique IP address, and it can be managed, scaled, and orchestrated using Kubernetes principles and tools.

Response 2:
Kubernetes Concept #2: **Pods**

A Pod is the smallest and simplest deployable unit in Kubernetes. It can contain one or more containers that share the same lifecycle, network, and storage resources. Containers within a Pod a

In [None]:
class RateLimiter:
    """Fixed-window rate limiter: at most ``max_calls`` per ``time_period`` seconds.

    Calls are counted inside a window that starts at construction time. Once
    the window is full, the next caller sleeps until the window ends and then
    starts a fresh one.
    """

    def __init__(self, max_calls: int, time_period: int):
        """
        Initialize a rate limiter

        Args:
            max_calls: Maximum number of calls allowed in the time period
            time_period: Time period in seconds

        Raises:
            ValueError: If max_calls is below 1 or time_period is not positive
        """
        if max_calls < 1:
            raise ValueError("max_calls must be at least 1")
        if time_period <= 0:
            raise ValueError("time_period must be positive")
        self.max_calls = max_calls
        self.time_period = time_period
        self.calls_made = 0
        # Monotonic clock reading: wall-clock adjustments must not stretch or
        # shrink the window.
        self.start_time = time.monotonic()
        self.lock = asyncio.Lock()

    def _start_new_window(self) -> None:
        """Reset the call counter and restart the window at the current time."""
        self.calls_made = 0
        self.start_time = time.monotonic()

    async def acquire(self):
        """Acquire permission to make a call, waiting if necessary"""
        async with self.lock:
            time_elapsed = time.monotonic() - self.start_time

            if time_elapsed >= self.time_period:
                # Window is over (including the exact boundary): start afresh.
                self._start_new_window()
            elif self.calls_made >= self.max_calls:
                # Window is full: sleep out the remainder, which is strictly
                # positive in this branch, then start a new window. The lock
                # stays held so other callers queue behind this one.
                await asyncio.sleep(self.time_period - time_elapsed)
                self._start_new_window()

            # Increment the call counter
            self.calls_made += 1

async def rate_limited_gather(
    rate_limiter: RateLimiter,
    coroutines: List[Callable[..., T]],
    *args,
    **kwargs
) -> List[T]:
    """
    Run a batch of coroutine functions, each one gated by the rate limiter.

    Every callable in ``coroutines`` is invoked with the same ``*args`` and
    ``**kwargs``, but only after ``rate_limiter.acquire()`` has returned.

    Args:
        rate_limiter: Limiter whose ``acquire()`` is awaited before each call
        coroutines: Coroutine functions to call
        *args, **kwargs: Arguments handed to every coroutine function

    Returns:
        The results, in the same order as ``coroutines``
    """
    async def throttled(factory):
        # Take a slot from the limiter before the real work starts.
        await rate_limiter.acquire()
        result = await factory(*args, **kwargs)
        return result

    pending = []
    for factory in coroutines:
        pending.append(throttled(factory))
    return await asyncio.gather(*pending)

# Example usage
async def make_request(url: str) -> Any:
    """Simulate an HTTP request: pause briefly, print the URL, return a canned reply."""
    # Stand-in for a real client such as aiohttp or httpx; the pause plays the
    # role of network latency.
    simulated_latency = 0.1
    await asyncio.sleep(simulated_latency)
    print(url)
    reply = f"Response from {url}"
    return reply

async def main():
    """Push 1000 simulated requests through a 500-per-minute limiter and report timing."""
    # Allow 500 calls in each 60-second period.
    rate_limiter = RateLimiter(max_calls=500, time_period=60)

    # Build 1000 zero-argument callables (twice the limit). Each lambda pins
    # its own URL through a default argument so the callables do not all share
    # the loop variable's final value.
    request_coroutines = []
    for i in range(1000):
        url = f"https://example.com/api/endpoint/{i}"
        request_coroutines.append(lambda url=url: make_request(url))

    print(f"Starting {len(request_coroutines)} requests with rate limit of 500/minute")

    # Time the whole rate-limited batch.
    start_time = time.time()
    results = await rate_limited_gather(rate_limiter, request_coroutines)
    elapsed = time.time() - start_time

    print(f"Completed {len(results)} requests in {elapsed:.2f} seconds")
    print(f"Average rate: {len(results)/elapsed*60:.2f} requests per minute")

asyncio.run(main())

Starting 1000 requests with rate limit of 500/minute
https://example.com/api/endpoint/0
https://example.com/api/endpoint/2
https://example.com/api/endpoint/6
https://example.com/api/endpoint/14
https://example.com/api/endpoint/30
https://example.com/api/endpoint/62
https://example.com/api/endpoint/126
https://example.com/api/endpoint/125
https://example.com/api/endpoint/61
https://example.com/api/endpoint/124
https://example.com/api/endpoint/123
https://example.com/api/endpoint/29
https://example.com/api/endpoint/60
https://example.com/api/endpoint/122
https://example.com/api/endpoint/121
https://example.com/api/endpoint/59
https://example.com/api/endpoint/120
https://example.com/api/endpoint/119
https://example.com/api/endpoint/13
https://example.com/api/endpoint/28
https://example.com/api/endpoint/58
https://example.com/api/endpoint/118
https://example.com/api/endpoint/117
https://example.com/api/endpoint/57
https://example.com/api/endpoint/116
https://example.com/api/endpoint/115
ht

Updated code for Langchain RunnableSequence Integration

In [None]:
import asyncio
import random
import time
from typing import Any, Callable, Awaitable, List, TypeVar
# Generic return type: whatever the wrapped awaitable produces.
T = TypeVar("T")

# --- Rate Limiter ------------------------------------

class OpenAIRateLimiter:
    """
    Rate limiter for OpenAI API to stay under X requests per minute.
    Useful for controlling LangChain chain LLM calls.

    Records when each request was admitted and makes callers wait so that no
    more than ``max_requests_per_minute`` requests are admitted within any
    ``window_seconds`` span.

    Args:
        max_requests_per_minute: Maximum requests admitted per window
            (490 by default, a conservative buffer).
        window_seconds: Length of the rolling window in seconds (60 by default).

    Raises:
        ValueError: If ``max_requests_per_minute`` is below 1 or
            ``window_seconds`` is not positive.
    """
    def __init__(self, max_requests_per_minute: int = 490, window_seconds: float = 60.0):
        # A non-positive limit can never admit a request; reject it up front
        # instead of failing later on an empty timestamp list.
        if max_requests_per_minute < 1:
            raise ValueError("max_requests_per_minute must be at least 1")
        if window_seconds <= 0:
            raise ValueError("window_seconds must be positive")
        self.max_requests = max_requests_per_minute
        self.window_seconds = window_seconds
        # time.monotonic() readings of admitted requests, oldest first.
        self.request_timestamps: List[float] = []
        self.lock = asyncio.Lock()

    def _drop_expired(self, now: float) -> None:
        """Forget timestamps that are no longer inside the rolling window."""
        cutoff = now - self.window_seconds
        self.request_timestamps = [
            ts for ts in self.request_timestamps if ts > cutoff
        ]

    async def wait_if_needed(self):
        """Block until request is safe under the rate limit, then record it.

        The lock is held while sleeping, so concurrent callers queue behind the
        waiting one and are admitted one at a time.
        """
        async with self.lock:
            # Monotonic time is used so wall-clock adjustments cannot distort
            # the measured age of a request.
            self._drop_expired(time.monotonic())
            # Loop (rather than check once) so a request is only admitted when
            # there is really room in the window.
            while len(self.request_timestamps) >= self.max_requests:
                # Timestamps are appended in increasing order, so the first
                # entry is the oldest and the next one to leave the window.
                oldest = self.request_timestamps[0]
                wait_time = oldest + self.window_seconds - time.monotonic() + 0.1
                print(f"Rate limit approaching, waiting {wait_time:.2f}s ...")
                await asyncio.sleep(wait_time)
                self._drop_expired(time.monotonic())
            self.request_timestamps.append(time.monotonic())


async def async_retry_with_backoff(
    func: Callable[..., Awaitable[T]],
    *args,
    initial_delay: float = 1,
    factor: float = 2,
    jitter: bool = True,
    max_retries: int = 5,
    retry_on: tuple = (Exception,),
    **kwargs
) -> T:
    """
    Call an async function, retrying with exponential backoff when it raises.

    Args:
        func: Coroutine function to call.
        *args: Positional arguments forwarded to ``func``.
        initial_delay: Base delay in seconds before the first retry.
        factor: Multiplier applied to the base delay after each failed attempt.
        jitter: When true, each sleep is stretched by a random 0-100%.
        max_retries: Total number of attempts (including the first one).
        retry_on: Exception types that trigger a retry. The default retries on
            any ``Exception``; pass e.g. a rate-limit error class to let
            programming errors surface immediately.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is below 1 (``func`` would never run).
        Exception: The error from the final attempt, re-raised unchanged, or
            any error not listed in ``retry_on``.
    """
    # Without at least one attempt the loop below would be skipped entirely and
    # the caller would get a misleading "retries exceeded" error.
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    delay = initial_delay
    for i in range(max_retries):
        try:
            return await func(*args, **kwargs)
        except retry_on as e:
            if i == max_retries - 1:
                raise
            sleep_time = delay * (1 + (random.random() if jitter else 0))
            # Report the actual error: not every retried failure is a rate limit.
            print(f"Call failed ({e!r}), retry attempt {i+1}/{max_retries}, sleeping {sleep_time:.1f}s...")
            await asyncio.sleep(sleep_time)
            delay *= factor
    # Not reachable when max_retries >= 1; kept as a defensive fallback.
    raise Exception("Max retries exceeded.")

# --- Langchain Integration ---------------------------

class RateLimitedChain:
    """
    Wrapper to rate-limit async invocation of any LangChain RunnableSequence

    Args:
        rate_limiter: Limiter awaited before every attempt of a chain call.
        max_retries: Total attempts per chain call, passed to
            ``async_retry_with_backoff``.
        **retry_kwargs: Further options passed to ``async_retry_with_backoff``
            on every call.
    """
    def __init__(
        self,
        rate_limiter: OpenAIRateLimiter,
        max_retries: int = 5,
        **retry_kwargs
    ):
        self.rate_limiter = rate_limiter
        self.max_retries = max_retries
        self.retry_kwargs = retry_kwargs

    async def ainvoke(self, chain, input: Any, **chain_kwargs):
        """
        Awaitably invoke one chain, obeying the rate limit and applying backoff on failure.

        Args:
            chain: Object exposing an async ``ainvoke`` method.
            input: Input handed to ``chain.ainvoke``.
            **chain_kwargs: Extra keyword arguments; they pass through
                ``async_retry_with_backoff`` on their way to ``chain.ainvoke``.

        Returns:
            The result of ``chain.ainvoke``.
        """
        async def limited_invoke(*call_args, **call_kwargs):
            # Every attempt, including retries, is a request that counts toward
            # the limit, so each one has to pass through the rate limiter.
            await self.rate_limiter.wait_if_needed()
            return await chain.ainvoke(*call_args, **call_kwargs)

        return await async_retry_with_backoff(
            limited_invoke,
            input,
            max_retries=self.max_retries,
            **chain_kwargs,
            **self.retry_kwargs
        )

    async def abatch(self, chain, inputs: List[Any], **chain_kwargs):
        """
        Awaitably invoke many chain calls (one per input), obeying the rate limit.
        This schedules all invocations under rate limiting + retries.

        Returns:
            Results in the same order as ``inputs``. The first exception raised
            by any invocation propagates to the caller.
        """
        tasks = [
            self.ainvoke(chain, inp, **chain_kwargs)
            for inp in inputs
        ]
        return await asyncio.gather(*tasks)

# ---------- Usage Example with Langchain --------------

# from langchain_core.runnables import RunnableSequence (for LangChain 0.1+)
# chain = ...  # some RunnableSequence

# from dotenv import dotenv_values
# DOT_ENV = dotenv_values("../.env")
# openai_api_key = DOT_ENV['OPENAI_API_KEY']

async def main():
    """Demo: send 50 prompts through a placeholder chain with rate limiting and retries.

    ``chain`` is left as ``...`` and must be replaced with a real
    RunnableSequence before this function is run.
    """
    from langchain_core.runnables import RunnableSequence

    # Placeholder for your own chain, e.g. chain = RunnableSequence([...]).
    # In real use the chain would call OpenAI/Anthropic LLM components internally.
    chain = ...

    # Wrap a limiter capped at 490 requests per minute.
    rl_chain = RateLimitedChain(OpenAIRateLimiter(max_requests_per_minute=490))

    # One input dict per prompt.
    inputs = []
    for i in range(50):
        inputs.append({"question": f"Explain Kubernetes concept #{i+1} briefly."})

    # Run everything under rate limiting, retrying on errors, then print in order.
    responses = await rl_chain.abatch(chain, inputs)
    for i, resp in enumerate(responses):
        print(f"Response {i+1}: {resp}")

# To run:
# asyncio.run(main())