In [1]:
from langchain_community.document_loaders import UnstructuredURLLoader
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
urls = [
"https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146","https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561"]

In [3]:
# Method 1: Basic retry with delays
def load_with_retry(urls, max_retries=3, delay=2):
    """Load URLs with retry logic and delays"""
    successful_docs = []
    failed_urls = []
    
    for url in urls:
        for attempt in range(max_retries):
            try:
                print(f"Attempting to load: {url} (attempt {attempt + 1})")
                loader = UnstructuredURLLoader(urls=[url])
                docs = loader.load()
                successful_docs.extend(docs)
                print(f"✓ Successfully loaded: {url}")
                break
            except Exception as e:
                print(f"✗ Failed attempt {attempt + 1} for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(delay * (attempt + 1))  # Exponential backoff
                else:
                    failed_urls.append(url)
    
    return successful_docs, failed_urls

In [4]:
# Method 2: Custom session with headers and retry strategy
def load_with_custom_session(urls):
    """Load URLs with custom session configuration"""
    
    # Create a custom session with retry strategy
    session = requests.Session()
    
    # Add retry strategy
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Add headers to mimic a browser
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    })
    
    successful_docs = []
    failed_urls = []
    
    for url in urls:
        try:
            print(f"Loading with custom session: {url}")
            
            # First, test the URL with requests
            response = session.get(url, timeout=30)
            response.raise_for_status()
            
            # If successful, use UnstructuredURLLoader
            loader = UnstructuredURLLoader(urls=[url])
            docs = loader.load()
            successful_docs.extend(docs)
            print(f"✓ Successfully loaded: {url}")
            
        except Exception as e:
            print(f"✗ Failed to load {url}: {str(e)}")
            failed_urls.append(url)
            
        time.sleep(1)  # Be respectful to the server
    
    return successful_docs, failed_urls


In [5]:
# Method 3: Individual URL processing with detailed error handling
def load_urls_individually(urls):
    """Process each URL individually with detailed error reporting"""
    results = {
        'successful': [],
        'failed': [],
        'errors': {}
    }
    
    for i, url in enumerate(urls, 1):
        print(f"\n--- Processing URL {i}/{len(urls)} ---")
        print(f"URL: {url}")
        
        try:
            # Test connectivity first
            response = requests.head(url, timeout=10, allow_redirects=True)
            print(f"Status Code: {response.status_code}")
            
            if response.status_code == 200:
                loader = UnstructuredURLLoader(urls=[url])
                docs = loader.load()
                results['successful'].extend(docs)
                print(f"✓ Content loaded successfully")
                print(f"  Document length: {len(docs[0].page_content) if docs else 0} characters")
            else:
                results['failed'].append(url)
                results['errors'][url] = f"HTTP {response.status_code}"
                
        except requests.exceptions.ConnectionError as e:
            results['failed'].append(url)
            results['errors'][url] = f"Connection Error: {str(e)}"
            print(f"✗ Connection Error: {str(e)}")
            
        except requests.exceptions.Timeout as e:
            results['failed'].append(url)
            results['errors'][url] = f"Timeout Error: {str(e)}"
            print(f"✗ Timeout Error: {str(e)}")
            
        except Exception as e:
            results['failed'].append(url)
            results['errors'][url] = f"General Error: {str(e)}"
            print(f"✗ Error: {str(e)}")
    
    return results


In [6]:
print("=== Method 1: Basic Retry ===")
successful_docs, failed_urls = load_with_retry(urls)
print(f"\nSuccessfully loaded: {len(successful_docs)} documents")
print(f"Failed URLs: {len(failed_urls)}")

=== Method 1: Basic Retry ===
Attempting to load: https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146 (attempt 1)
✓ Successfully loaded: https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146
Attempting to load: https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561 (attempt 1)
✓ Successfully loaded: https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561

Successfully loaded: 2 documents
Failed URLs: 0


In [7]:
print("\n=== Method 2: Custom Session ===")
successful_docs_2, failed_urls_2 = load_with_custom_session(urls)
print(f"\nSuccessfully loaded: {len(successful_docs_2)} documents")
print(f"Failed URLs: {len(failed_urls_2)}")


=== Method 2: Custom Session ===
Loading with custom session: https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146
✓ Successfully loaded: https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146
Loading with custom session: https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561
✓ Successfully loaded: https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561

Successfully loaded: 2 documents
Failed URLs: 0


In [8]:
print("\n=== Method 3: Individual Processing ===")
results = load_urls_individually(urls)
print(f"\nSuccessful: {len(results['successful'])} documents")
print(f"Failed: {len(results['failed'])} URLs")
for url, error in results['errors'].items():
    print(f"  {url}: {error}")

# If you got some successful documents, you can work with them:
if successful_docs:
    print(f"\n=== Sample Content ===")
    print(f"First document preview:")
    print(successful_docs[0].page_content[:500] + "..." if len(successful_docs[0].page_content) > 500 else successful_docs[0].page_content)


=== Method 3: Individual Processing ===

--- Processing URL 1/2 ---
URL: https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146
Status Code: 202

--- Processing URL 2/2 ---
URL: https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561
Status Code: 202

Successful: 0 documents
Failed: 2 URLs
  https://www.binance.com/en/square/post/08-22-2025-whale-sells-ethereum-for-significant-profit-in-august-28652349496146: HTTP 202
  https://www.binance.com/en/square/post/08-22-2025-hemi-blockchain-network-reveals-token-economics-28641299655561: HTTP 202

=== Sample Content ===
First document preview:

