In [None]:
import sys

# Install the brave-search package into the running kernel's environment
# %pip is an IPython magic: keep it on its own line at the top level of a cell; it is not valid syntax in a plain Python script
%pip install brave-search

In [1]:
# just use config.py. For this prototype's purpose, we'll hardcode this here.
import json
import os

config_path = r"D:\Ennchan\OneDrive\Documents\sandbox\ennchan-langchain-rag\config.json"

try:
    # Load the configuration
    with open(config_path, "r") as f:
        config = json.load(f)
    
    print(f"Config loaded successfully with {len(config)} settings")

except FileNotFoundError:
    print("File not found")


Config loaded successfully with 11 settings


In [2]:
# InvokeSearch
from brave import Brave
from pprint import pprint as pr

os.environ["BRAVE_API_KEY"] = config["BRAVE_API_KEY"]
brave = Brave()

query = "cobalt mining"
num_results = 10

search_results = brave.search(q=query, raw=True)
output = search_results["web"]["results"]

In [3]:
# SearchResultClass
search_results = [{
    "title": result["title"],
    "url": result["url"],
    "description": result["description"],
} for result in output]


In [6]:
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures
from tqdm.notebook import tqdm
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Install required packages
import sys
import subprocess
try:
    import lxml
except ImportError:
    print("Installing lxml...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml"])
    import lxml

def create_session():
    """Create a session with connection pooling and retry logic"""
    session = requests.Session()
    
    # Configure retry strategy
    retry_strategy = Retry(
        total=2,  # Maximum number of retries
        backoff_factor=0.5,  # Backoff factor for retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        allowed_methods=["GET"]  # Only retry GET requests
    )
    
    # Mount the adapter to the session with connection pooling
    adapter = HTTPAdapter(
        max_retries=retry_strategy,
        pool_connections=20,  # Number of connection pools
        pool_maxsize=20  # Connections per pool
    )
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Set a more browser-like user agent to avoid blocking
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    })
    
    return session

def extract_main_content(url, session):
    """Extract main content from a URL using an existing session"""
    try:
        # Use session for connection pooling
        response = session.get(url, timeout=10)
        response.raise_for_status()
        
        # Use lxml parser which is faster than the default
        soup = BeautifulSoup(response.content, "lxml")
        
        # Fix for CSS selector syntax error
        for element_type in ['nav', 'header', 'footer', 'script', 'style', 'noscript', 'aside', 'iframe', 'form', 'button']:
            for element in soup.find_all(element_type):
                element.decompose()
        
        # Also remove elements with common ad/popup class names
        for class_name in ['ads', 'banner', 'cookie', 'popup', 'newsletter', 'subscription']:
            for element in soup.find_all(class_=lambda c: c and class_name in c.lower()):
                element.decompose()
        
        # Try to find main content container
        main_content = None
        for container in ['main', 'article', 'div[role="main"]', 'div.main-content', 'div#content', 
                         'div.post-content', 'div.article-content', 'div.entry-content', 'div.content']:
            try:
                content = soup.select(container)
                if content:
                    main_content = content[0]
                    break
            except Exception:
                continue
        
        # If no main container found, use body
        if not main_content:
            main_content = soup.body
        
        # Extract paragraphs with meaningful content
        paragraphs = main_content.find_all('p')
        meaningful_paragraphs = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 50]
        
        if meaningful_paragraphs:
            text = '\n\n'.join(meaningful_paragraphs)
        else:
            # Fall back to main content text
            text = main_content.get_text(separator='\n\n', strip=True)
            
        # Clean up the text with a single regex
        text = re.sub(r'(\n{2,}|\s{2,})', ' ', text)
        
        return {"url": url, "content": text, "success": True}
    except Exception as e:
        return {"url": url, "content": "", "success": False, "error": str(e)}

def process_batch(urls, session):
    """Extract content for each URL in ``urls`` one after another, reusing ``session``.

    Returns:
        list: one ``extract_main_content`` result dict per URL, in input order.
    """
    return [extract_main_content(page_url, session) for page_url in urls]

def process_search_results(search_results, max_workers=5, batch_size=3):
    """Download and clean the pages behind a list of search results.

    URLs are de-duplicated, split into batches of ``batch_size`` and fetched
    in parallel. Each batch uses its own HTTP session, which is closed as
    soon as the batch finishes.

    Args:
        search_results: Sequence of dicts with ``"title"``, ``"url"`` and
            ``"description"`` keys.
        max_workers: Maximum number of batches processed concurrently.
        batch_size: Number of URLs fetched sequentially per batch (>= 1).

    Returns:
        list: one dict per successfully processed URL with ``"title"``,
        ``"url"``, ``"description"`` and ``"content"`` keys, in the order the
        URLs first appear in ``search_results``. Failed URLs are reported on
        stdout and omitted.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not search_results:
        return []

    # Map each URL to its search result (last duplicate wins). Dict keys keep
    # first-seen order, so this also gives the unique URLs in rank order.
    url_to_result = {result["url"]: result for result in search_results}
    urls = list(url_to_result)

    # Create batches of URLs; each batch is processed with a single session
    url_batches = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

    def _fetch_batch(batch):
        # One session per batch: no session is ever used by two threads at
        # once, and the context manager releases its connections afterwards.
        with create_session() as session:
            return process_batch(batch, session)

    all_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_fetch_batch, batch) for batch in url_batches]

        # Collect results as they complete with a progress bar
        for future in tqdm(concurrent.futures.as_completed(futures),
                           total=len(futures),
                           desc="Processing URL batches"):
            try:
                all_results.extend(future.result())
            except Exception as e:
                print(f"Batch processing error: {e}")

    # Separate successes from failures
    content_by_url = {}
    for data in all_results:
        if data["success"]:
            content_by_url[data["url"]] = data["content"]
        else:
            print(f"Failed to process {data['url']}: {data.get('error', 'Unknown error')}")

    # Emit in original search order so the output is deterministic
    return [
        {
            "title": url_to_result[url]["title"],
            "url": url,
            "description": url_to_result[url]["description"],
            "content": content_by_url[url],
        }
        for url in urls
        if url in content_by_url
    ]

# Usage: time one full scrape of the search hits, then show the cleaned records
start_time = time.time()
cleaned_results = process_search_results(
    search_results,
    max_workers=5,
    batch_size=3,
)
elapsed_seconds = time.time() - start_time
print(f"Total processing time: {elapsed_seconds:.2f} seconds")
pr(cleaned_results)


Processing URL batches:   0%|          | 0/7 [00:00<?, ?it/s]

Failed to process https://www.mining-technology.com/news/cobalt-holdings-ipo-london/: 'NoneType' object has no attribute 'find_all'
Failed to process https://www.savethechildren.net/stories/drc-cobalt-mines-child-labour-and-green-transition: 403 Client Error: Forbidden for url: https://www.savethechildren.net/stories/drc-cobalt-mines-child-labour-and-green-transition
Failed to process https://www.washingtonpost.com/world/interactive/2023/ev-cobalt-mines-congo/: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Max retries exceeded with url: /world/interactive/2023/ev-cobalt-mines-congo/ (Caused by ReadTimeoutError("HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=10)"))
Failed to process https://www.sciencedirect.com/science/article/pii/S0301420722004500: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/pii/S0301420722004500
Total processing time: 33.48 seconds
[{'content': '',
  'description': 'Wh