In [None]:
### Scrapes Content and Starts From Checkpoint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
import pandas as pd
import random
import os
import json

# Checkpoint and CSV output file names (resolved against the working directory)
CHECKPOINT_FILE = "scraper_checkpoint.json"
PROGRESS_CSV = "businessday_progress3.csv"
FINAL_CSV = "businessday_final3.csv"

# Chrome profile directory passed to the browser via --user-data-dir;
# created at import time if it does not exist yet
SESSION_DIR = os.path.join(os.getcwd(), "chrome_session")
os.makedirs(SESSION_DIR, exist_ok=True)

def remove_existing_files():
    """Delete leftover progress and final CSV outputs so a new run starts clean.

    Files that do not exist are silently skipped.
    """
    for stale_csv in (PROGRESS_CSV, FINAL_CSV):
        if os.path.exists(stale_csv):
            os.remove(stale_csv)

def save_checkpoint(data):
    """Persist the current scraping state to CHECKPOINT_FILE atomically.

    The state is serialised to a sibling temporary file first and only then
    moved over the real checkpoint with os.replace. An interruption (Ctrl+C)
    or a serialisation error part-way through the write therefore cannot
    leave a truncated checkpoint behind: the previous checkpoint stays intact
    until the new one is complete.

    Args:
        data: JSON-serialisable dict. Must contain 'current_page' and
            'articles_processed', which are echoed in the log line.

    Raises:
        TypeError, ValueError: if data cannot be serialised to JSON.
        OSError: if the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(data, f)
        os.replace(tmp_path, CHECKPOINT_FILE)
    except BaseException:
        # BaseException on purpose: a KeyboardInterrupt during the dump must
        # also discard the partial temp file before propagating.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Checkpoint saved: Page {data['current_page']} | Articles: {data['articles_processed']}")

def load_checkpoint():
    """Return the saved scraping state, or None when no checkpoint file exists.

    When a checkpoint is found, the page it points at is logged before the
    parsed state is returned.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return None

    with open(CHECKPOINT_FILE, 'r') as handle:
        state = json.load(handle)
    print(f"Resuming from checkpoint: Page {state['current_page']}")
    return state

def setup_driver():
    """Create the undetected-chromedriver Chrome instance used for scraping.

    The browser reuses the profile stored in SESSION_DIR and is given a
    30-second page-load timeout.

    Returns:
        The configured uc.Chrome driver.
    """
    chrome_flags = (
        f"--user-data-dir={SESSION_DIR}",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    )

    options = uc.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    browser = uc.Chrome(options=options)
    browser.set_page_load_timeout(30)
    return browser

def random_sleep(min=2, max=5):
    """Pause for a uniformly random number of seconds between min and max."""
    pause = random.uniform(min, max)
    time.sleep(pause)

def human_scroll(driver):
    """Scroll down the page in irregular steps with short random pauses.

    The document height is read once up front; the page is then scrolled in
    300-700 px increments, pausing 0.5-1.5 s after each step, until the
    scroll offset reaches that height.
    """
    page_height = driver.execute_script("return document.body.scrollHeight")
    position = 0
    while position < page_height:
        position += random.randint(300, 700)
        driver.execute_script(f"window.scrollTo(0, {position});")
        random_sleep(0.5, 1.5)

def scrape_articles(driver, scraped_data):
    """Collect the articles listed on the page currently loaded in the driver.

    Each article not already present in scraped_data (matched by URL) is
    appended in place as a dict with the keys 'title', 'author', 'date',
    'excerpt', 'url' and 'content' (always None here; filled in later by the
    content phase).

    Args:
        driver: Selenium WebDriver showing a tag listing page.
        scraped_data: list of article dicts collected so far; mutated in place.

    Returns:
        True if the listing container was found and its items were processed
        (individual items that fail are logged and skipped), False if the
        page is still behind the Cloudflare interstitial after waiting, the
        article list never appeared, or any other page-level error occurred.
    """
    try:
        if "Just a moment" in driver.title:
            print("Cloudflare detected - waiting...")
            time.sleep(15)
            # The challenge may have cleared during the wait; only give up on
            # this page if the interstitial is still being shown.
            if "Just a moment" in driver.title:
                return False

        human_scroll(driver)
        try:
            container = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.news"))
            )
        except TimeoutException:
            # str(TimeoutException) is just an empty "Message:" plus a native
            # stack trace, so report something actionable instead.
            print("Page error: timed out waiting for article list (div.news)")
            return False
        articles = container.find_elements(By.CSS_SELECTOR, 'div.post-info')

        # Build the URL index once per page instead of rescanning the whole
        # result list for every item.
        known_urls = {entry.get('url') for entry in scraped_data}

        for item in articles:
            try:
                title_elem = item.find_element(By.CSS_SELECTOR, 'h2.post-title a')
                title = title_elem.text.strip()
                url = title_elem.get_attribute("href")

                # Skip duplicates
                if url in known_urls:
                    continue

                # Get list page data
                author = item.find_element(By.CSS_SELECTOR, '.post-author a').text
                date = item.find_element(By.CLASS_NAME, 'post-date').text
                excerpt = item.find_element(By.CSS_SELECTOR, 'p').text

                scraped_data.append({
                    'title': title,
                    'author': author,
                    'date': date,
                    'excerpt': excerpt,
                    'url': url,
                    'content': None
                })
                known_urls.add(url)
                print(f"Added: {title[:50]}...")
            except Exception as e:
                # Best effort: one malformed item must not abort the page.
                print(f"Article error: {e}")
        return True
    except Exception as e:
        print(f"Page error: {e}")
        return False

def scrape_content(driver, articles, start_index):
    """Fetch the full body text for every article from start_index onwards.

    Each article dict gets its 'content' key set to the stripped text of the
    '.post-content' element, or to "Content unavailable" if loading or
    locating it fails. A checkpoint is written after every article so the
    run can be resumed from the next index.

    Args:
        driver: Selenium WebDriver used to open each article URL.
        articles: list of article dicts; mutated in place.
        start_index: index of the first article to process.

    Returns:
        The same articles list that was passed in.
    """
    body_locator = (By.CSS_SELECTOR, ".post-content")

    for position in range(start_index, len(articles)):
        entry = articles[position]
        try:
            driver.get(entry['url'])
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(body_locator)
            )
            body_text = driver.find_element(*body_locator).text
            entry['content'] = body_text.strip()
            print(f"Content scraped: {entry['title'][:50]}...")
        except Exception as err:
            entry['content'] = "Content unavailable"
            print(f"Content error: {err}")

        # Record progress so a restart continues with the next article
        progress = {
            'current_page': 'content',
            'articles_processed': position + 1,
            'scraped_data': articles
        }
        save_checkpoint(progress)

    return articles

def _listing_url(page):
    """Return the tag listing URL for a 1-based page number."""
    if page == 1:
        return 'https://businessday.ng/tag/bdlead/?amp'
    return f"https://businessday.ng/tag/bdlead/page/{page}/"


def _scrape_page_with_retries(driver, scraped_data, page, max_attempts=3):
    """Scrape the loaded listing page, reloading and retrying when it fails.

    Returns True as soon as one attempt succeeds, False once all attempts
    have failed.
    """
    for attempt in range(1, max_attempts + 1):
        if scrape_articles(driver, scraped_data):
            return True
        if attempt == max_attempts:
            break
        print(f"Retrying page {page} (attempt {attempt + 1}/{max_attempts})")
        try:
            driver.get(_listing_url(page))
        except Exception as e:
            # A failed reload just counts as a failed attempt.
            print(f"Reload error: {e}")
        random_sleep(4, 7)
    return False


def main():
    """Run the scraper: listing pages first, then full article content.

    State is restored from the checkpoint file when one exists; otherwise
    old CSV outputs are removed and scraping starts at page 1. Progress is
    checkpointed after every successfully scraped listing page and after
    every article, and whatever has been collected is written to FINAL_CSV
    on exit if that file does not exist yet.
    """
    checkpoint = load_checkpoint()

    # Initialize or resume state before launching the browser, so a failure
    # here cannot leave an orphaned Chrome process behind.
    if checkpoint:
        scraped_data = checkpoint['scraped_data']
        current_page = checkpoint['current_page']
        articles_processed = checkpoint.get('articles_processed', 0)
    else:
        remove_existing_files()
        scraped_data = []
        current_page = 1
        articles_processed = 0

    max_pages = 820
    driver = setup_driver()

    try:
        # Page scraping phase
        if isinstance(current_page, int) and current_page <= max_pages:
            # Always load the page we are about to scrape: a resumed run
            # starts with a fresh browser that has not navigated anywhere.
            driver.get(_listing_url(current_page))
            random_sleep(3, 5)

            while current_page <= max_pages:
                print(f"\nScraping page {current_page}/{max_pages}")
                if _scrape_page_with_retries(driver, scraped_data, current_page):
                    # Save page progress
                    save_checkpoint({
                        'current_page': current_page + 1,
                        'articles_processed': 0,
                        'scraped_data': scraped_data
                    })
                    pd.DataFrame(scraped_data).to_csv(PROGRESS_CSV, index=False)
                else:
                    # The checkpoint is left untouched, so a later restart
                    # resumes from this page.
                    print(f"Skipping page {current_page}: could not be scraped")

                # Nothing left to navigate to after the last page
                if current_page >= max_pages:
                    current_page += 1
                    break

                # Navigate to next page
                try:
                    next_btn = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.next.page-numbers'))
                    )
                    driver.execute_script("arguments[0].click();", next_btn)
                except Exception:
                    # Fallback to direct URL. Not a bare except: Ctrl+C must
                    # reach the KeyboardInterrupt handler below.
                    driver.get(_listing_url(current_page + 1))
                current_page += 1
                random_sleep(4, 7)

        # Content scraping phase. The string check must come first: comparing
        # 'content' with an int raises TypeError.
        if current_page == 'content' or current_page > max_pages:
            print("\nStarting content scraping...")
            scraped_data = scrape_content(driver, scraped_data, articles_processed)
            pd.DataFrame(scraped_data).to_csv(FINAL_CSV, index=False)
            if os.path.exists(CHECKPOINT_FILE):
                os.remove(CHECKPOINT_FILE)

    except KeyboardInterrupt:
        print("\nInterrupted by user. Checkpoint saved.")
    finally:
        driver.quit()
        if scraped_data and not os.path.exists(FINAL_CSV):
            pd.DataFrame(scraped_data).to_csv(FINAL_CSV, index=False)

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Resuming from checkpoint: Page 715

Scraping page 715/820
Page error: Message: 
Stacktrace:
	GetHandleVerifier [0x010E8073+60707]
	GetHandleVerifier [0x010E80B4+60772]
	(No symbol) [0x00F10683]
	(No symbol) [0x00F58660]
	(No symbol) [0x00F589FB]
	(No symbol) [0x00FA1022]
	(No symbol) [0x00F7D094]
	(No symbol) [0x00F9E824]
	(No symbol) [0x00F7CE46]
	(No symbol) [0x00F4C5D3]
	(No symbol) [0x00F4D424]
	GetHandleVerifier [0x0132BB53+2435075]
	GetHandleVerifier [0x013270F3+2416035]
	GetHandleVerifier [0x0134349C+2531660]
	GetHandleVerifier [0x010FF145+155125]
	GetHandleVerifier [0x01105AED+182173]
	GetHandleVerifier [0x010EF948+91640]
	GetHandleVerifier [0x010EFAF0+92064]
	GetHandleVerifier [0x010DA5B0+4704]
	BaseThreadInitThunk [0x76337BA9+25]
	RtlInitializeExceptionChain [0x777FC2EB+107]
	RtlClearBits [0x777FC26F+191]


Scraping page 716/820
Cloudflare detected - waiting...

Scraping page 717/820
Cloudflare detected - waiting...

Scraping page 718/820
Added: The ugly face of Nigeria’s exp