In [1]:
# Scratch cell: presumably a quick check that the kernel executes code — TODO remove if not needed.
print(1)

1


In [26]:
import pandas as pd
import asyncio
import random
import time
from tqdm import tqdm
from playwright.async_api import async_playwright, TimeoutError
from bs4 import BeautifulSoup
import json
import logging
import os
from tqdm.asyncio import tqdm
from typing import List, Dict, Optional


In [14]:
# CSV batch of article metadata; the path is relative to the notebook's working directory.
# Later cells read the `article_id`, `title` and `article_url` columns from it.
file_path = "output/techinasia_ai_news_batch_0_20250116_174236.csv"

df = pd.read_csv(file_path)

In [16]:
# Preview the first few article URLs that the scraper will visit.
df['article_url'].head()


0    https://www.techinasia.com/news/chinese-unicor...
1    https://www.techinasia.com/news/meta-exec-aime...
2    https://www.techinasia.com/news/nvidia-build-a...
3    https://www.techinasia.com/news/nvidia-backed-...
4    https://www.techinasia.com/news/tsmc-q4-profit...
Name: article_url, dtype: object

In [17]:
# a simple function to fetch the content of the article


async def fetch_article_content(url):
    """Load one article page in headless Chromium and return its body text.

    Args:
        url: Address of the article page to open.

    Returns:
        The text of the element matching ``div#content.content`` with
        surrounding whitespace stripped from each text fragment, or ``None``
        when the parsed HTML contains no such element.

    Raises:
        Any error raised by Playwright during navigation or while waiting for
        the selector (both use a 10000 ms timeout) propagates to the caller.
    """
    article_css = "div#content.content"
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page()
        try:
            await tab.goto(url, timeout=10000)
            await tab.wait_for_selector(article_css, timeout=10000)
            html = await tab.content()
            node = BeautifulSoup(html, "html.parser").select_one(article_css)
            if not node:
                return None
            return node.get_text(strip=True)
        finally:
            # Always release the browser, even when navigation fails.
            await chromium.close()


# Smoke-test the fetcher against a single known article URL.
url = "https://www.techinasia.com/news/chinese-unicorn-minimax-unveils-new-ai-models"
# Top-level `await` works here because the notebook already runs an event loop.
content = await fetch_article_content(url)

# Bare expression so the notebook displays the extracted text.
content

'Shanghai-based AI startup MiniMax has released a new family of open-source large language models (LLMs), called MiniMax-01.The launch includes the general-purpose MiniMax-Text-01 model and the multimodal MiniMax-VL-01, which integrates visual capabilities along with text processing.According to benchmark tests shared on MiniMax’s official WeChat account, the foundational language model aligns with global standards in areas such as mathematics, specialized knowledge, instruction-following, and reducing factual errors.'

In [23]:
# For each article URL in the dataframe, fetch the content and store it in a structured JSON format.
# Note that a delay will be introduced between each fetch to prevent being blocked by the website.
# Additionally, user agents will be randomized and shuffled to avoid detection as a bot.

# Pool of browser user-agent strings; one is picked at random for every page
# opened by `fetch_article_content_with_delay`.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1"
]

async def fetch_article_content_with_delay(url):
    """Fetch an article's body text using a randomly chosen user agent.

    Despite the name, this coroutine does not pause; the caller is responsible
    for spacing out requests.

    Args:
        url: Address of the article page to open.

    Returns:
        Stripped text of the ``div#content.content`` element, or ``None`` if
        the parsed page has no such element.

    Raises:
        Playwright errors from navigation or the selector wait (10000 ms
        timeout each) are not caught here.
    """
    body_selector = "div#content.content"
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        chosen_agent = random.choice(user_agents)
        tab = await chromium.new_page(user_agent=chosen_agent)
        try:
            await tab.goto(url, timeout=10000)
            await tab.wait_for_selector(body_selector, timeout=10000)
            markup = await tab.content()
            parsed = BeautifulSoup(markup, "html.parser")
            body = parsed.select_one(body_selector)
            if body:
                return body.get_text(strip=True)
            return None
        finally:
            # The browser is closed on both success and failure paths.
            await chromium.close()

async def fetch_all_articles(df):
    """Fetch every article listed in ``df`` and dump the results to ``articles.json``.

    Args:
        df: DataFrame with ``article_url``, ``article_id`` and ``title`` columns.

    Returns:
        None. Articles whose content could be extracted are written to
        ``articles.json`` in the current working directory as a list of
        ``{"article_id", "title", "url", "content"}`` objects.

    Raises:
        Errors from ``fetch_article_content_with_delay`` propagate; nothing is
        written in that case because the file is only created after the loop.
    """
    data = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        url = row['article_url']
        content = await fetch_article_content_with_delay(url)
        if content:
            data.append({"article_id": row['article_id'], "title": row['title'], "url": url, "content": content})
        # Randomized pause between requests. `asyncio.sleep` suspends only this
        # coroutine; the previous `time.sleep` froze the whole event loop.
        await asyncio.sleep(random.uniform(2, 5))
    with open("articles.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# run the function
# await fetch_all_articles(df)

# RuntimeWarning: coroutine 'fetch_all_articles' was never awaited



    

In [27]:
"""
This cell contains the implementation of functions to fetch article content from a list of URLs in a dataframe.
It includes the following key components:
1. A list of user agents to randomize requests and avoid detection as a bot.
2. An asynchronous function `fetch_article_content_with_delay` that uses Playwright to fetch the content of an article from a given URL with a delay to prevent being blocked.
3. An asynchronous function `fetch_all_articles` that iterates over the dataframe, fetches the content for each article URL, and stores the results in a JSON file. It also introduces a random delay between each fetch.
4. A commented-out call to `fetch_all_articles` to run the function.
5. Adjustable parameters for various ranges and retry attempts.
6. Logging configuration to log errors to a file.
7. An asynchronous function `fetch_article_content_with_randomization` that fetches article content with additional randomization in user agents and viewport sizes, and simulates scrolling behavior to mimic human interaction.
"""



# Adjustable parameters
# Each *_RANGE is an inclusive (min, max) pair that is unpacked into
# random.randint / random.uniform by the functions below.
SCROLL_ITERATIONS_RANGE = (2, 4)   # number of scroll steps per page
SCROLL_DISTANCE_RANGE = (50, 200)  # vertical offset passed to window.scrollBy per step
MOUSE_MOVEMENTS_RANGE = (2, 5)     # number of simulated mouse moves per page
SLEEP_SCROLL_RANGE = (0.3, 0.8)    # seconds to sleep after each scroll step
SLEEP_MOUSE_RANGE = (0.1, 0.3)     # seconds to sleep after each mouse move
URL_DELAY_RANGE = (0.5, 1)         # seconds to sleep between consecutive URLs
RETRY_ATTEMPTS = 2                 # total fetch attempts per URL (including the first)




# Configure logging
# Only ERROR and above are recorded, written to errors.log in the working directory.
logging.basicConfig(filename="errors.log", level=logging.ERROR,
                    format="%(asctime)s - %(levelname)s - %(message)s")

async def fetch_article_content_with_randomization(url):
    """Fetch one article's text while imitating a human visitor.

    A fresh headless Chromium instance is started for the call, using a random
    user agent and viewport. After the article container appears, the page is
    scrolled and the mouse moved a random number of times before the HTML is
    parsed.

    Args:
        url: Address of the article page to open.

    Returns:
        The stripped text of the first element matching one of the content
        selectors, or ``None`` if nothing matched or any error occurred.
        Errors are written to the log instead of being raised.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True) #  Consider passing a config value
        viewport_sizes = [
            {"width": 1920, "height": 1080},
            {"width": 1366, "height": 768},
            {"width": 1280, "height": 720},
            {"width": 1440, "height": 900}
        ]
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
        ]

        try:
            user_agent = random.choice(user_agents)
            # Keep the chosen viewport so mouse targets can be bounded by it.
            viewport = random.choice(viewport_sizes)
            page = await browser.new_page(user_agent=user_agent, viewport=viewport)
            await page.goto(url, timeout=10000)

            await page.wait_for_selector("div#content.content", timeout=10000)

            # Simulate scrolling behavior
            for _ in range(random.randint(*SCROLL_ITERATIONS_RANGE)):
                scroll_position = random.randint(*SCROLL_DISTANCE_RANGE)
                await page.evaluate(f"window.scrollBy(0, {scroll_position})")
                await asyncio.sleep(random.uniform(*SLEEP_SCROLL_RANGE))

            # Simulate mouse movements. Targets stay inside the selected
            # viewport; the previous fixed 1920x1080 bounds could point far
            # outside the smaller viewports.
            max_x = viewport["width"] - 1
            max_y = viewport["height"] - 1
            for _ in range(random.randint(*MOUSE_MOVEMENTS_RANGE)):
                x, y = random.randint(0, max_x), random.randint(0, max_y)
                await page.mouse.move(x, y, steps=random.randint(5, 10))
                await asyncio.sleep(random.uniform(*SLEEP_MOUSE_RANGE))

            await asyncio.sleep(random.uniform(0.5, 1.0)) # Wait for dynamic content to load

            soup = BeautifulSoup(await page.content(), "html.parser")
            content_selectors = [ # add fallback selectors if any
                "div#content.content",
            ]

            # The first selector that matches wins.
            content = None
            for selector in content_selectors:
                content = soup.select_one(selector)
                if content:
                    break

            return content.get_text(strip=True) if content else None

        except TimeoutError as te:
            logging.error(f"Timeout error fetching URL {url}: {te}")
            return None

        except Exception as e:
            logging.error(f"Error fetching URL {url}: {e}")
            return None
        finally:
            # `browser` is always bound once the try block is entered, so it
            # can be closed unconditionally.
            await browser.close()


async def fetch_all_articles(df):
    """Scrape every article in ``df`` and write the results to a timestamped JSON file.

    Each URL is tried up to ``RETRY_ATTEMPTS`` times. Between failed attempts
    the coroutine waits a random 1-3 seconds multiplied by the attempt number
    (a linearly growing back-off). A random pause drawn from
    ``URL_DELAY_RANGE`` follows every URL, including the last one.

    Args:
        df: DataFrame with ``article_url``, ``article_id`` and ``title`` columns.

    Returns:
        None. Successfully fetched articles are saved to
        ``./articles_<YYYYmmdd_HHMMSS>.json``.
    """
    records = []
    progress = tqdm(df.iterrows(), total=df.shape[0])
    for _, article in progress:
        link = article["article_url"]

        for attempt in range(RETRY_ATTEMPTS):
            text = await fetch_article_content_with_randomization(link)
            if text:
                record = {
                    "article_id": article["article_id"],
                    "title": article["title"],
                    "url": link,
                    "content": text,
                }
                records.append(record)
                break  # success: stop retrying this URL

            if attempt >= RETRY_ATTEMPTS - 1:
                # Out of attempts: record the failure and move on.
                logging.error(f"Failed to fetch URL {link} after {RETRY_ATTEMPTS} attempts.")
            else:
                backoff = random.uniform(1, 3) * (attempt + 1)
                await asyncio.sleep(backoff)

        pause = random.uniform(*URL_DELAY_RANGE)
        await asyncio.sleep(pause)  # spacing between URLs

    stamp = time.strftime("%Y%m%d_%H%M%S")
    target = os.path.join(".", f"articles_{stamp}.json")
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=4)



# asyncio.run(fetch_all_articles(df))

 The following cell implements a web scraping system designed to fetch and process content from webpages. 
 
 It uses Playwright for browser automation, simulating human-like behavior such as scrolling and mouse movements to evade detection. 
 
 The Config class defines customizable parameters for user interaction simulation, timeouts, retries, and logging. 
 
 The asynchronous functions manage tasks like content extraction, retry mechanisms for failed requests, and processing a batch of URLs from a DataFrame. 
 
 Finally, the scraped data is saved to a JSON file with error handling to ensure reliability and robustness. This system is suitable for scalable and dynamic content extraction workflows.

In [29]:
# This cell contains the configuration settings for the web scraping system.
# It defines various parameters to simulate human-like behavior, such as scrolling and mouse movements,
# to avoid detection. The configuration includes ranges for scroll iterations, scroll distances,
# mouse movements, and sleep durations to mimic human interaction. Additionally, it sets the delay
# between URL requests, the number of retry attempts for failed requests, and the timeout for page
# navigation and element selection. The viewport sizes and user agents are also specified to emulate
# different devices and browsers.



# --- Configuration ---
class Config:
    """Tunable settings for the scraper.

    Every ``*_RANGE`` attribute is an inclusive ``(min, max)`` pair that the
    helper functions unpack into ``random.randint`` or ``random.uniform``.
    """

    # --- Scrolling Simulation ---
    # The range (min, max) for the number of times to simulate scrolling on a page.
    # This helps mimic human-like behavior and avoid detection.
    SCROLL_ITERATIONS_RANGE = (1, 2)

    # The range (min, max) for the distance (in pixels) to scroll on each scroll iteration.
    # This simulates varying scroll lengths, making the bot less predictable.
    SCROLL_DISTANCE_RANGE = (50, 100)

    # --- Mouse Movement Simulation ---
    # The range (min, max) for the number of times to simulate mouse movements on a page.
    # This adds another layer of human-like interaction to avoid detection.
    MOUSE_MOVEMENTS_RANGE = (1, 2)

    # --- Sleep Durations ---
    # The range (min, max) for the duration (in seconds) to sleep after each scroll action.
    # This simulates the time a human might take to read and scroll.
    SLEEP_SCROLL_RANGE = (0.1, 0.3)

    # The range (min, max) for the duration (in seconds) to sleep after each mouse movement.
    # This simulates the time a human might take to move the mouse.
    SLEEP_MOUSE_RANGE = (0.1, 0.2)

    # --- Request Delays and Retries ---
    # The range (min, max) for the duration (in seconds) to sleep between fetching different URLs.
    # This helps avoid overloading the server and mimics human browsing speed.
    URL_DELAY_RANGE = (0.5, 1)

    # The maximum number of times to try fetching a URL before giving up.
    # This helps handle transient network issues and improves reliability.
    RETRY_ATTEMPTS = 3

    # The timeout (in milliseconds) for page navigation and element selection.
    # This prevents the scraper from getting stuck on slow-loading pages.
    TIMEOUT = 10000

    # --- Browser Fingerprint Pools ---
    # One viewport is chosen at random for every page that is opened.
    VIEWPORT_SIZES = [
        {"width": 1920, "height": 1080},
        {"width": 1366, "height": 768},
        {"width": 1280, "height": 720},
        {"width": 1440, "height": 900},
        {"width": 1280, "height": 1024},
        {"width": 1024, "height": 768},
        {"width": 800, "height": 600},
        {"width": 1600, "height": 900},
        {"width": 1680, "height": 1050},
        {"width": 1920, "height": 1200},
        {"width": 1600, "height": 1200},
    ]

    # One user agent is chosen at random for every page that is opened.
    # Entries are unique so that each one is equally likely to be picked
    # (the two Edge strings were previously listed twice).
    # NOTE(review): the last two entries do not start with the usual
    # "Mozilla/5.0 (...)" prefix — confirm they are intentional.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
        "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18",
        "Safari/537.36 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1",
        "Edge/91.0.864.59 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ]

    # --- Content Extraction ---
    # CSS selectors tried in order when extracting the article body.
    CONTENT_SELECTORS = [ # Add fallback selectors if any, refer to file: individual-article-page.html
        "div#content.content",
    ]

    # --- Output ---
    # File that receives ERROR-level log records.
    LOG_FILE = "errors.log"
    # Directory where the scraped JSON file is written.
    OUTPUT_DIR = "." # Current directory

# --- Logging Setup ---
# Send ERROR-level (and above) records to Config.LOG_FILE with a timestamped format.
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, so this call has no effect if an earlier cell configured logging in the
# same kernel session.
logging.basicConfig(filename=Config.LOG_FILE, level=logging.ERROR,
                    format="%(asctime)s - %(levelname)s - %(message)s")

# --- Helper Functions ---
async def simulate_user_behavior(page, config: Config):
    """Simulates user scrolling and mouse movements.

    Args:
        page: Playwright page to interact with.
        config: Supplies the scroll/mouse iteration counts, scroll distances
            and sleep ranges.

    Returns:
        None. The page is scrolled and the mouse moved as a side effect.
    """
    for _ in range(random.randint(*config.SCROLL_ITERATIONS_RANGE)):
        scroll_position = random.randint(*config.SCROLL_DISTANCE_RANGE)
        await page.evaluate(f"window.scrollBy(0, {scroll_position})")
        await asyncio.sleep(random.uniform(*config.SLEEP_SCROLL_RANGE))

    # Bound mouse targets by the page's real viewport. The previous fixed
    # 1920x1080 bounds could point outside the smaller configured viewports.
    # Fall back to 1920x1080 when the page reports no viewport.
    viewport = page.viewport_size or {"width": 1920, "height": 1080}
    max_x = max(viewport["width"] - 1, 0)
    max_y = max(viewport["height"] - 1, 0)

    for _ in range(random.randint(*config.MOUSE_MOVEMENTS_RANGE)):
        x, y = random.randint(0, max_x), random.randint(0, max_y)
        await page.mouse.move(x, y, steps=random.randint(5, 10))
        await asyncio.sleep(random.uniform(*config.SLEEP_MOUSE_RANGE))

async def extract_content(page, config: Config) -> Optional[str]:
    """Return the article text found by the first matching content selector.

    Args:
        page: Playwright page whose current HTML is parsed.
        config: Supplies ``CONTENT_SELECTORS``, tried in order.

    Returns:
        Stripped text of the first matching element, or ``None`` when no
        selector matches.
    """
    # Short random pause so late-rendering content has a chance to appear.
    settle_seconds = random.uniform(0.5, 1.0)
    await asyncio.sleep(settle_seconds)

    html = await page.content()
    document = BeautifulSoup(html, "html.parser")

    candidates = (document.select_one(css) for css in config.CONTENT_SELECTORS)
    first_match = next((node for node in candidates if node), None)
    if first_match is None:
        return None
    return first_match.get_text(strip=True)

async def fetch_page_content(url: str, config: Config) -> Optional[str]:
    """Makes a single attempt to fetch the article text from a URL.

    A fresh headless Chromium instance is started with a random user agent and
    viewport. Retrying is left to the caller.

    Args:
        url: Address of the article page to open.
        config: Supplies user agents, viewports, timeout and content selectors.

    Returns:
        The extracted text, or ``None`` when nothing matched or any error
        occurred. Errors are logged rather than raised.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page(
                user_agent=random.choice(config.USER_AGENTS),
                viewport=random.choice(config.VIEWPORT_SIZES)
            )
            await page.goto(url, timeout=config.TIMEOUT)
            # Wait until ANY configured selector is present. Waiting only for
            # the first one made fallback selectors unreachable: the wait timed
            # out before extract_content could try them. Assumes the entries
            # are plain CSS selectors, as extract_content uses them with
            # BeautifulSoup's select_one.
            any_content_selector = ", ".join(config.CONTENT_SELECTORS)
            await page.wait_for_selector(any_content_selector, timeout=config.TIMEOUT)
            await simulate_user_behavior(page, config)
            return await extract_content(page, config)
        except TimeoutError as te:
            logging.error(f"Timeout error fetching URL {url}: {te}")
            return None
        except Exception as e:
            logging.error(f"Error fetching URL {url}: {e}")
            return None
        finally:
            # `browser` is always bound once the try block is entered, so it
            # can be closed unconditionally.
            await browser.close()

async def fetch_article_with_retries(url: str, config: Config) -> Optional[str]:
    """Try to fetch an article up to ``config.RETRY_ATTEMPTS`` times.

    After each failed attempt except the last, the coroutine sleeps for a
    random 1-3 seconds multiplied by the attempt number, so the wait grows
    linearly. When the final attempt fails, an error is logged.

    Args:
        url: Address of the article page to fetch.
        config: Supplies ``RETRY_ATTEMPTS`` and is passed to ``fetch_page_content``.

    Returns:
        The article text from the first successful attempt, otherwise ``None``.
    """
    max_attempts = config.RETRY_ATTEMPTS
    for attempt_index in range(max_attempts):
        text = await fetch_page_content(url, config)
        if text:
            return text

        if attempt_index >= max_attempts - 1:
            logging.error(f"Failed to fetch URL {url} after {config.RETRY_ATTEMPTS} attempts.")
            continue

        # Wait longer after each successive failure before trying again.
        wait_seconds = random.uniform(1, 3) * (attempt_index + 1)
        await asyncio.sleep(wait_seconds)
    return None

async def fetch_all_articles(df: pd.DataFrame, config: Config) -> List[Dict]:
    """Scrape the content of every article listed in the DataFrame.

    URLs are processed one at a time with a progress bar. A random pause drawn
    from ``config.URL_DELAY_RANGE`` follows every URL, whether or not it
    succeeded.

    Args:
        df: DataFrame with ``article_url``, ``article_id`` and ``title`` columns.
        config: Scraper settings passed through to the fetch helpers.

    Returns:
        One ``{"article_id", "title", "url", "content"}`` dict per article
        whose content was fetched; failed articles are omitted.
    """
    records: List[Dict] = []
    progress = tqdm(df.iterrows(), total=df.shape[0], desc="Fetching Articles")
    for _, article in progress:
        link = article["article_url"]
        body = await fetch_article_with_retries(link, config)
        if body:
            record = dict(
                article_id=article["article_id"],
                title=article["title"],
                url=link,
                content=body,
            )
            records.append(record)
        pause = random.uniform(*config.URL_DELAY_RANGE)
        await asyncio.sleep(pause)
    return records

def save_to_json(data: List[Dict], config: Config):
    """Write the scraped records to ``articles_<timestamp>.json`` in ``config.OUTPUT_DIR``.

    Failures are not raised: each one is logged and reported on stdout.

    Args:
        data: Records to serialize as a JSON array.
        config: Supplies ``OUTPUT_DIR``.

    Returns:
        None.
    """
    stamp = time.strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(config.OUTPUT_DIR, f"articles_{stamp}.json")

    def _report(log_message: str, user_message: str) -> None:
        # Record the failure in the log and show it to the notebook user.
        logging.error(log_message)
        print(user_message)

    try:
        with open(filepath, "w", encoding="utf-8") as handle:
            json.dump(data, handle, ensure_ascii=False, indent=4)
        print(f"Data successfully saved to: {filepath}")
    except FileNotFoundError:
        _report(
            f"Error: Output directory not found: {config.OUTPUT_DIR}",
            f"Error: Could not save data. Output directory not found: {config.OUTPUT_DIR}",
        )
    except PermissionError:
        _report(
            f"Error: Permission denied to write to: {filepath}",
            f"Error: Could not save data. Permission denied to write to: {filepath}",
        )
    except Exception as e:
        _report(
            f"Error saving data to JSON file: {e}",
            f"Error: Could not save data. An unexpected error occurred: {e}",
        )

# --- Main Program ---
# Instance used only as a namespace for the class-level settings.
config = Config()

# load the dataframe
file_path = "output/techinasia_ai_news_batch_0_20250116_174236.csv"
df = pd.read_csv(file_path)

# run the function
# nest_asyncio patches the notebook's already-running event loop so that
# run_until_complete can be called from inside it.
import nest_asyncio
nest_asyncio.apply()
loop = asyncio.get_event_loop()
scraped_data = loop.run_until_complete(fetch_all_articles(df, config))

# save the data to a json file
save_to_json(scraped_data, config)

# NOTE(review): save_to_json swallows its own errors and returns nothing, so
# this message is printed even when the file could not be written.
print("Scraping complete. Data saved to JSON.")



  def _replace_cdata_list_attribute_values(self, tag_name, attrs):
Fetching Articles: 100%|██████████| 50/50 [05:37<00:00,  6.75s/it]

Data successfully saved to: ./articles_20250116_205828.json
Scraping complete. Data saved to JSON.



