In [1]:
import requests
from bs4 import BeautifulSoup
import time

BASE_URL = 'https://help.moengage.com'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MoengageScraper/1.0; +https://yourdomain.com)'
}

print("okay done, lets begin scraping")

okay done, lets begin scraping


In [16]:
import nest_asyncio
import asyncio
import csv
from playwright.async_api import async_playwright

nest_asyncio.apply()

BASE_URL = 'https://help.moengage.com'

async def debug_all_links():
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--start-maximized",
                "--disable-blink-features=AutomationControlled"
            ]
        )
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
        )
        page = await context.new_page()
        await page.goto(f'{BASE_URL}/hc/en-us')
        await page.wait_for_load_state('networkidle')

        links = await page.query_selector_all('a')

        print(f"Total <a> tags found: {len(links)}")

        all_hrefs = []
        for link in links:
            href = await link.get_attribute('href')
            if href:
                all_hrefs.append(href)
                print(href)

        await browser.close()
    
    return all_hrefs

all_hrefs = asyncio.get_event_loop().run_until_complete(debug_all_links())

csv_file = 'extracted_links.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['URL'])  # header
    for href in all_hrefs:
        writer.writerow([href])

print(f"Saved {len(all_hrefs)} links to {csv_file}")


Total <a> tags found: 945
http://moengage.com/
https://rebrand.ly/moe-use-cases
/hc/en-us/signin?return_to=https%3A%2F%2Fhelp.moengage.com%2Fhc%2Fen-us&locale=en-us
https://help.moengage.com/hc/en-us
https://developers.moengage.com/hc/en-us
https://partners.moengage.com/hc/en-us
https://help.moengage.com/hc/en-us/p/use-cases
https://www.moengage.com/resources/
https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard
/hc/change_language/ja?return_to=%2Fhc%2Fja
https://help.moengage.com/hc/en-us
https://developers.moengage.com/hc/en-us
https://partners.moengage.com/hc/en-us
https://help.moengage.com/hc/en-us/p/use-cases
https://www.moengage.com/resources/
https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard
https://help.moengage.com/hc/en-us/articles/360040071212-Terms-to-Know
https://help.moengage.com/hc/en-us/articles/115005943283-Feature-or-Product-in-Beta
https://help.moengage.

In [15]:
# filtering only the useful links

import csv

input_file = 'extracted_links.csv'
output_file = 'filtered_article_links.csv'

allowed_prefixes = [
    'https://help.moengage.com/hc/en-us/articles/',
    'https://developers.moengage.com/hc/en-us/articles/',
    'https://partners.moengage.com/hc/en-us/articles/'
]

filtered_links = []

with open(input_file, mode='r', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        url = row['URL']
        if any(url.startswith(prefix) for prefix in allowed_prefixes):
            filtered_links.append(url)

with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['URL'])  # header
    for link in filtered_links:
        writer.writerow([link])

print(f"Saved {len(filtered_links)} filtered article links to {output_file}")

Saved 900 filtered article links to filtered_article_links.csv


We’re scraping the MoEngage Help Center main page.
The useful content is all in the links under /articles/, which we’ve successfully captured.
So we don’t need to crawl every section — we already have the key articles we need.

In [None]:
# lets try scraping just one link for now and see how its working

In [19]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright

nest_asyncio.apply()

test_url = 'https://help.moengage.com/hc/en-us/articles/32664799340564-Artificial-Intelligence-Resolution-Agent-AIRA'

async def scrape_single_article(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
        )
        page = await context.new_page()

        print(f"Scraping: {url}")
        await page.goto(url)
        await page.wait_for_load_state('networkidle')

        title_el = await page.query_selector('h6.article-title')
        title = await title_el.inner_text() if title_el else '[No Title Found]'
        print("\nTitle:")
        print(title)

        body_el = await page.query_selector('div.article__body')
        body_text = await body_el.inner_text() if body_el else '[No Body Found]'
        print("\nBody Text (first 500 chars):")
        print(body_text[:500])

        await browser.close()

asyncio.get_event_loop().run_until_complete(scrape_single_article(test_url))

Scraping: https://help.moengage.com/hc/en-us/articles/32664799340564-Artificial-Intelligence-Resolution-Agent-AIRA

Title:
Artificial Intelligence Resolution Agent (AIRA)

Body Text (first 500 chars):
This section provides step-by-step instructions for using the Artificial Intelligence Resolution Agent (AIRA) from the MoEngage dashboard.

What is AIRA?

AIRA is MoEngage's generative AI-enabled support bot. It addresses your queries by referencing an extensive knowledge base and utilizing custom workflows. It ensures prompt and accurate responses and can escalate complex issues by creating support tickets. AIRA is available 24/7.

AIRA Capabilities
 	

Early Access

This is an Early Access fea


In [None]:
# with image links in the articles

import asyncio
import nest_asyncio
from playwright.async_api import async_playwright

nest_asyncio.apply()

test_url = 'https://help.moengage.com/hc/en-us/articles/32664799340564-Artificial-Intelligence-Resolution-Agent-AIRA'

async def scrape_single_article_with_images(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
        )
        page = await context.new_page()

        print(f"Scraping: {url}")
        await page.goto(url)
        await page.wait_for_load_state('networkidle')

        title_el = await page.query_selector('h6.article-title')
        title = await title_el.inner_text() if title_el else '[No Title Found]'
        print("\nTitle:")
        print(title)

        body_el = await page.query_selector('div.article__body')
        if body_el:
            body_text = await body_el.inner_text()

            img_elements = await body_el.query_selector_all('img')
            img_urls = []
            for img in img_elements:
                src = await img.get_attribute('src')
                if src:
                    img_urls.append(src)
                    body_text += f'\n[Image: {src}]'
        else:
            body_text = '[No Body Found]'

        print("\nBody Text (first 500 chars + image mentions):")
        print(body_text)

        await browser.close()

asyncio.get_event_loop().run_until_complete(scrape_single_article_with_images(test_url))

In [27]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright

nest_asyncio.apply()

test_url = 'https://help.moengage.com/hc/en-us/articles/32664799340564-Artificial-Intelligence-Resolution-Agent-AIRA'

async def scrape_article_with_sections(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
        )
        page = await context.new_page()

        print(f"Scraping: {url}")
        await page.goto(url)
        await page.wait_for_load_state('networkidle')

        title_el = await page.query_selector('h6.article-title')
        title = await title_el.inner_text() if title_el else '[No Title Found]'
        print(f"\nTitle:\n{title}")

        body_el = await page.query_selector('div.article__body')
        if not body_el:
            print("[No Body Found]")
            await browser.close()
            return

        # full block children
        body_children = await body_el.query_selector_all(':scope > *')

        current_section = "Intro (before first H2)"
        section_content = ""

        for child in body_children:
            tag = await child.evaluate("el => el.tagName.toLowerCase()")

            if tag == 'h2':
                if section_content.strip():
                    print(f"\nSection: {current_section}\n{section_content.strip()}")
                    section_content = ""
                current_section = await child.inner_text()

            else:
                block_text = await child.inner_text()
                if block_text.strip():
                    section_content += block_text + "\n"

                img_elements = await child.query_selector_all('img')
                for img in img_elements:
                    src = await img.get_attribute('src')
                    if src:
                        section_content += f"[Image: {src}]\n"

        if section_content.strip():
            print(f"\nSection: {current_section}\n{section_content.strip()}")

        await browser.close()
asyncio.get_event_loop().run_until_complete(scrape_article_with_sections(test_url))


Scraping: https://help.moengage.com/hc/en-us/articles/32664799340564-Artificial-Intelligence-Resolution-Agent-AIRA

Title:
Artificial Intelligence Resolution Agent (AIRA)

Section: Intro (before first H2)
This section provides step-by-step instructions for using the Artificial Intelligence Resolution Agent (AIRA) from the MoEngage dashboard.
What is AIRA?
AIRA is MoEngage's generative AI-enabled support bot. It addresses your queries by referencing an extensive knowledge base and utilizing custom workflows. It ensures prompt and accurate responses and can escalate complex issues by creating support tickets. AIRA is available 24/7.
AIRA Capabilities
 	

Early Access

This is an Early Access feature. To enable it for your account, contact your CSM or raise a support ticket.
The following are the current capabilities of AIRA:
Respond to troubleshooting queries regarding Push notifications.
Address 'how-to' queries from the knowledge base. 
info	

Information

The file upload feature is cu