In [1]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import pandas as pd
import json
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse
import time
from typing import Dict, List, Optional
import csv

# Patch asyncio so the event loop that the notebook kernel already runs can be
# re-entered by the async scraping code below.
nest_asyncio.apply()

# Root of the help center; the discovery cell loads f'{BASE_URL}/hc/en-us'.
BASE_URL = 'https://help.moengage.com'
RATE_LIMIT_DELAY = 1  # seconds slept between extraction batches
# Copied into ArticleExtractor.max_retries when an extractor is created.
MAX_RETRIES = 3

print("setup complete")

setup complete


In [8]:
async def discover_all_documentation_links():
    """Collect the article links exposed on the help-center landing page.

    Opens ``{BASE_URL}/hc/en-us`` in headless Chromium, reads every
    ``<a href>`` on the page and keeps the ones that point at an article on
    the help, developers or partners site.

    Returns:
        list[dict]: one ``{'url', 'title', 'source'}`` record per unique
        article URL, in page order. ``title`` is the untruncated link text and
        ``source`` is the sub-domain the URL belongs to (``'help'``,
        ``'developers'`` or ``'partners'``). An empty list is returned if
        anything fails during discovery (the error is printed, not raised).
    """
    # One pattern covers all three documentation sites. The captured
    # sub-domain is reused as the `source` label, so the label always comes
    # from the hostname and never from text that merely appears in the slug or
    # query string of the URL.
    article_pattern = re.compile(
        r'https://(help|developers|partners)\.moengage\.com/hc/en-us/articles/\d+-'
    )

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
                "--no-sandbox"
            ]
        )
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        page = await context.new_page()

        try:
            print("Starting link discovery...")
            await page.goto(f'{BASE_URL}/hc/en-us', wait_until='networkidle')

            # grab all <a> tags with hrefs + their text and title
            links = await page.evaluate('''
                () => {
                    return Array.from(document.querySelectorAll('a[href]')).map(link => ({
                        href: link.href,
                        text: link.textContent.trim(),
                        title: link.title || ''
                    }));
                }
            ''')

            filtered_links = []
            seen_urls = set()

            for link in links:
                href = link['href']
                matched = article_pattern.match(href)
                # skip non-article links and URLs that were already recorded
                if matched is None or href in seen_urls:
                    continue
                filtered_links.append({
                    'url': href,
                    'title': link['text'],  # no truncation here
                    'source': matched.group(1)
                })
                seen_urls.add(href)

            print(f"Found {len(filtered_links)} unique docs")
            return filtered_links

        except Exception as e:
            # best-effort: report the problem and let the caller see "no links"
            print(f"Oops, something blew up during link discovery: {str(e)}")
            return []
        finally:
            await browser.close()

# run the discovery and dump results into a CSV
discovered_links = await discover_all_documentation_links()
df_links = pd.DataFrame(discovered_links)
df_links.to_csv('final.csv', index=False)
print(f"Wrote {len(discovered_links)} links to final.csv")
df_links.head()

Starting link discovery...
Found 898 unique docs
Wrote 898 links to final.csv


Unnamed: 0,url,title,source
0,https://help.moengage.com/hc/en-us/articles/19...,Contact Support,help
1,https://help.moengage.com/hc/en-us/articles/36...,Terms to Know,help
2,https://help.moengage.com/hc/en-us/articles/11...,Feature or Product in Beta,help
3,https://help.moengage.com/hc/en-us/articles/33...,Sign Up with MoEngage or Create a New Account ...,help
4,https://help.moengage.com/hc/en-us/articles/36...,Set Up 2-Step Verification on Your Device,help


In [15]:
class ArticleExtractor:
    """Article extractor with structured content parsing.

    Every call to ``extract_article_content`` launches its own headless
    Chromium, loads a single article and returns a plain dict describing it.
    A failed load is retried until ``max_retries`` attempts have been made.

    Attributes:
        retry_count: running total of retries performed by this instance.
        max_retries: maximum number of attempts per URL (from ``MAX_RETRIES``);
            values below 1 are treated as 1.
        retry_delay: base pause in seconds between attempts (from
            ``RATE_LIMIT_DELAY``); it is multiplied by the attempt number.
    """

    # JavaScript run inside the loaded page. It returns an object with
    # title, sections, fullText, htmlContent, url, wordCount, lastModified and
    # breadcrumbs (the body-derived keys are absent when no body element is
    # found).
    _EXTRACT_SCRIPT = '''
        () => {
            const article = {};

            // grab title with fallback selectors (first match wins)
            const titleSelectors = [
                'h6.article-title',
                'h1.article-title',
                '.article-title',
                'h1',
                '.page-title'
            ];
            for (const selector of titleSelectors) {
                const el = document.querySelector(selector);
                if (el) {
                    article.title = el.textContent.trim();
                    break;
                }
            }

            // get main body element
            const bodyEl = document.querySelector('div.article__body') ||
                           document.querySelector('.article-body') ||
                           document.querySelector('.content');

            if (bodyEl) {
                const HEADINGS = 'h1, h2, h3, h4, h5, h6';
                const sections = [];
                let currentSection = { heading: 'Introduction', content: '', images: [] };

                // keep a section only when it collected some text
                const flush = () => {
                    if (currentSection.content.trim()) {
                        sections.push(currentSection);
                    }
                };

                // Walk the body in document order. A non-heading element that
                // contains headings is a wrapper, so descend into it instead
                // of swallowing its headings as plain text.
                const walk = (parent) => {
                    Array.from(parent.children).forEach(child => {
                        const tag = child.tagName.toLowerCase();

                        if (/^h[1-6]$/.test(tag)) {
                            flush();
                            currentSection = {
                                heading: child.textContent.trim(),
                                content: '',
                                images: [],
                                level: parseInt(tag.charAt(1))
                            };
                        } else if (child.querySelector(HEADINGS)) {
                            walk(child);
                        } else {
                            const text = child.textContent.trim();
                            if (text) {
                                currentSection.content += text + '\\n\\n';
                            }
                            child.querySelectorAll('img').forEach(img => {
                                if (img.src) {
                                    currentSection.images.push({
                                        src: img.src,
                                        alt: img.alt || '',
                                        title: img.title || ''
                                    });
                                }
                            });
                        }
                    });
                };

                walk(bodyEl);
                flush();

                article.sections = sections;
                article.fullText = bodyEl.textContent.trim();
                article.htmlContent = bodyEl.innerHTML;
            }

            article.url = window.location.href;
            article.wordCount = article.fullText ? article.fullText.split(/\\s+/).length : 0;
            article.lastModified = document.querySelector('time')?.getAttribute('datetime') || '';

            const breadcrumbs = Array.from(document.querySelectorAll('.breadcrumbs a, nav a'))
                                    .map(a => a.textContent.trim())
                                    .filter(Boolean);
            article.breadcrumbs = breadcrumbs;

            return article;
        }
    '''

    def __init__(self):
        self.retry_count = 0
        self.max_retries = MAX_RETRIES
        self.retry_delay = RATE_LIMIT_DELAY

    async def _extract_once(self, url: str) -> dict:
        """Load ``url`` in a fresh browser and return the scraped dict; may raise."""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # context/page creation sits inside the try so the browser is
                # closed even when one of these steps fails
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )
                page = await context.new_page()
                await page.goto(url, wait_until='networkidle', timeout=30000)
                return await page.evaluate(self._EXTRACT_SCRIPT)
            finally:
                await browser.close()

    async def extract_article_content(self, url: str) -> dict:
        """Extract one article, retrying failed attempts.

        Args:
            url: address of the article page to load.

        Returns:
            dict: on success, the scraped fields plus ``extractedAt`` (ISO
            timestamp) and ``success=True``. After the last failed attempt,
            ``{'url', 'error', 'success': False, 'extractedAt'}``. Errors are
            reported through the returned dict and printed, not raised.
        """
        attempts = max(1, self.max_retries)
        last_error = None

        print(f"Extracting: {url}")
        for attempt in range(1, attempts + 1):
            try:
                content = await self._extract_once(url)
            except Exception as e:
                last_error = e
                if attempt < attempts:
                    self.retry_count += 1
                    print(f"Attempt {attempt}/{attempts} failed for {url}: {str(e)} - retrying")
                    # linear back-off: wait longer after each failed attempt
                    await asyncio.sleep(self.retry_delay * attempt)
                continue

            content['extractedAt'] = datetime.now().isoformat()
            content['success'] = True
            return content

        print(f"Error extracting {url}: {str(last_error)}")
        return {
            'url': url,
            'error': str(last_error),
            'success': False,
            'extractedAt': datetime.now().isoformat()
        }

# Shared module-level instance; process_articles_batch calls
# extractor.extract_article_content for every URL it handles.
extractor = ArticleExtractor()

In [16]:
async def process_articles_batch(urls: list, batch_size: int = 5) -> list:
    """Run the shared extractor over ``urls`` in concurrent batches.

    The URLs are cut into consecutive slices of ``batch_size``. Each slice is
    extracted concurrently, and the function sleeps ``RATE_LIMIT_DELAY``
    seconds between slices (not after the last one).

    Args:
        urls: article URLs to extract.
        batch_size: how many URLs are extracted at the same time.

    Returns:
        list: one result dict per URL, in input order. An extraction that
        raised is recorded as ``{'url', 'error', 'success': False}``.
    """
    all_results = []
    total_batches = (len(urls) + batch_size - 1) // batch_size

    print(f"Processing {len(urls)} articles in {total_batches} batches of {batch_size}")

    for batch_num, i in enumerate(range(0, len(urls), batch_size), start=1):
        batch = urls[i:i + batch_size]

        print(f"\nBatch {batch_num}/{total_batches} — processing {len(batch)} articles...")

        # fire every extraction of this slice at once; exceptions come back as values
        batch_results = await asyncio.gather(
            *map(extractor.extract_article_content, batch),
            return_exceptions=True
        )

        for j, result in enumerate(batch_results):
            if isinstance(result, Exception):
                # turn a raised exception into a failure record
                print(f"Failed: {batch[j]} — {str(result)}")
                result = {
                    'url': batch[j],
                    'error': str(result),
                    'success': False
                }
            elif result.get('success', False):
                print(f"Success: {result.get('title', 'Untitled')[:50]}...")
            else:
                print(f"Partial: {batch[j]}")
            all_results.append(result)

        # pause only when another slice is still waiting
        more_pending = i + batch_size < len(urls)
        if more_pending:
            print(f"Sleeping {RATE_LIMIT_DELAY}s before next batch...")
            await asyncio.sleep(RATE_LIMIT_DELAY)

    success_count = sum(bool(r.get('success', False)) for r in all_results)
    print(f"\nDone! {success_count}/{len(urls)} articles successfully extracted")

    return all_results

In [12]:
# grab first 3 URLs for a test run
sample_urls = [url['url'] for url in discovered_links[:3]]

print("Running test on sample articles...")
sample_results = await process_articles_batch(sample_urls, batch_size=3)

# show some sample outputs for quick check
for result in sample_results[:3]:
    if result.get('success'):
        print(f"\nTitle: {result.get('title', 'N/A')}")
        print(f"Word count: {result.get('wordCount', 0)}")
        print(f"Sections found: {len(result.get('sections', []))}")
        if result.get('sections'):
            section_titles = [s['heading'] for s in result['sections'][:3]]
            print("Sample section headings:", section_titles)

Running test on sample articles...
Processing 3 articles in 1 batches of 3

Batch 1/1 — processing 3 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/115005943283-Feature-or-Product-in-Beta
Extracting: https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard
Extracting: https://help.moengage.com/hc/en-us/articles/360040071212-Terms-to-Know
Success: Raise a Support Ticket Through MoEngage Dashboard...
Success: Terms to Know...
Success: Feature or Product in Beta...

Done! 3/3 articles successfully extracted

Title: Raise a Support Ticket Through MoEngage Dashboard
Word count: 388
Sections found: 1
Sample section headings: ['Introduction']

Title: Terms to Know
Word count: 604
Sections found: 1
Sample section headings: ['Introduction']

Title: Feature or Product in Beta
Word count: 251
Sections found: 1
Sample section headings: ['Introduction']


In [19]:
# running this on all the articles

all_urls = [url['url'] for url in discovered_links[:898]]

print("all the urls...")
all_results = await process_articles_batch(all_urls, batch_size=2)

for result in all_results[:3]:
    if result.get('success'):
        print(f"\nTitle: {result.get('title', 'N/A')}")
        print(f"Word count: {result.get('wordCount', 0)}")
        print(f"Sections found: {len(result.get('sections', []))}")
        if result.get('sections'):
            section_titles = [s['heading'] for s in result['sections'][:3]]
            print("section headings:", section_titles)

all the urls...
Processing 898 articles in 449 batches of 2

Batch 1/449 — processing 2 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/360040071212-Terms-to-Know
Extracting: https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard
Success: Raise a Support Ticket Through MoEngage Dashboard...
Success: Terms to Know...
Sleeping 1s before next batch...

Batch 2/449 — processing 2 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/33436161901332-Sign-Up-with-MoEngage-or-Create-a-New-Account-in-MoEngage
Extracting: https://help.moengage.com/hc/en-us/articles/115005943283-Feature-or-Product-in-Beta
Success: Feature or Product in Beta...
Success: Sign Up with MoEngage or Create a New Account in M...
Sleeping 1s before next batch...

Batch 3/449 — processing 2 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/36908840992276-Set-Up-2-Step-Verification-on-Your-Device
Extracting: https://he

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - navigating to "https://help.moengage.com/hc/en-us/articles/115005943283-Feature-or-Product-in-Beta", waiting until "networkidle"\n')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Call log:
  - navigating to "https://help.moengage.com/hc/en-us/articles/115005943283-Feature-or-Product-in-Beta", waiting until "networkidle"

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - navigating to "https://help.moengage.com/hc/en-us/articles/19708702327572-Raise-a-Support-Ticket-Through-MoEngage-Dashboard", waiting until "networkidle"\n')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Call log:
  - navigating to "https://help.moengage.com/hc/en-us/artic

Success: Manage Subscription...
Success: How To Resolve reCAPTCHA Issues in the Browser Whe...
Sleeping 1s before next batch...

Batch 29/449 — processing 2 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/31224595537940-Why-Does-Switching-Between-LIVE-and-TEST-Environments-Fail
Extracting: https://help.moengage.com/hc/en-us/articles/31223472156052-Why-Is-the-User-Unable-to-Log-in-to-the-Dashboard
Success: Why Does Switching Between LIVE and TEST Environme...
Success: Why Is the User Unable to Log in to the Dashboard?...
Sleeping 1s before next batch...

Batch 30/449 — processing 2 articles...
Extracting: https://help.moengage.com/hc/en-us/articles/22679333212564-Suspicious-Login-Verification
Extracting: https://help.moengage.com/hc/en-us/articles/29140773691156-Why-Is-the-OTP-Not-Received-on-Email-for-2FA
Success: Why Is the OTP Not Received on Email for 2FA?...
Success: Suspicious Login Verification...
Sleeping 1s before next batch...

Batch 31/449 — processing 2 a

In [20]:
def save_extracted_content(results: list, filename: str = 'extracted_articles.json'):
    """Write extraction results to a JSON dump and a summary CSV.

    Args:
        results: result dicts as returned by ``process_articles_batch``; an
            entry counts as successful when its ``success`` value is truthy.
        filename: path of the JSON dump. The CSV is written next to it under
            the same name with the extension replaced by ``_summary.csv``.

    Returns:
        dict: the object written to the JSON file, i.e.
        ``{'extraction_summary': {...}, 'articles': results}``.
    """
    import os  # function-scope import: only needed for the CSV path below

    # filter out only successful runs
    successful = [r for r in results if r.get('success', False)]
    total = len(results)

    summary = {
        'extraction_summary': {
            'total_articles': total,
            'successful_extractions': len(successful),
            'failed_extractions': total - len(successful),
            'average_word_count': (
                sum(r.get('wordCount', 0) for r in successful) / len(successful)
                if successful else 0
            ),
            'extraction_timestamp': datetime.now().isoformat()
        },
        'articles': results
    }

    # write full detailed JSON dump
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    # prep a light CSV summary for quick checks
    csv_columns = ['url', 'title', 'word_count', 'section_count', 'has_images', 'breadcrumbs']
    csv_data = [
        {
            'url': article.get('url', ''),
            'title': article.get('title', ''),
            'word_count': article.get('wordCount', 0),
            'section_count': len(article.get('sections', [])),
            'has_images': any(len(s.get('images', [])) > 0 for s in article.get('sections', [])),
            'breadcrumbs': ' > '.join(article.get('breadcrumbs', []))
        }
        for article in successful
    ]

    # explicit columns keep the header row even when nothing succeeded
    df_articles = pd.DataFrame(csv_data, columns=csv_columns)
    # Derive the CSV name from the extension only: a filename without ".json"
    # must not make the CSV overwrite the JSON dump, and ".json" inside a
    # directory name must be left alone.
    csv_filename = os.path.splitext(filename)[0] + '_summary.csv'
    df_articles.to_csv(csv_filename, index=False)

    # an empty result list reports 0.0% instead of dividing by zero
    success_rate = (len(successful) / total * 100) if total else 0.0

    print(f"Saved detailed JSON to {filename}")
    print(f"Saved summary CSV to {csv_filename}")
    print(f"Success rate: {len(successful)}/{total} ({success_rate:.1f}%)")

    return summary

# save the full-run results (all_results) as a JSON dump plus a summary CSV
all_summary = save_extracted_content(all_results, 'extracted_articles.json')


Saved detailed JSON to extracted_articles.json
Saved summary CSV to extracted_articles_summary.csv
Success rate: 864/898 (96.2%)


In [21]:
# okay we missed some here, lets fix them

import pandas as pd
import json
from datetime import datetime

df_original = pd.read_csv('final.csv')
df_extracted = pd.read_csv('extracted_articles_summary.csv')

original_urls = set(df_original['url'].tolist())
extracted_urls = set(df_extracted['url'].tolist())

missed_urls = list(original_urls - extracted_urls)

print(f"Total original: {len(original_urls)}")
print(f"Extracted: {len(extracted_urls)}")
print(f"Missed: {len(missed_urls)}")

if missed_urls:
    print("\nFirst few missed URLs:")
    for i, url in enumerate(missed_urls[:10]):
        print(f"{i + 1}. {url}")

    print(f"\nProcessing {len(missed_urls)} missed articles...")
    missed_results = await process_articles_batch(missed_urls, batch_size=2)

    with open('extracted_articles.json', 'r', encoding='utf-8') as f:
        existing_data = json.load(f)

    all_combined = existing_data['articles'] + missed_results
    successful = [r for r in all_combined if r.get('success', False)]

    summary = {
        'extraction_summary': {
            'total_articles': len(all_combined),
            'successful': len(successful),
            'failed': len(all_combined) - len(successful),
            'avg_word_count': (
                sum(r.get('wordCount', 0) for r in successful) / len(successful)
                if successful else 0
            ),
            'timestamp': datetime.now().isoformat(),
            'retry_done': True
        },
        'articles': all_combined
    }

    with open('extracted_articles_complete.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    csv_data = []
    for article in successful:
        csv_data.append({
            'url': article.get('url', ''),
            'title': article.get('title', ''),
            'word_count': article.get('wordCount', 0),
            'section_count': len(article.get('sections', [])),
            'has_images': any(len(s.get('images', [])) > 0 for s in article.get('sections', [])),
            'breadcrumbs': ' > '.join(article.get('breadcrumbs', []))
        })

    df_complete = pd.DataFrame(csv_data)
    df_complete.to_csv('extracted_articles_complete_summary.csv', index=False)

    print(f"\nRetry done. Success: {len(successful)}/{len(all_combined)}")
    print(f"Improved by: {len(successful) - len(df_extracted)}")

    still_missing = list(original_urls - set(df_complete['url'].tolist()))
    if still_missing:
        print(f"Still missing {len(still_missing)} URLs.")
    else:
        print("All done. No missing URLs left.")

else:
    print("Nothing missed.")

Total original: 898
Extracted: 864
Missed: 34

First few missed URLs:
1. https://help.moengage.com/hc/en-us/articles/34174988942740-Overview
2. https://help.moengage.com/hc/en-us/articles/34174962989204-Overview
3. https://help.moengage.com/hc/en-us/articles/206503455-Segment-Users-to-Notify-App-Updates
4. https://help.moengage.com/hc/en-us/articles/18129193565204-Compare-Mode-in-Analytics
5. https://help.moengage.com/hc/en-us/articles/29145844762900-Display-a-Non-Intrusive-Countdown-Timer-Banner
6. https://help.moengage.com/hc/en-us/articles/24865660345876-How-to-Use-Recommendations-to-Announce-Price-Drop
7. https://help.moengage.com/hc/en-us/articles/11772245742100-Add-a-Countdown-Timer-to-Emails
8. https://help.moengage.com/hc/en-us/articles/33132363356564-View-Personalize-in-action
9. https://help.moengage.com/hc/en-us/articles/25637711880724-How-to-Reduce-Bounce-Rates-on-Websites
10. https://help.moengage.com/hc/en-us/articles/35724347745812-Overview

Processing 34 missed articles