*Ensure all libraries required for scraping (`requests` and `beautifulsoup4`) are installed before running this notebook.*

In [79]:
import os
import re
import json
import requests
from bs4 import BeautifulSoup

In [80]:
# Base URL of The Batch; weekly issue pages are addressed as `<base>issue-<n>/`.
THE_BATCH_URL = 'https://www.deeplearning.ai/the-batch/'

### Option 1: Scrape using urls from sitemap

In [81]:
# SITEMAP_URL_0 = 'https://www.deeplearning.ai/sitemap-0.xml'
# SITEMAP_URL_1 = 'https://www.deeplearning.ai/sitemap-1.xml'

# def get_sitemap_urls(sitemap_url):
#     response = requests.get(sitemap_url)
#     soup = BeautifulSoup(response.content, 'xml')
#     urls = [url.text for url in soup.find_all('loc') if url.text.startswith(THE_BATCH_URL)]
#     return urls

### Option 2: Scrape only `the-batch/issue-x/`

I noticed that every article tag is already included in the weekly issues, so there's probably no need to scrape each article individually.


For now, I'll use only the articles from **weekly issues**.

In [82]:
# Directory that receives the scraped articles as JSON files.
output_dir = 'articles/'
# Directory that receives the images downloaded from the articles.
image_output_dir = 'images/'

In [83]:
def download_image(url, image_name, images_dir, timeout=30):
    """
    Download a single image and store it inside `images_dir`.

    Args:
        url: Address of the image to fetch.
        image_name: File name (including extension) to save the image under.
        images_dir: Directory the image is written to; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or None when the request fails, the server
        does not answer with HTTP 200, or the file cannot be written.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole scrape.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download {url}: HTTP {response.status_code}")
            return None

        # Without the directory every open() below fails and would be
        # reported as a download failure.
        if images_dir:
            os.makedirs(images_dir, exist_ok=True)

        image_path = os.path.join(images_dir, image_name)
        with open(image_path, 'wb') as f:
            f.write(response.content)
        return image_path
    except Exception as e:
        # Best effort: a single broken image must not abort the scrape.
        print(f"Failed to download {url}: {e}")
    return None

In [84]:
# TODO: Research whether this kind of cleaning is needed and if so, how to do it properly
def clean_text(text: str):
    """
    Normalise whitespace and drop the letter's greeting/signature fragments.

    Runs of spaces/tabs become a single space, runs of newlines become a single
    newline, and afterwards a fixed set of boilerplate snippets is removed.
    Returns the cleaned string.
    """
    single_spaced = re.sub(r'[ \t]+', ' ', text)
    collapsed = re.sub(r'\n+', '\n', single_spaced)

    # Removal order matters: each fragment is stripped from the result of the previous one.
    boilerplate = ('\n Andrew\n', 'Dear friends,\n ', 'Andrew \n \n ')
    for fragment in boilerplate:
        collapsed = collapsed.replace(fragment, '')

    return collapsed

In [85]:
def remove_paragraph_after_deeplearningai(soup):
    """
    Strip the paragraph that follows the "A message from deeplearning.ai" header.

    That paragraph is an ad for the deeplearning.ai courses. The function looks
    up the <h2> with id 'a-message-from-deeplearningai' and, when it is present,
    removes the next <p> sibling after it. The same `soup` object is returned.

    P.S. possibly not the best way to do this, but it works for now.
    """
    ad_heading = soup.find('h2', id='a-message-from-deeplearningai')
    if not ad_heading:
        return soup

    ad_paragraph = ad_heading.find_next_sibling('p')
    if ad_paragraph:
        ad_paragraph.decompose()
    return soup

In [86]:
def get_issue_articles(start_from: int, last_issue: int, timeout: float = 30):
    """
    Scrape the weekly issue pages numbered `start_from` through `last_issue`.

    For every <article> tag on an issue page the paragraph text is collected
    and cleaned, and the non-ad images are downloaded into `image_output_dir`.

    Args:
        start_from: First issue number to scrape (inclusive).
        last_issue: Last issue number to scrape (inclusive).
        timeout: Seconds to wait for each issue page before giving up.

    Returns:
        A list with one dict per article: 'link' (issue URL), 'body' (cleaned
        text) and 'images' (paths of the images that were saved). Issues whose
        page cannot be fetched are skipped with a message instead of aborting
        the run.
    """
    articles_data = []

    for i in range(start_from, last_issue + 1):
        url = f'{THE_BATCH_URL}issue-{i}/'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # One unreachable page must not discard the issues already scraped.
            print(f"Failed to fetch {url}: {e}")
            continue

        if response.status_code != 200:
            # Do not parse an error page as if it were an issue
            # (presumably the issue number does not exist).
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Numbered per issue, not per article: restarting at 0 for every
        # <article> would make later articles overwrite earlier image files.
        image_index = 0

        for tag in soup.find_all('article'):
            tag = remove_paragraph_after_deeplearningai(tag)

            paragraphs = tag.find_all('p')
            body = clean_text('\n '.join(p.get_text() for p in paragraphs))

            image_urls = [
                img['src'] for img in tag.find_all('img')
                if img.get('src') and img['src'].lower().endswith(('.png', '.jpg', '.jpeg', '.svg')) and
                'The-Batch-ads-and-exclusive-banners' not in img['src']  # exclude ads
            ]

            images = []
            for img_url in image_urls:
                ext = os.path.splitext(img_url)[1].split('?')[0]
                image_name = f"issue_{i}_{image_index}" + ext
                image_index += 1

                image_path = download_image(img_url, image_name, image_output_dir)
                if image_path:
                    images.append(image_path)

            articles_data.append({
                'link': url,
                'body': body,
                'images': images,
            })

    return articles_data

In [87]:
# Range of issues to scrape (inclusive) and how many issues go into one JSON file.
START_FROM_ISSUE = 251
LAST_ISSUE = 300
CHUNK_SIZE = 50

# Create the target directory up front; otherwise open() below raises
# FileNotFoundError only after a whole chunk has already been scraped.
os.makedirs(output_dir, exist_ok=True)

for start in range(START_FROM_ISSUE, LAST_ISSUE + 1, CHUNK_SIZE):
    # The last chunk may be shorter than CHUNK_SIZE.
    end = min(start + CHUNK_SIZE - 1, LAST_ISSUE)
    articles_chunk = get_issue_articles(start, end)

    filename = os.path.join(output_dir, f'articles_{start}_{end}.json')
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles_chunk, f, ensure_ascii=False, indent=4)

    print(f'Articles from issue {start} to {end} saved to articles_{start}_{end}.json')

Articles from issue 251 to 300 saved to articles_251_300.json
