# Pandas Changelog Scraper

This notebook scrapes the Pandas changelog from 2023 to present and creates a unified knowledge base of changes. We'll use ScrapeGraphAI to handle the scraping and processing of the changelog data.

## Setup

First, let's install the required packages and import dependencies.

In [1]:
#!pip install scrapegraphai
#!pip install pandas

In [2]:
from scrapegraphai.graphs import SmartScraperGraph  # Changed to SmartScraperGraph as per the context
from pydantic import BaseModel, Field
from typing import List, Optional
import pandas as pd
from datetime import datetime
import re

  from .autonotebook import tqdm as notebook_tqdm


## Define Data Schema

Let's create a schema for the changelog data we want to extract.

In [3]:
class ChangelogEntry(BaseModel):
    version: str = Field(description="The version number of the release")
    release_date: str = Field(description="The release date")
    bug_fixes: List[str] = Field(description="List of bug fixes in this release", default_factory=list)
    enhancements: List[str] = Field(description="List of enhancements in this release", default_factory=list)
    deprecations: List[str] = Field(description="List of deprecations in this release", default_factory=list)
    api_changes: List[str] = Field(description="List of API changes in this release", default_factory=list)
    contributors: List[str] = Field(description="List of contributors to this release", default_factory=list)

## Generate List of Changelog URLs

We'll create a function to extract version links from the main changelog page.

In [4]:
from scrapegraphai.graphs import SmartScraperGraph
from pydantic import BaseModel, Field
from typing import List
import asyncio
import nest_asyncio

# Apply nest_asyncio at the start
nest_asyncio.apply()

# Define the VersionLinks schema first
class VersionLinks(BaseModel):
    versions: List[dict] = Field(description="List of version numbers and their links")

async def fetch_changelog_urls(base_url: str, config: dict) -> List[str]:
    # Create scraper instance
    scraper = SmartScraperGraph(
        prompt="Extract all version links from 2023 and later, including version number and full URL",
        source=base_url,
        config=config,
        schema=VersionLinks
    )
    
    # Get results
    results = await scraper.run()
    
    # Filter for versions from 2023 onwards
    urls = []
    for version in results['versions']:
        if '2023' in version['date'] or '2024' in version['date']:
            urls.append(base_url + version['link'])
    
    return urls

from bs4 import BeautifulSoup
import requests
from typing import List
import re
def get_changelog_urls() -> List[str]:
    base_url = "https://pandas.pydata.org/docs/whatsnew/"
    
    # Fetch the main changelog page
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all version links
    urls = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Look for version links that contain 2023 or 2024
        if href and ('2023' in href or '2024' in href):
            full_url = base_url + href if not href.startswith('http') else href
            urls.append(full_url)
    
    return urls


## Scrape Individual Changelog Pages

Now let's create a function to scrape each changelog page and extract the relevant information.

In [5]:
def scrape_changelogs(urls):
    entries = []
    
    for url in urls:
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract version from URL
            version = re.search(r'v(\d+\.\d+\.\d+)', url)
            version = version.group(1) if version else "Unknown"
            
            # Basic extraction of content
            entry = ChangelogEntry(
                version=version,
                release_date="",  # We'll need to extract this from the page content
                bug_fixes=[li.text for li in soup.select('div#bug-fixes li')],
                enhancements=[li.text for li in soup.select('div#enhancements li')],
                deprecations=[li.text for li in soup.select('div#deprecations li')],
                api_changes=[li.text for li in soup.select('div#api-changes li')],
                contributors=[li.text for li in soup.select('div#contributors li')]
            )
            entries.append(entry)
            
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
    
    return entries

## Process and Store Results

Let's create functions to process the scraped data and store it in a structured format.

In [6]:
def process_changelog_data(changelog_data):
    # Convert to DataFrame
    df = pd.DataFrame(changelog_data)
    
    # Convert release_date to datetime
    df['release_date'] = pd.to_datetime(df['release_date'])
    
    # Sort by release date
    df = df.sort_values('release_date', ascending=False)
    
    return df

def save_changelog_data(df, excel_path='pandas_changelog.xlsx', json_path='pandas_changelog.json'):
    # Save to Excel
    df.to_excel(excel_path, index=False)
    
    # Save to JSON
    df.to_json(json_path, orient='records', indent=2)
    
    return excel_path, json_path

## Main Execution

Now let's put everything together and run the scraper.

In [7]:
def main():
    print("Getting changelog URLs...")
    urls = get_changelog_urls()
    
    print(f"Found {len(urls)} changelog pages to scrape")
    
    print("Scraping changelog pages...")
    changelog_data = scrape_changelogs(urls)
    
    print("Processing data...")
    df = process_changelog_data(changelog_data)
    
    print("Saving results...")
    excel_path, json_path = save_changelog_data(df)
    
    print(f"\nResults saved to:")
    print(f"Excel: {excel_path}")
    print(f"JSON: {json_path}")
    
    return df

# Run the scraper
changelog_df = main()

Getting changelog URLs...
Found 0 changelog pages to scrape
Scraping changelog pages...
Processing data...


KeyError: 'release_date'

## Analyze Results

Let's create some basic visualizations and analysis of the changelog data.

In [None]:
def analyze_changelog(df):
    print("Changelog Analysis\n")
    
    print(f"Total number of releases: {len(df)}")
    print(f"Date range: {df['release_date'].min().date()} to {df['release_date'].max().date()}")
    
    print("\nNumber of changes by type:")
    print(f"Bug fixes: {df['bug_fixes'].apply(len).sum()}")
    print(f"Enhancements: {df['enhancements'].apply(len).sum()}")
    print(f"Deprecations: {df['deprecations'].apply(len).sum()}")
    print(f"API changes: {df['api_changes'].apply(len).sum()}")
    
    print("\nUnique contributors:", 
          len(set([c for contributors in df['contributors'] for c in contributors])))

# Run analysis
analyze_changelog(changelog_df)

## Search Function

Finally, let's create a function to search through the changelog data.

In [None]:
def search_changelog(df, query, categories=None):
    """Search through the changelog for specific terms.
    
    Args:
        df: The changelog DataFrame
        query: Search term
        categories: List of categories to search (bug_fixes, enhancements, etc.)
                   If None, searches all categories
    """
    if categories is None:
        categories = ['bug_fixes', 'enhancements', 'deprecations', 'api_changes']
    
    results = []
    
    for idx, row in df.iterrows():
        for category in categories:
            matches = [item for item in row[category] 
                      if query.lower() in item.lower()]
            if matches:
                for match in matches:
                    results.append({
                        'version': row['version'],
                        'date': row['release_date'].date(),
                        'category': category,
                        'entry': match
                    })
    
    return pd.DataFrame(results)

# Example usage:
# search_changelog(changelog_df, 'performance')