# In [4]:
"""
Metamia to Anki Scraper

This script scrapes analogies from metamia.com and exports them to Anki-compatible CSV format.
Includes filtering options to help prune problematic entries.

Requirements:
    pip install requests beautifulsoup4 pandas

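Usage:
    python metamia_scraper.py

    Note: the ``if __name__ == "__main__"`` guard at the bottom of the file is
    commented out, so running the file only defines the scraper; call main()
    explicitly (for example from a notebook cell) to start a scrape.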
"""
!pip install wheel
!pip install pandas
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin, urlparse
import csv
from typing import List, Dict, Optional
import logging

# Configure logging for the whole script: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the scraper class and main().
logger = logging.getLogger(__name__)

class MetamiaScraper:
    """Scrape analogy pages from metamia.com and export them for Anki.

    Typical flow: ``scrape_analogies`` -> ``filter_analogies`` ->
    ``export_to_anki_csv`` (or ``export_to_pandas``).

    Annotations that name third-party types are written as strings so that
    defining the class does not itself evaluate them.
    """

    # Term lists consulted by ``assess_quality``. Override them on a subclass
    # or an instance to tune the filter.
    PROBLEMATIC_TERMS = ('fuck', 'shit', 'damn', 'hell')
    EDUCATIONAL_TERMS = ('cell', 'dna', 'protein', 'neuron', 'photosynthesis', 'system')

    # Lower-cased fragments marking page furniture rather than an explanation.
    _BOILERPLATE_MARKERS = ('writer:', 'date:', 'most active')

    def __init__(self, base_url="http://www.metamia.com", delay=1.0):
        """Create a scraper.

        Args:
            base_url: Site root; relative links are resolved against it.
            delay: Seconds to sleep before every HTTP request.
        """
        self.base_url = base_url
        self.delay = delay  # Delay between requests to be respectful
        # One session is reused for every request made by this scraper.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_page(self, url: str) -> Optional["BeautifulSoup"]:
        """Fetch *url* and return the parsed page, or ``None`` on a request error.

        Sleeps ``self.delay`` seconds before each request (rate limiting).
        Request failures, including non-2xx statuses, are logged, not raised.
        """
        try:
            time.sleep(self.delay)
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            html = response.content
        except requests.RequestException as e:
            logger.error("Error fetching %s: %s", url, e)
            return None
        return BeautifulSoup(html, 'html.parser')

    def extract_analogy_links(self, soup: "BeautifulSoup") -> List[str]:
        """Return the absolute URLs of all ``/critique-*`` links in *soup*.

        Duplicates are removed and first-seen order is kept, so a later
        ``[:max_entries]`` slice selects the same pages on every run.
        """
        links = [
            urljoin(self.base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/critique-' in anchor['href']
        ]
        # dict.fromkeys de-duplicates while preserving insertion order;
        # list(set(...)) would return the URLs in arbitrary order.
        return list(dict.fromkeys(links))

    @staticmethod
    def _parse_slug(url: str) -> Optional[tuple]:
        """Decode ``(concept, analogy)`` from a critique URL, or return ``None``.

        Expected last path segment: ``critique-CONCEPT-like-ANALOGY-ID``.
        """
        # Work on the URL path only, so a trailing slash, query string or
        # fragment cannot leak into the slug.
        path = urlparse(url).path.rstrip('/')
        slug = path.rsplit('/', 1)[-1]

        # Drop only the leading marker; a concept that itself contains the
        # text "critique-" must survive intact.
        _, marker, content = slug.partition('critique-')
        if not marker:
            return None

        concept_part, separator, analogy_part = content.partition('-like-')
        concept = concept_part.replace('-', ' ').strip()
        if not separator:
            # Fallback: no "-like-" separator, so the analogy is unknown.
            return concept, "Unknown"

        # Strip the trailing numeric ID before turning dashes into spaces.
        analogy = re.sub(r'-\d+$', '', analogy_part).replace('-', ' ').strip()
        return concept, analogy

    @staticmethod
    def _extract_writer_and_date(soup: "BeautifulSoup") -> tuple:
        """Return ``(writer, date)`` from a "Writer: X -- Date: Y" text node."""
        writer, date = "Not Stated", ""
        # ``string=`` is the current name of bs4's former ``text=`` argument.
        writer_elem = soup.find(string=re.compile(r'Writer:'))
        if writer_elem:
            parts = writer_elem.strip().split('--')
            if len(parts) >= 2:
                writer = parts[0].replace('Writer:', '').strip()
                date = parts[1].replace('Date:', '').strip()
        return writer, date

    def _extract_explanation(self, soup: "BeautifulSoup") -> str:
        """Return the first substantial text block (max 500 chars), or ``""``."""
        for node in soup.find_all(['div', 'p'], string=True):
            text = node.get_text().strip()
            lowered = text.lower()
            if len(text) > 50 and not any(m in lowered for m in self._BOILERPLATE_MARKERS):
                return text[:500]  # Limit length
        return ""

    def parse_analogy_page(self, url: str) -> Optional[Dict]:
        """Fetch one analogy page and return its structured data.

        Concept and analogy are decoded from the URL slug; writer, date and
        explanation are read from the page content.

        Returns:
            A dict with keys ``concept``, ``analogy``, ``writer``, ``date``,
            ``explanation``, ``url`` and ``quality_score``; or ``None`` when
            the URL is not a critique URL, the fetch fails, or parsing fails.
        """
        # Check the URL first: an unusable slug needs no request (and no delay).
        decoded = self._parse_slug(url)
        if decoded is None:
            logger.warning("Skipping %s: not a critique-* URL", url)
            return None
        concept, analogy = decoded

        soup = self.get_page(url)
        if soup is None:
            return None

        try:
            writer, date = self._extract_writer_and_date(soup)
            explanation = self._extract_explanation(soup)
        except Exception as e:
            # Deliberately broad: one malformed page must not abort the scrape.
            logger.error("Error parsing %s: %s", url, e)
            return None

        return {
            'concept': concept.title(),
            'analogy': analogy.title(),
            'writer': writer,
            'date': date,
            'explanation': explanation,
            'url': url,
            'quality_score': self.assess_quality(concept, analogy, explanation)
        }

    def assess_quality(self, concept: str, analogy: str, explanation: str) -> int:
        """Return a heuristic 0-10 quality score used for filtering.

        Starts at 5; +2 / +1 for explanations longer than 100 / 50 characters,
        -1 for a one-word (or empty) analogy, -3 when a problematic term
        occurs in the concept or analogy, +1 when the concept contains an
        educational term. The result is clamped to the 0-10 range.
        """
        score = 5  # Base score

        # Boost for longer explanations
        if len(explanation) > 100:
            score += 2
        elif len(explanation) > 50:
            score += 1

        # Penalize very short or generic analogies
        if len(analogy.split()) < 2:
            score -= 1

        concept_lc = concept.lower()
        analogy_lc = analogy.lower()

        # NOTE(review): this is substring matching, so 'hell' also penalises
        # innocent words such as 'shell' -- confirm whether that is acceptable.
        if any(term in analogy_lc or term in concept_lc for term in self.PROBLEMATIC_TERMS):
            score -= 3

        # Boost for scientific/educational content
        if any(term in concept_lc for term in self.EDUCATIONAL_TERMS):
            score += 1

        return max(0, min(10, score))  # Clamp to 0-10 range

    def discover_all_analogies(self, max_pages: int = 10) -> List[str]:
        """Collect analogy URLs, currently from the main page only.

        Args:
            max_pages: Caps the result at ``max_pages * 50`` URLs.
        """
        all_links = []

        # Start with the main page
        main_soup = self.get_page(self.base_url)
        if main_soup:
            all_links.extend(self.extract_analogy_links(main_soup))

        # You could add more discovery methods here:
        # - Browse by category pages
        # - Search results
        # - Recent entries pagination

        logger.info("Discovered %d analogy URLs", len(all_links))
        return all_links[:max_pages * 50]  # Reasonable limit

    def scrape_analogies(self, max_entries: int = 200) -> List[Dict]:
        """Discover analogy URLs and parse up to *max_entries* of them.

        Pages that fail to fetch or parse are skipped.
        """
        logger.info("Starting Metamia scrape...")

        urls = self.discover_all_analogies()
        total = min(max_entries, len(urls))

        analogies = []
        for position, url in enumerate(urls[:max_entries], start=1):
            logger.info("Scraping %d/%d: %s", position, total, url)

            analogy_data = self.parse_analogy_page(url)
            if analogy_data:
                analogies.append(analogy_data)

        logger.info("Successfully scraped %d analogies", len(analogies))
        return analogies

    def filter_analogies(self, analogies: List[Dict], min_quality: int = 4) -> List[Dict]:
        """Return the analogies whose ``quality_score`` is at least *min_quality*."""
        filtered = [a for a in analogies if a['quality_score'] >= min_quality]
        logger.info(
            "Filtered %d -> %d analogies (min quality: %s)",
            len(analogies), len(filtered), min_quality,
        )
        return filtered

    def export_to_anki_csv(self, analogies: List[Dict], filename: str = "metamia_analogies.csv"):
        """Write *analogies* to *filename* as a UTF-8 CSV for Anki import.

        Columns: Front, Back, Explanation (first 300 characters), Source,
        Quality, URL.
        """
        fieldnames = ['Front', 'Back', 'Explanation', 'Source', 'Quality', 'URL']

        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            # NOTE(review): Anki's importer may treat this header row as a
            # regular note -- confirm against the Anki version in use.
            writer.writeheader()
            for analogy in analogies:
                writer.writerow({
                    'Front': analogy['concept'],
                    'Back': f"{analogy['concept']} is like {analogy['analogy']}",
                    'Explanation': analogy['explanation'][:300],  # Truncate for Anki
                    'Source': analogy['writer'],
                    'Quality': analogy['quality_score'],
                    'URL': analogy['url']
                })

        logger.info("Exported %d analogies to %s", len(analogies), filename)

    def export_to_pandas(self, analogies: List[Dict]) -> "pd.DataFrame":
        """Return *analogies* as a DataFrame, one row per analogy dict."""
        return pd.DataFrame(analogies)

def main():
    """Run one small end-to-end scrape and print a summary.

    Scrapes up to 100 analogies, keeps those scoring at least 4, and writes
    ``metamia_analogies.csv`` (Anki import) and ``metamia_raw_data.csv``
    (DataFrame dump) to the current working directory. Returns early, writing
    nothing, when no analogy could be scraped at all.
    """
    scraper = MetamiaScraper(delay=1.5)  # Be respectful with delays

    # Scrape analogies
    analogies = scraper.scrape_analogies(max_entries=100)  # Start small

    if not analogies:
        logger.error("No analogies found!")
        return

    # Filter for quality
    filtered_analogies = scraper.filter_analogies(analogies, min_quality=4)

    # Export to CSV for Anki
    scraper.export_to_anki_csv(filtered_analogies)

    # Also save as DataFrame for inspection
    df = scraper.export_to_pandas(filtered_analogies)
    df.to_csv('metamia_raw_data.csv', index=False)

    # Print summary
    print("\nScraping Summary:")
    print(f"Total analogies found: {len(analogies)}")
    print(f"After filtering: {len(filtered_analogies)}")
    if df.empty:
        # A DataFrame built from an empty list has no columns, so indexing
        # df['quality_score'] here would raise KeyError.
        print("Average quality score: n/a (no analogies passed the filter)")
    else:
        print(f"Average quality score: {df['quality_score'].mean():.1f}")
        print("\nTop concepts:")
        print(df['concept'].value_counts().head())

    print("\nFiles created:")
    print("- metamia_analogies.csv (for Anki import)")
    print("- metamia_raw_data.csv (for inspection)")

# if __name__ == "__main__":
#     main()



# Cell output: ModuleNotFoundError: No module named 'pandas'