In [1]:
import os

# Base directory
base_dir = "./dataset"
categories = ["productive", "unproductive"]

for cat in categories:
    path = os.path.join(base_dir, cat)
    os.makedirs(path, exist_ok=True)


In [3]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import json
from urllib.parse import urlparse, urljoin
from pathlib import Path
import random

class MassDatasetScraper:
    def __init__(self, base_dir="dataset", min_words=50, delay=(1, 3), max_retries=3):
        """
        Initialize the mass dataset scraper.
        
        Args:
            base_dir: Base directory for the dataset
            min_words: Minimum word count to keep content
            delay: Tuple (min, max) seconds to wait between requests
            max_retries: Maximum retry attempts for failed requests
        """
        self.base_dir = base_dir
        self.min_words = min_words
        self.delay = delay
        self.max_retries = max_retries
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        
        # Create directory structure
        self.productive_dir = Path(base_dir) / "productive"
        self.unproductive_dir = Path(base_dir) / "unproductive"
        self.productive_dir.mkdir(parents=True, exist_ok=True)
        self.unproductive_dir.mkdir(parents=True, exist_ok=True)
        
        # Stats
        self.stats = {
            'productive': {'saved': 0, 'filtered': 0, 'failed': 0},
            'unproductive': {'saved': 0, 'filtered': 0, 'failed': 0}
        }
    
    def clean_text(self, text):
        """Clean and normalize text content."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\-\'\"]', '', text)
        return text.strip()
    
    def extract_text(self, html, url):
        """Extract clean text from HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        
        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'iframe', 'form']):
            tag.decompose()
        
        text = ""
        content_selectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content']
        
        for selector in content_selectors:
            elements = soup.select(selector) if selector.startswith(('.', '#')) else soup.find_all(selector)
            if elements:
                text = ' '.join([elem.get_text(separator=' ', strip=True) for elem in elements])
                break
        
        if not text:
            body = soup.find('body')
            text = body.get_text(separator=' ', strip=True) if body else ""
        
        return self.clean_text(text)
    
    def count_words(self, text):
        """Count words in text."""
        return len(text.split())
    
    def fetch_url(self, url, retries=0):
        """Fetch content from URL with retry logic."""
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            if retries < self.max_retries:
                wait = random.uniform(2, 5)
                print(f"  Retry {retries + 1}/{self.max_retries} after {wait:.1f}s...")
                time.sleep(wait)
                return self.fetch_url(url, retries + 1)
            print(f"  Failed after {self.max_retries} retries: {e}")
            return None
    
    def get_next_filename(self, directory):
        """Get the next sequential filename."""
        existing_files = list(directory.glob("*.txt"))
        if not existing_files:
            return "00001.txt"
        
        numbers = [int(re.match(r'(\d+)\.txt', f.name).group(1)) 
                   for f in existing_files if re.match(r'(\d+)\.txt', f.name)]
        next_num = max(numbers) + 1 if numbers else 1
        return f"{next_num:05d}.txt"
    
    def save_text(self, text, directory, category):
        """Save text to file."""
        filename = self.get_next_filename(directory)
        filepath = directory / filename
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
        
        self.stats[category]['saved'] += 1
        return filepath
    
    def process_url(self, url, category):
        """Process a single URL and save if valid."""
        directory = self.productive_dir if category == 'productive' else self.unproductive_dir
        
        html = self.fetch_url(url)
        if html is None:
            self.stats[category]['failed'] += 1
            return False
        
        text = self.extract_text(html, url)
        word_count = self.count_words(text)
        
        if word_count < self.min_words:
            self.stats[category]['filtered'] += 1
            return False
        
        self.save_text(text, directory, category)
        return True
    
    def scrape_wikipedia_category(self, category_name, max_pages=500):
        """Scrape Wikipedia pages from a category."""
        print(f"\n[Wikipedia] Scraping category: {category_name}")
        base_url = "https://en.wikipedia.org/wiki/Category:"
        api_url = "https://en.wikipedia.org/w/api.php"
        
        urls = []
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Category:{category_name}',
            'cmlimit': 500,
            'format': 'json'
        }
        
        while len(urls) < max_pages:
            response = self.fetch_url(api_url + '?' + '&'.join([f'{k}={v}' for k, v in params.items()]))
            if not response:
                break
            
            try:
                data = json.loads(response)
                members = data.get('query', {}).get('categorymembers', [])
                
                for member in members:
                    if member.get('ns') == 0:  # Main namespace only
                        title = member['title'].replace(' ', '_')
                        urls.append(f"https://en.wikipedia.org/wiki/{title}")
                
                if 'continue' not in data or len(urls) >= max_pages:
                    break
                
                params['cmcontinue'] = data['continue']['cmcontinue']
                time.sleep(random.uniform(*self.delay))
            except:
                break
        
        return urls[:max_pages]
    
    def scrape_reddit_subreddit(self, subreddit, sort='top', time_filter='all', limit=500):
        """Scrape Reddit posts from a subreddit using JSON API."""
        print(f"\n[Reddit] Scraping r/{subreddit} ({sort}/{time_filter})")
        urls = []
        after = None
        
        while len(urls) < limit:
            url = f"https://old.reddit.com/r/{subreddit}/{sort}.json?t={time_filter}&limit=100"
            if after:
                url += f"&after={after}"
            
            response = self.fetch_url(url)
            if not response:
                break
            
            try:
                data = json.loads(response)
                posts = data['data']['children']
                
                for post in posts:
                    post_data = post['data']
                    # Get both title and selftext
                    title = post_data.get('title', '')
                    selftext = post_data.get('selftext', '')
                    combined = f"{title}. {selftext}".strip()
                    
                    if self.count_words(combined) >= self.min_words:
                        urls.append(combined)  # Store text directly
                
                after = data['data']['after']
                if not after or len(urls) >= limit:
                    break
                
                time.sleep(random.uniform(*self.delay))
            except:
                break
        
        return urls[:limit]
    
    def scrape_mass_urls(self, urls, category, target=None):
        """
        Scrape a large list of URLs with progress tracking.
        
        Args:
            urls: List of URLs or text content
            category: 'productive' or 'unproductive'
            target: Stop when this many samples saved (None = scrape all)
        """
        directory = self.productive_dir if category == 'productive' else self.unproductive_dir
        print(f"\n{'='*70}")
        print(f"Scraping {len(urls)} items for: {category.upper()}")
        if target:
            print(f"Target: {target} samples")
        print(f"{'='*70}\n")
        
        for i, item in enumerate(urls, 1):
            if target and self.stats[category]['saved'] >= target:
                print(f"\n✓ Target reached: {self.stats[category]['saved']} samples")
                break
            
            current = self.stats[category]['saved']
            print(f"[{i}/{len(urls)}] {category.capitalize()}: {current} saved | ", end="")
            
            # Check if item is URL or direct text
            if isinstance(item, str) and (item.startswith('http://') or item.startswith('https://')):
                print(f"{item[:60]}...")
                success = self.process_url(item, category)
            else:
                # Direct text (e.g., from Reddit)
                print("Direct text...")
                word_count = self.count_words(item)
                if word_count >= self.min_words:
                    self.save_text(item, directory, category)
                    success = True
                else:
                    self.stats[category]['filtered'] += 1
                    success = False
            
            if success:
                print(f"  ✓ Saved ({self.stats[category]['saved']} total)")
            
            time.sleep(random.uniform(*self.delay))
        
        self.print_summary(category)
    
    def print_summary(self, category=None):
        """Print statistics summary."""
        if category:
            stats = self.stats[category]
            print(f"\n{'='*70}")
            print(f"{category.upper()} Summary:")
            print(f"  Saved: {stats['saved']}")
            print(f"  Filtered: {stats['filtered']}")
            print(f"  Failed: {stats['failed']}")
            print(f"{'='*70}")
        else:
            print(f"\n{'='*70}")
            print("FINAL DATASET SUMMARY")
            print(f"{'='*70}")
            for cat in ['productive', 'unproductive']:
                stats = self.stats[cat]
                print(f"{cat.capitalize()}: {stats['saved']} samples")
            total = sum(s['saved'] for s in self.stats.values())
            print(f"{'='*70}")
            print(f"TOTAL SAMPLES: {total}")
            print(f"{'='*70}")


# ============================================================================
# MAIN EXECUTION - CONFIGURE YOUR DATASET HERE
# ============================================================================

if __name__ == "__main__":
    scraper = MassDatasetScraper(
        base_dir="dataset",
        min_words=50,
        delay=(1, 2),  # Random delay between 1-2 seconds
        max_retries=3
    )
    
    # ========================================================================
    # PRODUCTIVE CONTENT - Target: 3000+ samples
    # ========================================================================
    
    print("\n" + "="*70)
    print("PHASE 1: COLLECTING PRODUCTIVE CONTENT")
    print("="*70)
    
    productive_sources = []
    
    # Wikipedia categories (computer science, technology, education)
    wiki_categories = [
        'Computer_science',
        'Algorithms',
        'Data_structures',
        'Programming_languages',
        'Machine_learning',
        'Software_engineering',
        'Computer_programming',
        'Mathematics',
        'Physics',
        'Biology'
    ]
    
    for cat in wiki_categories:
        urls = scraper.scrape_wikipedia_category(cat, max_pages=200)
        productive_sources.extend(urls)
        print(f"  Collected {len(urls)} pages from {cat}")
    
    # Add more direct URLs if needed
    productive_sources.extend([
        "https://en.wikipedia.org/wiki/Artificial_intelligence",
        "https://en.wikipedia.org/wiki/Deep_learning",
        "https://en.wikipedia.org/wiki/Neural_network",
        # Add documentation URLs, arXiv abstracts, tech blog URLs here
    ])
    
    print(f"\nTotal productive URLs collected: {len(productive_sources)}")
    scraper.scrape_mass_urls(productive_sources, 'productive', target=3000)
    
    # ========================================================================
    # UNPRODUCTIVE CONTENT - Target: 3000+ samples
    # ========================================================================
    
    print("\n" + "="*70)
    print("PHASE 2: COLLECTING UNPRODUCTIVE CONTENT")
    print("="*70)
    
    unproductive_sources = []
    
    # Reddit subreddits (entertainment, memes, gaming)
    subreddits = [
        ('funny', 'top', 'all'),
        ('memes', 'top', 'all'),
        ('gaming', 'top', 'all'),
        ('movies', 'top', 'all'),
        ('television', 'top', 'all'),
        ('entertainment', 'top', 'all'),
        ('Jokes', 'top', 'all'),
        ('AdviceAnimals', 'top', 'all'),
        ('facepalm', 'top', 'all'),
        ('aww', 'top', 'all')
    ]
    
    for subreddit, sort, time_filter in subreddits:
        texts = scraper.scrape_reddit_subreddit(subreddit, sort, time_filter, limit=300)
        unproductive_sources.extend(texts)
        print(f"  Collected {len(texts)} posts from r/{subreddit}")
    
    print(f"\nTotal unproductive items collected: {len(unproductive_sources)}")
    scraper.scrape_mass_urls(unproductive_sources, 'unproductive', target=3000)
    
    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    
    scraper.print_summary()
    
    print(f"\n✓ Dataset ready at: {scraper.base_dir}/")
    print("  - Minimum 2000+ samples per class for training")
    print("  - Ready for DistilBERT fine-tuning")
    print("  - No overfitting on toy dataset")


PHASE 1: COLLECTING PRODUCTIVE CONTENT

[Wikipedia] Scraping category: Computer_science
  Collected 16 pages from Computer_science

[Wikipedia] Scraping category: Algorithms
  Collected 137 pages from Algorithms

[Wikipedia] Scraping category: Data_structures
  Collected 22 pages from Data_structures

[Wikipedia] Scraping category: Programming_languages
  Collected 172 pages from Programming_languages

[Wikipedia] Scraping category: Machine_learning
  Collected 200 pages from Machine_learning

[Wikipedia] Scraping category: Software_engineering
  Collected 59 pages from Software_engineering

[Wikipedia] Scraping category: Computer_programming
  Collected 139 pages from Computer_programming

[Wikipedia] Scraping category: Mathematics
  Collected 3 pages from Mathematics

[Wikipedia] Scraping category: Physics
  Collected 24 pages from Physics

[Wikipedia] Scraping category: Biology
  Collected 22 pages from Biology

Total productive URLs collected: 797

Scraping 797 items for: PRODUCTI

In [5]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import json
from pathlib import Path
import random

class UnproductiveScraper:
    def __init__(self, base_dir="dataset", min_words=50, delay=(1, 2), max_retries=3):
        """Scraper focused on unproductive content only."""
        self.base_dir = base_dir
        self.min_words = min_words
        self.delay = delay
        self.max_retries = max_retries
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        
        self.unproductive_dir = Path(base_dir) / "unproductive"
        self.unproductive_dir.mkdir(parents=True, exist_ok=True)
        
        self.stats = {'saved': 0, 'filtered': 0, 'failed': 0}
    
    def clean_text(self, text):
        """Clean and normalize text."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\-\'\"]', '', text)
        return text.strip()
    
    def count_words(self, text):
        """Count words in text."""
        return len(text.split())
    
    def fetch_url(self, url, retries=0):
        """Fetch URL with retry logic."""
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            if retries < self.max_retries:
                wait = random.uniform(2, 5)
                time.sleep(wait)
                return self.fetch_url(url, retries + 1)
            return None
    
    def get_next_filename(self):
        """Get next sequential filename."""
        existing_files = list(self.unproductive_dir.glob("*.txt"))
        if not existing_files:
            return "00001.txt"
        
        numbers = [int(re.match(r'(\d+)\.txt', f.name).group(1)) 
                   for f in existing_files if re.match(r'(\d+)\.txt', f.name)]
        next_num = max(numbers) + 1 if numbers else 1
        return f"{next_num:05d}.txt"
    
    def save_text(self, text):
        """Save text to file."""
        filename = self.get_next_filename()
        filepath = self.unproductive_dir / filename
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
        
        self.stats['saved'] += 1
        return filepath
    
    def scrape_reddit_subreddit(self, subreddit, sort='top', time_filter='all', limit=500):
        """Scrape Reddit posts using JSON API."""
        print(f"\n[Reddit] Scraping r/{subreddit} ({sort}/{time_filter})")
        texts = []
        after = None
        
        while len(texts) < limit:
            url = f"https://old.reddit.com/r/{subreddit}/{sort}.json?t={time_filter}&limit=100"
            if after:
                url += f"&after={after}"
            
            response = self.fetch_url(url)
            if not response:
                break
            
            try:
                data = json.loads(response)
                posts = data['data']['children']
                
                for post in posts:
                    post_data = post['data']
                    title = post_data.get('title', '')
                    selftext = post_data.get('selftext', '')
                    combined = f"{title}. {selftext}".strip()
                    
                    if self.count_words(combined) >= self.min_words:
                        texts.append(combined)
                
                after = data['data']['after']
                if not after or len(texts) >= limit:
                    break
                
                time.sleep(random.uniform(*self.delay))
            except Exception as e:
                print(f"  Error: {e}")
                break
        
        print(f"  Collected {len(texts)} posts")
        return texts
    
    def scrape_and_save(self, texts, target=None):
        """Save texts to files."""
        print(f"\nSaving {len(texts)} items...")
        
        for i, text in enumerate(texts, 1):
            if target and self.stats['saved'] >= target:
                print(f"\n✓ Target reached: {self.stats['saved']} samples")
                break
            
            word_count = self.count_words(text)
            
            if word_count >= self.min_words:
                self.save_text(text)
                if i % 50 == 0:
                    print(f"  Progress: {self.stats['saved']} saved")
            else:
                self.stats['filtered'] += 1
            
            time.sleep(random.uniform(0.1, 0.3))  # Fast saving
        
        print(f"\n✓ Saved: {self.stats['saved']} | Filtered: {self.stats['filtered']}")


# ============================================================================
# UNPRODUCTIVE CONTENT COLLECTION - FOCUSED ON BALANCING
# ============================================================================

if __name__ == "__main__":
    scraper = UnproductiveScraper(
        base_dir="dataset",
        min_words=50,
        delay=(1, 2),
        max_retries=3
    )
    
    print("="*70)
    print("UNPRODUCTIVE CONTENT SCRAPER")
    print(f"Target: 3000+ samples")
    print(f"Current unproductive samples: Check your dataset/unproductive/ folder")
    print("="*70)
    
    # Calculate how many we need
    current_count = len(list(scraper.unproductive_dir.glob("*.txt")))
    target = 3000
    needed = target - current_count
    
    print(f"\nCurrent samples: {current_count}")
    print(f"Target samples: {target}")
    print(f"Need to collect: {max(0, needed)}")
    
    if needed <= 0:
        print("\n✓ Already balanced! No need to scrape.")
        exit()
    
    # ========================================================================
    # MASSIVE SUBREDDIT LIST - Entertainment, Memes, Gaming, Casual
    # ========================================================================
    
    all_texts = []
    
    subreddits = [
        # Humor & Memes
        ('funny', 'top', 'all', 500),
        ('memes', 'top', 'all', 500),
        ('dankmemes', 'top', 'all', 500),
        ('me_irl', 'top', 'all', 400),
        ('meirl', 'top', 'all', 400),
        ('AdviceAnimals', 'top', 'all', 300),
        ('wholesomememes', 'top', 'all', 300),
        
        # Entertainment
        ('movies', 'top', 'all', 400),
        ('television', 'top', 'all', 400),
        ('entertainment', 'top', 'all', 300),
        ('Music', 'top', 'all', 300),
        ('Netflix', 'top', 'all', 300),
        
        # Gaming
        ('gaming', 'top', 'all', 500),
        ('Games', 'top', 'all', 400),
        ('pcgaming', 'top', 'all', 300),
        ('PS4', 'top', 'all', 300),
        ('xboxone', 'top', 'all', 300),
        
        # Sports
        ('sports', 'top', 'all', 400),
        ('nfl', 'top', 'all', 300),
        ('nba', 'top', 'all', 300),
        ('soccer', 'top', 'all', 300),
        ('baseball', 'top', 'all', 200),
        
        # Social/Casual
        ('AskReddit', 'top', 'month', 500),
        ('CasualConversation', 'top', 'all', 300),
        ('tifu', 'top', 'all', 400),
        ('Showerthoughts', 'top', 'all', 400),
        ('LifeProTips', 'top', 'all', 300),
        
        # Visual/Reaction
        ('pics', 'top', 'all', 300),
        ('gifs', 'top', 'all', 300),
        ('videos', 'top', 'all', 300),
        ('aww', 'top', 'all', 300),
        ('Unexpected', 'top', 'all', 300),
        ('oddlysatisfying', 'top', 'all', 300),
        
        # Cringe/Reaction
        ('facepalm', 'top', 'all', 300),
        ('cringe', 'top', 'all', 300),
        ('PublicFreakout', 'top', 'all', 300),
        ('instant_regret', 'top', 'all', 300),
        ('Whatcouldgowrong', 'top', 'all', 300),
        ('WatchPeopleDieInside', 'top', 'all', 300),
        
        # Twitter/Social Media
        ('WhitePeopleTwitter', 'top', 'all', 400),
        ('BlackPeopleTwitter', 'top', 'all', 400),
        ('twitter', 'top', 'all', 300),
        
        # Interesting but unproductive
        ('todayilearned', 'top', 'all', 400),
        ('mildlyinteresting', 'top', 'all', 400),
        ('interestingasfuck', 'top', 'all', 300),
        ('Damnthatsinteresting', 'top', 'all', 300),
        
        # Random/Kids
        ('KidsAreFuckingStupid', 'top', 'all', 300),
        ('AnimalsBeingDerps', 'top', 'all', 300),
        ('therewasanattempt', 'top', 'all', 300),
        ('cursedcomments', 'top', 'all', 400),
        
        # Food/Lifestyle (casual)
        ('food', 'top', 'all', 300),
        ('FoodPorn', 'top', 'all', 300),
        ('recipes', 'top', 'all', 200),
        
        # Gossip/Celebrity
        ('entertainment', 'top', 'all', 300),
        ('celebrity', 'top', 'all', 200),
    ]
    
    print(f"\nScraping {len(subreddits)} subreddits...")
    
    for subreddit, sort, time_filter, limit in subreddits:
        if scraper.stats['saved'] >= target:
            print(f"\n✓ Target reached!")
            break
        
        texts = scraper.scrape_reddit_subreddit(subreddit, sort, time_filter, limit)
        all_texts.extend(texts)
        
        # Save in batches
        if len(all_texts) >= 500:
            scraper.scrape_and_save(all_texts, target=target)
            all_texts = []
    
    # Save remaining
    if all_texts and scraper.stats['saved'] < target:
        scraper.scrape_and_save(all_texts, target=target)
    
    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    
    final_count = len(list(scraper.unproductive_dir.glob("*.txt")))
    
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Unproductive samples: {final_count}")
    print(f"Newly added: {scraper.stats['saved']}")
    print(f"Filtered (too short): {scraper.stats['filtered']}")
    print("="*70)
    print(f"\n✓ Dataset balanced at: {scraper.base_dir}/unproductive/")
    print("  Ready for DistilBERT fine-tuning!")

UNPRODUCTIVE CONTENT SCRAPER
Target: 3000+ samples
Current unproductive samples: Check your dataset/unproductive/ folder

Current samples: 132
Target samples: 3000
Need to collect: 2868

Scraping 55 subreddits...

[Reddit] Scraping r/funny (top/all)
  Collected 1 posts

[Reddit] Scraping r/memes (top/all)
  Collected 0 posts

[Reddit] Scraping r/dankmemes (top/all)
  Collected 1 posts

[Reddit] Scraping r/me_irl (top/all)
  Collected 0 posts

[Reddit] Scraping r/meirl (top/all)
  Collected 0 posts

[Reddit] Scraping r/AdviceAnimals (top/all)
  Collected 1 posts

[Reddit] Scraping r/wholesomememes (top/all)
  Collected 1 posts

[Reddit] Scraping r/movies (top/all)
  Collected 15 posts

[Reddit] Scraping r/television (top/all)
  Collected 15 posts

[Reddit] Scraping r/entertainment (top/all)
  Collected 0 posts

[Reddit] Scraping r/Music (top/all)
  Collected 25 posts

[Reddit] Scraping r/Netflix (top/all)
  Collected 58 posts

[Reddit] Scraping r/gaming (top/all)
  Collected 6 posts

[R

KeyboardInterrupt: 

In [6]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import json
from pathlib import Path
import random

class ProductiveScraper:
    def __init__(self, base_dir="dataset", min_words=50, delay=(1, 2), max_retries=3):
        """Scraper focused on productive content only."""
        self.base_dir = base_dir
        self.min_words = min_words
        self.delay = delay
        self.max_retries = max_retries
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        
        self.productive_dir = Path(base_dir) / "productive"
        self.productive_dir.mkdir(parents=True, exist_ok=True)
        
        self.stats = {'saved': 0, 'filtered': 0, 'failed': 0}
    
    def clean_text(self, text):
        """Clean and normalize text."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\-\'\"]', '', text)
        return text.strip()
    
    def extract_text(self, html):
        """Extract clean text from HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'iframe', 'form']):
            tag.decompose()
        
        text = ""
        content_selectors = ['article', 'main', '.content', '#content', '.post-content', '.entry-content']
        
        for selector in content_selectors:
            elements = soup.select(selector) if selector.startswith(('.', '#')) else soup.find_all(selector)
            if elements:
                text = ' '.join([elem.get_text(separator=' ', strip=True) for elem in elements])
                break
        
        if not text:
            body = soup.find('body')
            text = body.get_text(separator=' ', strip=True) if body else ""
        
        return self.clean_text(text)
    
    def count_words(self, text):
        """Count words in text."""
        return len(text.split())
    
    def fetch_url(self, url, retries=0):
        """Fetch URL with retry logic."""
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            if retries < self.max_retries:
                wait = random.uniform(2, 5)
                time.sleep(wait)
                return self.fetch_url(url, retries + 1)
            return None
    
    def get_next_filename(self):
        """Get next sequential filename."""
        existing_files = list(self.productive_dir.glob("*.txt"))
        if not existing_files:
            return "00001.txt"
        
        numbers = [int(re.match(r'(\d+)\.txt', f.name).group(1)) 
                   for f in existing_files if re.match(r'(\d+)\.txt', f.name)]
        next_num = max(numbers) + 1 if numbers else 1
        return f"{next_num:05d}.txt"
    
    def save_text(self, text):
        """Save text to file."""
        filename = self.get_next_filename()
        filepath = self.productive_dir / filename
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
        
        self.stats['saved'] += 1
        return filepath
    
    def scrape_wikipedia_category(self, category_name, max_pages=100):
        """Scrape Wikipedia pages from a category."""
        print(f"\n[Wikipedia] Scraping category: {category_name}")
        api_url = "https://en.wikipedia.org/w/api.php"
        
        urls = []
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Category:{category_name}',
            'cmlimit': 500,
            'format': 'json'
        }
        
        while len(urls) < max_pages:
            response = self.fetch_url(api_url + '?' + '&'.join([f'{k}={v}' for k, v in params.items()]))
            if not response:
                break
            
            try:
                data = json.loads(response)
                members = data.get('query', {}).get('categorymembers', [])
                
                for member in members:
                    if member.get('ns') == 0:  # Main namespace only
                        title = member['title'].replace(' ', '_')
                        urls.append(f"https://en.wikipedia.org/wiki/{title}")
                
                if 'continue' not in data or len(urls) >= max_pages:
                    break
                
                params['cmcontinue'] = data['continue']['cmcontinue']
                time.sleep(random.uniform(*self.delay))
            except:
                break
        
        print(f"  Collected {len(urls)} pages")
        return urls[:max_pages]
    
    def process_url(self, url):
        """Process a single URL and save if valid."""
        html = self.fetch_url(url)
        if html is None:
            self.stats['failed'] += 1
            return False
        
        text = self.extract_text(html)
        word_count = self.count_words(text)
        
        if word_count < self.min_words:
            self.stats['filtered'] += 1
            return False
        
        self.save_text(text)
        return True
    
    def scrape_urls(self, urls, target=None):
        """Scrape list of URLs."""
        print(f"\nProcessing {len(urls)} URLs...")
        
        for i, url in enumerate(urls, 1):
            if target and self.stats['saved'] >= target:
                print(f"\n✓ Target reached: {self.stats['saved']} samples")
                break
            
            print(f"[{i}/{len(urls)}] {self.stats['saved']} saved | {url[:60]}...")
            
            success = self.process_url(url)
            if success:
                print(f"  ✓ Saved")
            
            time.sleep(random.uniform(*self.delay))
        
        print(f"\n✓ Saved: {self.stats['saved']} | Filtered: {self.stats['filtered']} | Failed: {self.stats['failed']}")


# ============================================================================
# PRODUCTIVE CONTENT COLLECTION - 400 MORE SAMPLES
# ============================================================================

if __name__ == "__main__":
    scraper = ProductiveScraper(
        base_dir="dataset",
        min_words=50,
        delay=(1, 2),
        max_retries=3
    )
    
    print("="*70)
    print("PRODUCTIVE CONTENT SCRAPER")
    print(f"Target: 400 additional samples")
    print("="*70)
    
    current_count = len(list(scraper.productive_dir.glob("*.txt")))
    target = current_count + 400
    
    print(f"\nCurrent samples: {current_count}")
    print(f"Target samples: {target}")
    print(f"Need to collect: 400")
    
    # ========================================================================
    # WIKIPEDIA CATEGORIES - Technical & Educational
    # ========================================================================
    
    all_urls = []
    
    # Core tech categories
    wiki_categories = [
        ('Artificial_intelligence', 50),
        ('Data_science', 50),
        ('Computer_networks', 40),
        ('Databases', 40),
        ('Cryptography', 40),
        ('Operating_systems', 40),
        ('Web_development', 40),
        ('Cybersecurity', 40),
        ('Computational_linguistics', 30),
        ('Theoretical_computer_science', 30),
        ('Computer_architecture', 30),
        ('Distributed_computing', 30),
        ('Cloud_computing', 30),
        ('Quantum_computing', 30),
    ]
    
    print(f"\nCollecting URLs from {len(wiki_categories)} Wikipedia categories...")
    
    for category, max_pages in wiki_categories:
        if scraper.stats['saved'] >= 400:
            break
        
        urls = scraper.scrape_wikipedia_category(category, max_pages=max_pages)
        all_urls.extend(urls)
    
    print(f"\nTotal URLs collected: {len(all_urls)}")
    
    # Scrape URLs
    scraper.scrape_urls(all_urls, target=400)
    
    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    
    final_count = len(list(scraper.productive_dir.glob("*.txt")))
    
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Productive samples: {final_count}")
    print(f"Newly added: {scraper.stats['saved']}")
    print(f"Filtered (too short): {scraper.stats['filtered']}")
    print(f"Failed: {scraper.stats['failed']}")
    print("="*70)
    print(f"\n✓ Dataset ready at: {scraper.base_dir}/productive/")
    print("  400 more productive samples collected!")

PRODUCTIVE CONTENT SCRAPER
Target: 400 additional samples

Current samples: 803
Target samples: 1203
Need to collect: 400

Collecting URLs from 14 Wikipedia categories...

[Wikipedia] Scraping category: Artificial_intelligence
  Collected 205 pages

[Wikipedia] Scraping category: Data_science
  Collected 24 pages

[Wikipedia] Scraping category: Computer_networks
  Collected 72 pages

[Wikipedia] Scraping category: Databases
  Collected 135 pages

[Wikipedia] Scraping category: Cryptography
  Collected 224 pages

[Wikipedia] Scraping category: Operating_systems
  Collected 31 pages

[Wikipedia] Scraping category: Web_development
  Collected 95 pages

[Wikipedia] Scraping category: Cybersecurity
  Collected 0 pages

[Wikipedia] Scraping category: Computational_linguistics
  Collected 215 pages

[Wikipedia] Scraping category: Theoretical_computer_science
  Collected 138 pages

[Wikipedia] Scraping category: Computer_architecture
  Collected 91 pages

[Wikipedia] Scraping category: Distrib

In [20]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re
from pathlib import Path
import random

class AlternativeTextScraper:
    def __init__(self, base_dir="dataset", min_words=50, delay=(1, 2)):
        self.base_dir = base_dir
        self.min_words = min_words
        self.delay = delay
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        self.productive_dir = Path(base_dir) / "productive"
        self.unproductive_dir = Path(base_dir) / "unproductive"
        self.productive_dir.mkdir(parents=True, exist_ok=True)
        self.unproductive_dir.mkdir(parents=True, exist_ok=True)
        
        self.stats = {
            'productive': {'saved': 0, 'failed': 0},
            'unproductive': {'saved': 0, 'failed': 0}
        }
    
    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\-\'\"]', '', text)
        return text.strip()
    
    def get_next_index(self, category):
        directory = self.productive_dir if category == 'productive' else self.unproductive_dir
        files = list(directory.glob("*.txt"))
        if not files:
            return 1
        numbers = [int(re.match(r'(\d+)\.txt', f.name).group(1)) 
                   for f in files if re.match(r'(\d+)\.txt', f.name)]
        return max(numbers) + 1 if numbers else 1
    
    def save_text(self, text, category):
        directory = self.productive_dir if category == 'productive' else self.unproductive_dir
        idx = self.get_next_index(category)
        filename = directory / f"{idx:05d}.txt"
        
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)
        
        self.stats[category]['saved'] += 1
        return filename
    
    def fetch_url(self, url):
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return response.text
        except:
            return None
    
    # =========================================================================
    # PRODUCTIVE SOURCES
    # =========================================================================
    
    def scrape_medium_tech(self, tag, max_articles=100):
        """Scrape Medium tech articles"""
        print(f"\n[Medium] Scraping tag: {tag}")
        articles = []
        
        url = f"https://medium.com/tag/{tag}"
        html = self.fetch_url(url)
        
        if not html:
            return articles
        
        soup = BeautifulSoup(html, 'html.parser')
        article_links = soup.find_all('a', href=True)
        
        for link in article_links[:max_articles]:
            href = link['href']
            if '/p/' in href or '@' in href:
                full_url = f"https://medium.com{href}" if href.startswith('/') else href
                articles.append(full_url)
        
        print(f"  Found {len(articles)} article URLs")
        return articles
    
    def scrape_dev_to(self, tag, max_articles=100):
        """Scrape Dev.to articles"""
        print(f"\n[Dev.to] Scraping tag: {tag}")
        articles = []
        
        url = f"https://dev.to/t/{tag}"
        html = self.fetch_url(url)
        
        if not html:
            return articles
        
        soup = BeautifulSoup(html, 'html.parser')
        article_divs = soup.find_all('div', class_='crayons-story')
        
        for div in article_divs[:max_articles]:
            link = div.find('a', class_='crayons-story__hidden-navigation-link')
            if link and link.get('href'):
                articles.append(f"https://dev.to{link['href']}")
        
        print(f"  Found {len(articles)} article URLs")
        return articles
    
    def scrape_hackernoon(self, topic, max_articles=50):
        """Scrape HackerNoon articles"""
        print(f"\n[HackerNoon] Scraping topic: {topic}")
        articles = []
        
        url = f"https://hackernoon.com/tagged/{topic}"
        html = self.fetch_url(url)
        
        if not html:
            return articles
        
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        
        for link in links[:max_articles]:
            href = link['href']
            if href.startswith('/') and len(href) > 10:
                articles.append(f"https://hackernoon.com{href}")
        
        print(f"  Found {len(articles)} article URLs")
        return articles
    
    # =========================================================================
    # UNPRODUCTIVE SOURCES
    # =========================================================================
    
    def scrape_buzzfeed(self, max_articles=100):
        """Scrape BuzzFeed articles"""
        print(f"\n[BuzzFeed] Scraping trending")
        articles = []
        
        url = "https://www.buzzfeed.com/trending"
        html = self.fetch_url(url)
        
        if not html:
            return articles
        
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        
        for link in links[:max_articles]:
            href = link['href']
            if 'buzzfeed.com' in href and len(href) > 30:
                articles.append(href)
        
        print(f"  Found {len(articles)} article URLs")
        return articles
    
    def scrape_9gag_text(self, section='hot', max_posts=50):
        """Scrape 9GAG post titles and descriptions"""
        print(f"\n[9GAG] Scraping {section}")
        texts = []
        
        url = f"https://9gag.com/{section}"
        html = self.fetch_url(url)
        
        if not html:
            return texts
        
        soup = BeautifulSoup(html, 'html.parser')
        posts = soup.find_all('article')
        
        for post in posts[:max_posts]:
            title_elem = post.find('h1')
            desc_elem = post.find('p')
            
            title = title_elem.get_text(strip=True) if title_elem else ""
            desc = desc_elem.get_text(strip=True) if desc_elem else ""
            
            combined = f"{title}. {desc}".strip()
            if len(combined.split()) >= self.min_words:
                texts.append(combined)
        
        print(f"  Found {len(texts)} text posts")
        return texts
    
    # =========================================================================
    # GENERIC SCRAPING
    # =========================================================================
    
    def extract_article_text(self, url):
        """Extract main article text from any URL"""
        html = self.fetch_url(url)
        if not html:
            return None
        
        soup = BeautifulSoup(html, 'html.parser')
        
        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button']):
            tag.decompose()
        
        # Try common article selectors
        selectors = ['article', 'main', '.post-content', '.entry-content', '.article-content', '.story-body']
        
        text = ""
        for selector in selectors:
            elements = soup.select(selector) if selector.startswith(('.', '#')) else soup.find_all(selector)
            if elements:
                text = ' '.join([elem.get_text(separator=' ', strip=True) for elem in elements])
                break
        
        if not text:
            body = soup.find('body')
            text = body.get_text(separator=' ', strip=True) if body else ""
        
        return self.clean_text(text)
    
    def scrape_urls(self, urls, category, target=None):
        """Scrape list of URLs and save"""
        print(f"\n{'='*70}")
        print(f"Processing {len(urls)} URLs for: {category.upper()}")
        if target:
            print(f"Target: {target} samples")
        print(f"{'='*70}\n")
        
        for i, item in enumerate(urls, 1):
            if target and self.stats[category]['saved'] >= target:
                print(f"\n✓ Target reached!")
                break
            
            # Check if it's a URL or direct text
            if isinstance(item, str) and item.startswith('http'):
                text = self.extract_article_text(item)
                if text and len(text.split()) >= self.min_words:
                    self.save_text(text, category)
                    print(f"[{i}/{len(urls)}] ✓ Saved: {self.stats[category]['saved']} | {item[:50]}...")
                else:
                    self.stats[category]['failed'] += 1
                    print(f"[{i}/{len(urls)}] ✗ Failed or too short")
            else:
                # Direct text
                if len(item.split()) >= self.min_words:
                    self.save_text(item, category)
                    print(f"[{i}/{len(urls)}] ✓ Saved: {self.stats[category]['saved']} | Direct text")
                else:
                    self.stats[category]['failed'] += 1
            
            time.sleep(random.uniform(*self.delay))
        
        print(f"\n{'='*70}")
        print(f"{category.upper()} Summary: {self.stats[category]['saved']} saved, {self.stats[category]['failed']} failed")
        print(f"{'='*70}\n")


# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    scraper = AlternativeTextScraper(
        base_dir=r"C:\my_notebook\eda\dataset",
        min_words=50,
        delay=(1, 2)
    )
    
    print("="*70)
    print("ALTERNATIVE TEXT SCRAPER (Better than YouTube!)")
    print("="*70)
    
    # Current counts
    current_prod = len(list(scraper.productive_dir.glob("*.txt")))
    current_unprod = len(list(scraper.unproductive_dir.glob("*.txt")))
    
    print(f"\nCurrent productive: {current_prod}")
    print(f"Current unproductive: {current_unprod}")
    
    # ==========================================================================
    # PRODUCTIVE CONTENT
    # ==========================================================================
    
    print("\n" + "="*70)
    print("PHASE 1: PRODUCTIVE CONTENT")
    print("="*70)
    
    productive_urls = []
    
    # Medium tech articles
    medium_tags = ['machine-learning', 'data-science', 'programming', 'artificial-intelligence', 
                   'python', 'javascript', 'web-development', 'software-engineering']
    for tag in medium_tags:
        urls = scraper.scrape_medium_tech(tag, max_articles=30)
        productive_urls.extend(urls)
        time.sleep(2)
    
    # Dev.to articles
    devto_tags = ['python', 'javascript', 'webdev', 'machinelearning', 'datascience', 
                  'programming', 'tutorial', 'beginners']
    for tag in devto_tags:
        urls = scraper.scrape_dev_to(tag, max_articles=30)
        productive_urls.extend(urls)
        time.sleep(2)
    
    # HackerNoon
    hackernoon_topics = ['programming', 'machine-learning', 'blockchain', 'cybersecurity']
    for topic in hackernoon_topics:
        urls = scraper.scrape_hackernoon(topic, max_articles=20)
        productive_urls.extend(urls)
        time.sleep(2)
    
    print(f"\nTotal productive URLs collected: {len(productive_urls)}")
    scraper.scrape_urls(productive_urls, 'productive', target=400)
    
    # ==========================================================================
    # UNPRODUCTIVE CONTENT
    # ==========================================================================
    
    print("\n" + "="*70)
    print("PHASE 2: UNPRODUCTIVE CONTENT")
    print("="*70)
    
    unproductive_sources = []
    
    # BuzzFeed
    buzzfeed_urls = scraper.scrape_buzzfeed(max_articles=100)
    unproductive_sources.extend(buzzfeed_urls)
    
    # 9GAG text posts
    gag_texts = scraper.scrape_9gag_text('hot', max_posts=50)
    unproductive_sources.extend(gag_texts)
    
    print(f"\nTotal unproductive items collected: {len(unproductive_sources)}")
    scraper.scrape_urls(unproductive_sources, 'unproductive', target=400)
    
    # ==========================================================================
    # FINAL SUMMARY
    # ==========================================================================
    
    final_prod = len(list(scraper.productive_dir.glob("*.txt")))
    final_unprod = len(list(scraper.unproductive_dir.glob("*.txt")))
    
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Productive: {final_prod} (added {final_prod - current_prod})")
    print(f"Unproductive: {final_unprod} (added {final_unprod - current_unprod})")
    print(f"Total: {final_prod + final_unprod}")
    print("="*70)
    print("\n✓ Dataset complete! No YouTube transcripts needed.")

ALTERNATIVE TEXT SCRAPER (Better than YouTube!)

Current productive: 1203
Current unproductive: 1270

PHASE 1: PRODUCTIVE CONTENT

[Medium] Scraping tag: machine-learning
  Found 5 article URLs

[Medium] Scraping tag: data-science
  Found 5 article URLs

[Medium] Scraping tag: programming
  Found 8 article URLs

[Medium] Scraping tag: artificial-intelligence
  Found 5 article URLs

[Medium] Scraping tag: python
  Found 2 article URLs

[Medium] Scraping tag: javascript
  Found 8 article URLs

[Medium] Scraping tag: web-development
  Found 5 article URLs

[Medium] Scraping tag: software-engineering
  Found 5 article URLs

[Dev.to] Scraping tag: python
  Found 25 article URLs

[Dev.to] Scraping tag: javascript
  Found 25 article URLs

[Dev.to] Scraping tag: webdev
  Found 25 article URLs

[Dev.to] Scraping tag: machinelearning
  Found 25 article URLs

[Dev.to] Scraping tag: datascience
  Found 25 article URLs

[Dev.to] Scraping tag: programming
  Found 25 article URLs

[Dev.to] Scraping t

In [21]:
import os
import re
import requests
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import time
import json

# =====================
# CONFIGURATION
# =====================
API_KEY = "AIzaSyAMCAkxznT5CGZTQkKVyT84I1yA_2VTlbc"
SEARCH_QUERY_PRODUCTIVE = ["tutorial", "educational", "python programming", "machine learning", 
                           "data science tutorial", "coding tutorial", "math lecture"]
SEARCH_QUERY_UNPRODUCTIVE = ["funny", "entertainment", "gaming", "memes", "funny videos", 
                             "comedy", "pranks", "vlog"]
VIDEOS_PER_CLASS = 500
PRODUCTIVE_PATH = r"C:\my_notebook\eda\dataset\productive"
UNPRODUCTIVE_PATH = r"C:\my_notebook\eda\dataset\unproductive"

os.makedirs(PRODUCTIVE_PATH, exist_ok=True)
os.makedirs(UNPRODUCTIVE_PATH, exist_ok=True)

# =====================
# HELPER FUNCTIONS
# =====================
def clean_text(text):
    """Clean text by removing excessive whitespace."""
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def get_next_index(folder):
    """Get next available sequential index in folder"""
    files = [f for f in os.listdir(folder) if f.endswith(".txt")]
    if not files: 
        return 1
    indices = [int(f.split(".")[0]) for f in files if f.split(".")[0].isdigit()]
    return max(indices) + 1 if indices else 1

def save_text(text, folder, idx):
    """Save text to folder with sequential numbering"""
    filename = os.path.join(folder, f"{idx:05d}.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)

# =====================
# YOUTUBE API FUNCTIONS
# =====================
def search_youtube(query_list, max_results):
    """Search YouTube using API and return video IDs"""
    youtube = build("youtube", "v3", developerKey=API_KEY)
    video_ids = []
    
    for q in query_list:
        try:
            request = youtube.search().list(
                q=q,
                part="id",
                type="video",
                maxResults=50,
                relevanceLanguage="en"
            )
            response = request.execute()
            
            for item in response.get("items", []):
                vid_id = item["id"]["videoId"]
                if vid_id not in video_ids:
                    video_ids.append(vid_id)
                    
                if len(video_ids) >= max_results:
                    return video_ids
                    
            time.sleep(1)
            
        except Exception as e:
            print(f"Search error for '{q}': {e}")
            continue
    
    return video_ids

def get_video_metadata_api(video_id):
    """Get video title and description using YouTube API"""
    try:
        youtube = build("youtube", "v3", developerKey=API_KEY)
        request = youtube.videos().list(
            part="snippet",
            id=video_id
        )
        response = request.execute()
        
        if not response.get('items'):
            return None, None
        
        snippet = response['items'][0]['snippet']
        title = snippet.get('title', '')
        description = snippet.get('description', '')
        
        return title, description
    except:
        return None, None

def get_video_comments_api(video_id, max_comments=20):
    """Get video comments using YouTube API"""
    try:
        youtube = build("youtube", "v3", developerKey=API_KEY)
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_comments,
            textFormat="plainText",
            order="relevance"
        )
        response = request.execute()
        
        comments = []
        for item in response.get('items', []):
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)
        
        return comments
    except:
        return []

# =====================
# SCRAPING WITH HTML (BACKUP METHOD)
# =====================
def get_video_metadata_html(video_id):
    """Scrape video title and description from HTML (backup method)"""
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code != 200:
            return None, None
        
        html = response.text
        
        # Extract title
        title_match = re.search(r'"title":"(.*?)"', html)
        title = title_match.group(1) if title_match else ""
        
        # Extract description
        desc_match = re.search(r'"shortDescription":"(.*?)"', html)
        description = desc_match.group(1) if desc_match else ""
        
        # Unescape unicode
        title = title.encode().decode('unicode_escape')
        description = description.encode().decode('unicode_escape')
        
        return title, description
    except:
        return None, None

# =====================
# COMBINED SCRAPER
# =====================
def scrape_video_content(video_id, use_api=True):
    """
    Get video title, description, and comments.
    Combines all text into one document.
    """
    try:
        if use_api:
            # Try API first (more reliable)
            title, description = get_video_metadata_api(video_id)
            comments = get_video_comments_api(video_id, max_comments=20)
        else:
            # Fallback to HTML scraping
            title, description = get_video_metadata_html(video_id)
            comments = []
        
        if not title and not description:
            return None
        
        # Combine all text
        full_text = f"{title}. {description}"
        
        if comments:
            comments_text = " ".join(comments[:20])  # Top 20 comments
            full_text += f". Comments: {comments_text}"
        
        full_text = clean_text(full_text)
        
        return full_text
        
    except Exception as e:
        return None

def fetch_youtube_metadata(video_ids, folder, use_api=True):
    """Fetch metadata from YouTube videos and save them"""
    idx = get_next_index(folder)
    saved_count = 0
    skipped_count = 0
    
    for i, vid in enumerate(video_ids, 1):
        try:
            text = scrape_video_content(vid, use_api=use_api)
            
            if text is None:
                print(f"[{i}/{len(video_ids)}] ✗ Failed {vid}: No metadata available")
                skipped_count += 1
                continue
            
            word_count = len(text.split())
            
            if word_count >= 50:
                save_text(text, folder, idx)
                print(f"[{i}/{len(video_ids)}] ✓ Saved: {idx:05d}.txt ({word_count} words) | Total: {saved_count + 1}")
                idx += 1
                saved_count += 1
            else:
                print(f"[{i}/{len(video_ids)}] ✗ Skipped {vid}: too short ({word_count} words)")
                skipped_count += 1
        
        except Exception as e:
            error_msg = str(e)[:60]
            print(f"[{i}/{len(video_ids)}] ✗ Failed {vid}: {error_msg}")
            skipped_count += 1
        
        # Delay to respect rate limits
        time.sleep(1)
    
    print(f"\n{'='*60}")
    print(f"Summary for {folder}:")
    print(f"  Saved: {saved_count}")
    print(f"  Skipped: {skipped_count}")
    print(f"{'='*60}\n")
    
    return saved_count

# =====================
# MAIN SCRIPT
# =====================
if __name__ == "__main__":
    print("="*70)
    print("YOUTUBE METADATA SCRAPER")
    print("(Title + Description + Comments)")
    print("="*70)
    
    # Get current counts
    current_prod = len([f for f in os.listdir(PRODUCTIVE_PATH) if f.endswith('.txt')])
    current_unprod = len([f for f in os.listdir(UNPRODUCTIVE_PATH) if f.endswith('.txt')])
    
    print(f"\nCurrent productive samples: {current_prod}")
    print(f"Current unproductive samples: {current_unprod}")
    print(f"Target per class: {VIDEOS_PER_CLASS} videos\n")
    
    # Choose method
    print("Method: Using YouTube Data API (more reliable)")
    print("Includes: Title + Description + Top 20 Comments\n")
    
    # ========================================================================
    # PRODUCTIVE VIDEOS
    # ========================================================================
    
    print("="*70)
    print("PHASE 1: PRODUCTIVE VIDEOS")
    print("="*70)
    prod_video_ids = search_youtube(SEARCH_QUERY_PRODUCTIVE, VIDEOS_PER_CLASS)
    print(f"Found {len(prod_video_ids)} productive video IDs\n")
    
    prod_saved = fetch_youtube_metadata(prod_video_ids, PRODUCTIVE_PATH, use_api=True)
    
    # ========================================================================
    # UNPRODUCTIVE VIDEOS
    # ========================================================================
    
    print("="*70)
    print("PHASE 2: UNPRODUCTIVE VIDEOS")
    print("="*70)
    unprod_video_ids = search_youtube(SEARCH_QUERY_UNPRODUCTIVE, VIDEOS_PER_CLASS)
    print(f"Found {len(unprod_video_ids)} unproductive video IDs\n")
    
    unprod_saved = fetch_youtube_metadata(unprod_video_ids, UNPRODUCTIVE_PATH, use_api=True)
    
    # ========================================================================
    # FINAL SUMMARY
    # ========================================================================
    
    final_prod = len([f for f in os.listdir(PRODUCTIVE_PATH) if f.endswith('.txt')])
    final_unprod = len([f for f in os.listdir(UNPRODUCTIVE_PATH) if f.endswith('.txt')])
    
    print("\n" + "="*70)
    print("FINAL DATASET SUMMARY")
    print("="*70)
    print(f"Productive samples: {final_prod} (added {prod_saved})")
    print(f"Unproductive samples: {final_unprod} (added {unprod_saved})")
    print(f"Total samples: {final_prod + final_unprod}")
    print("="*70)
    print("\n✓ Done! Dataset updated with YouTube metadata.")
    print("Each file contains: Title + Description + Top Comments")

YOUTUBE METADATA SCRAPER
(Title + Description + Comments)

Current productive samples: 1248
Current unproductive samples: 1310
Target per class: 500 videos

Method: Using YouTube Data API (more reliable)
Includes: Title + Description + Top 20 Comments

PHASE 1: PRODUCTIVE VIDEOS
Found 306 productive video IDs

[1/306] ✓ Saved: 01249.txt (56 words) | Total: 1
[2/306] ✓ Saved: 01250.txt (334 words) | Total: 2
[3/306] ✓ Saved: 01251.txt (231 words) | Total: 3
[4/306] ✓ Saved: 01252.txt (410 words) | Total: 4
[5/306] ✓ Saved: 01253.txt (272 words) | Total: 5
[6/306] ✓ Saved: 01254.txt (210 words) | Total: 6
[7/306] ✓ Saved: 01255.txt (215 words) | Total: 7
[8/306] ✓ Saved: 01256.txt (316 words) | Total: 8
[9/306] ✓ Saved: 01257.txt (260 words) | Total: 9
[10/306] ✓ Saved: 01258.txt (139 words) | Total: 10
[11/306] ✓ Saved: 01259.txt (182 words) | Total: 11
[12/306] ✓ Saved: 01260.txt (377 words) | Total: 12
[13/306] ✓ Saved: 01261.txt (856 words) | Total: 13
[14/306] ✓ Saved: 01262.txt (20