In [None]:
#!/usr/bin/env python3
"""
Download FineWeb dataset directly to your PC
Works on Windows, Mac, Linux
"""

import os
import sys
from pathlib import Path
import subprocess


def install_packages(packages=None):
    """Install any required third-party packages that cannot be imported.

    Each package is probed with an import; when the import fails it is
    installed with ``pip`` using the interpreter that is running this script.

    Args:
        packages: Optional iterable of pip package names to check. When
            omitted, the packages this script imports are used:
            ``datasets``, ``huggingface_hub``, ``pandas`` and ``tqdm``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    if packages is None:
        packages = ["datasets", "huggingface_hub", "pandas", "tqdm"]

    installed_any = False
    for package in packages:
        try:
            # The import name is derived from the pip name by swapping "-"
            # for "_"; this is a heuristic and holds for the default list.
            importlib.import_module(package.replace("-", "_"))
        except ImportError:
            print(f"üì¶ Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed_any = True

    if installed_any:
        # Make sure finders notice modules that appeared on disk after the
        # interpreter started, so the imports that follow can succeed.
        importlib.invalidate_caches()


# Install requirements at import time, before the third-party imports below
# are executed.
install_packages()

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from huggingface_hub import snapshot_download
import json


class FineWebPCDownloader:
    """Download FineWeb samples or whole configurations into a local folder.

    The target folder is chosen (and created) when the object is constructed;
    see :meth:`get_download_path`.
    """

    # Hugging Face dataset repository all downloads come from.
    REPO_ID = "HuggingFaceFW/fineweb"
    # Number of streamed examples written to each intermediate batch file.
    BATCH_SIZE = 1000

    def __init__(self):
        self.base_path = self.get_download_path()

    def get_download_path(self):
        """Return the OS-appropriate download directory, creating it if needed.

        Windows uses ``~/Downloads/FineWeb``; every other OS uses ``~/FineWeb``.
        """
        home = Path.home()

        if os.name == "nt":  # Windows
            download_path = home / "Downloads" / "FineWeb"
        else:  # Mac/Linux
            download_path = home / "FineWeb"

        download_path.mkdir(parents=True, exist_ok=True)
        print(f"üìÅ Download location: {download_path}")
        return download_path

    def get_available_configs(self):
        """Return a mapping of config name -> human-readable description."""
        configs = {
            "sample-10BT": "Sample subset (10B tokens) - ~4GB",
            "sample-100BT": "Larger sample (100B tokens) - ~40GB",
            "CC-MAIN-2024-10": "Single crawl (2024-10) - ~100GB",
            "CC-MAIN-2023-50": "Single crawl (2023-50) - ~100GB",
        }
        return configs

    @staticmethod
    def _config_to_repo_pattern(config):
        """Map a config name to the glob of repository files holding its data.

        NOTE(review): the layout (``sample/<size>/`` for ``sample-<size>``
        configs, ``data/<dump>/`` for crawl dumps) follows the FineWeb dataset
        card; confirm against the repository if the layout changes.
        """
        if config == "default":
            return "data/*/*"
        prefix = "sample-"
        if config.startswith(prefix):
            return f"sample/{config[len(prefix):]}/*"
        return f"data/{config}/*"

    def download_streaming_sample(self, config="sample-10BT", num_examples=10000):
        """Stream the first ``num_examples`` records of ``config`` to disk.

        Every ``BATCH_SIZE`` records an intermediate ``fineweb_batch_NNN.json``
        is written; at the end the whole sample is written as one JSON file and
        one CSV file inside ``self.base_path``.

        Args:
            config: FineWeb configuration name passed to ``load_dataset``.
            num_examples: Number of records to collect.

        Returns:
            ``(json_path, csv_path)`` on success, or ``(None, None)`` when the
            download fails or yields no records.
        """
        from itertools import islice

        print("üåä Downloading FineWeb sample using streaming...")
        print(f"üìä Config: {config}")
        print(f"üìù Examples: {num_examples:,}")

        try:
            # Load dataset in streaming mode
            dataset = load_dataset(
                self.REPO_ID, name=config, split="train", streaming=True
            )

            samples = []
            print("üì• Collecting samples...")

            # islice stops after exactly num_examples records, so no extra
            # record is fetched just to discover the limit was reached.
            stream = islice(dataset, max(num_examples, 0))
            for count, example in enumerate(
                tqdm(stream, total=num_examples, desc="Downloading"), 1
            ):
                samples.append(
                    {
                        "url": example.get("url", ""),
                        "text": example.get("text", ""),
                        "id": example.get("id", ""),
                        "dump": example.get("dump", ""),
                        "language": example.get("language", ""),
                        "language_score": example.get("language_score", 0.0),
                    }
                )

                # Checkpoint the most recent batch so partial progress
                # survives an interrupted download.
                if count % self.BATCH_SIZE == 0:
                    batch_number = count // self.BATCH_SIZE
                    batch_file = (
                        self.base_path / f"fineweb_batch_{batch_number:03d}.json"
                    )
                    with open(batch_file, "w", encoding="utf-8") as f:
                        json.dump(
                            samples[-self.BATCH_SIZE:],
                            f,
                            ensure_ascii=False,
                            indent=1,
                        )
                    print(f"üíæ Saved batch {batch_number}")

            if not samples:
                # Always hand back a pair so callers can unpack the result.
                print("No examples were retrieved; nothing was saved.")
                return None, None

            # Save the complete sample
            final_file = self.base_path / f"fineweb_sample_{num_examples}.json"
            with open(final_file, "w", encoding="utf-8") as f:
                json.dump(samples, f, ensure_ascii=False, indent=1)

            # Also save as CSV for easy use
            csv_file = self.base_path / f"fineweb_sample_{num_examples}.csv"
            df = pd.DataFrame(samples)
            df.to_csv(csv_file, index=False, encoding="utf-8")

            print("‚úÖ Download complete!")
            print(f"üìÅ JSON file: {final_file}")
            print(f"üìä CSV file: {csv_file}")
            print(f"üìà Total examples: {len(samples):,}")

            return final_file, csv_file

        except Exception as e:
            print(f"‚ùå Streaming download failed: {e}")
            return None, None

    def download_full_dataset(self, config="sample-10BT", max_split_size="1GB"):
        """Download all files of one configuration to ``base_path/full_dataset``.

        Args:
            config: FineWeb configuration whose files should be fetched. Only
                files matching that configuration are downloaded rather than
                the entire repository.
            max_split_size: Unused; kept so existing callers keep working.

        Returns:
            The directory the files were downloaded to, or ``None`` on failure.
        """
        print("üì¶ Downloading full FineWeb dataset...")
        print("‚ö†Ô∏è  This will download large files to disk!")

        try:
            dataset_path = self.base_path / "full_dataset"
            dataset_path.mkdir(exist_ok=True)

            # Download using HuggingFace snapshot
            print("üîÑ Using HuggingFace snapshot download...")

            snapshot_download(
                repo_id=self.REPO_ID,
                repo_type="dataset",
                local_dir=str(dataset_path),
                # Restrict the snapshot to the requested configuration.
                allow_patterns=self._config_to_repo_pattern(config),
                max_workers=2,  # Conservative for stability
                resume_download=True,
            )

            print(f"‚úÖ Full dataset downloaded to: {dataset_path}")
            return dataset_path

        except Exception as e:
            print(f"‚ùå Full download failed: {e}")
            return None

    def quick_download(self, size="small"):
        """Run a streaming download using one of the predefined size presets.

        Args:
            size: One of ``"small"``, ``"medium"``, ``"large"``,
                ``"extra_large"``.

        Returns:
            The result of :meth:`download_streaming_sample`, or
            ``(None, None)`` when ``size`` is not a known preset.
        """
        sizes = {
            "small": (1000, "sample-10BT"),
            "medium": (5000, "sample-10BT"),
            "large": (20000, "sample-10BT"),
            "extra_large": (50000, "sample-100BT"),
        }

        if size not in sizes:
            print(f"‚ùå Invalid size. Choose from: {list(sizes.keys())}")
            return None, None

        num_examples, config = sizes[size]

        print(f"üöÄ Quick download: {size}")
        print(f"üìä {num_examples:,} examples from {config}")

        return self.download_streaming_sample(config, num_examples)


def interactive_download():
    """Show the download menu, read the user's choice and run the download.

    Options A-D map to the downloader's size presets; option E asks for a
    configuration number and an example count. Returns whatever the selected
    downloader method returns, or ``(None, None)`` for an unknown option.
    """
    print("üåê FineWeb PC Downloader")
    print("=" * 40)

    downloader = FineWebPCDownloader()

    # List the known configurations, numbered from 1.
    print("üìã Available configurations:")
    available = downloader.get_available_configs()
    config_names = list(available)
    for position, name in enumerate(config_names, 1):
        print(f"  {position}. {name}: {available[name]}")

    # Menu of presets plus the custom entry.
    menu_lines = (
        "\nüöÄ Quick download options:",
        "  A. Small (1K examples, ~10MB)",
        "  B. Medium (5K examples, ~50MB)",
        "  C. Large (20K examples, ~200MB)",
        "  D. Extra Large (50K examples, ~500MB)",
        "  E. Custom configuration",
    )
    for line in menu_lines:
        print(line)

    answer = input("\nSelect option (A/B/C/D/E): ").strip().upper()

    presets = {"A": "small", "B": "medium", "C": "large", "D": "extra_large"}
    if answer in presets:
        return downloader.quick_download(presets[answer])

    if answer != "E":
        print("‚ùå Invalid choice")
        return None, None

    print("\nüîß Custom configuration:")

    # Keep asking until a number inside the listed range is entered.
    while True:
        try:
            index = int(input(f"Choose config (1-{len(config_names)}): ")) - 1
        except ValueError:
            print("Please enter a number")
            continue
        if not 0 <= index < len(config_names):
            print("Invalid choice")
            continue
        selected_config = config_names[index]
        break

    # Keep asking until a positive count is entered; blank input means 10000.
    while True:
        try:
            num_examples = int(
                input("Number of examples to download [10000]: ") or "10000"
            )
        except ValueError:
            print("Please enter a valid number")
            continue
        if num_examples <= 0:
            print("Please enter a positive number")
            continue
        break

    return downloader.download_streaming_sample(selected_config, num_examples)


def main():
    """Run the interactive download and report the resulting files.

    Prints the saved file paths and their sizes on success, or a failure
    message otherwise. Ctrl-C and unexpected errors are reported rather than
    propagated.
    """
    try:
        print("üéØ Starting FineWeb download to your PC...")

        # Tolerate a callee that returns None instead of a pair.
        json_file, csv_file = interactive_download() or (None, None)

        if not (json_file and csv_file):
            print("‚ùå Download failed")
            return

        print("\nüéâ Download completed successfully!")
        print("üìÅ Files saved to your computer:")
        print(f"   üìÑ {json_file}")
        print(f"   üìä {csv_file}")

        # Reporting sizes is best-effort: only filesystem errors are
        # tolerated here, so Ctrl-C still reaches the handler below.
        try:
            json_size = os.path.getsize(json_file) / (1024 * 1024)
            csv_size = os.path.getsize(csv_file) / (1024 * 1024)
        except OSError as size_error:
            print(f"   (could not read file sizes: {size_error})")
        else:
            print("\nüìä File sizes:")
            print(f"   JSON: {json_size:.1f} MB")
            print(f"   CSV: {csv_size:.1f} MB")

        print("\n‚úÖ Ready to use for your NetworKit simulation!")
        print("üí° You can now process this data to create link networks.")

    except KeyboardInterrupt:
        print("\n‚ùå Download cancelled by user")
    except Exception as e:
        print(f"‚ùå Error: {e}")


# Start the interactive downloader only when this code is run as a script.
if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
Create 500K PAGE-LEVEL Network (URL-to-URL links)
This creates realistic hyperlinks between actual web pages
"""

import pandas as pd
import numpy as np
import random
import re
from urllib.parse import urlparse, urljoin
from tqdm import tqdm
import os
from collections import defaultdict


class PageLevelNetwork:
    """Build a 500K-edge page-to-page link list from a FineWeb CSV sample.

    Edges are produced by four generators: random pairs of pages on the same
    domain, URLs found in each page's text, pages sharing a first path
    segment, and uniformly random page pairs. The global ``random`` and
    ``numpy`` RNGs are seeded in the constructor.
    """

    # Compiled once; the look-behind on the "www." pattern prevents it from
    # re-matching the host part of a URL the first pattern already captured
    # (which would add a duplicate "http://www..." edge for "https://www...").
    _URL_PATTERNS = (
        re.compile(r'https?://[^\s<>"\'`|\[\](){}]+[a-zA-Z0-9/]', re.IGNORECASE),
        re.compile(
            r'(?<![\w./@-])www\.[^\s<>"\'`|\[\](){}]+\.[a-zA-Z]{2,}[^\s<>"\'`|\[\](){}]*',
            re.IGNORECASE,
        ),
    )

    def __init__(self, data_path="/root/FineWeb"):
        self.data_path = data_path
        random.seed(42)
        np.random.seed(42)

    @staticmethod
    def _domain_of(url):
        """Return the lower-cased network location of ``url``, or "" if it cannot be parsed."""
        try:
            return urlparse(url).netloc.lower()
        except ValueError:
            return ""

    def load_fineweb_pages(self, filename="fineweb_sample_200000.csv"):
        """Load the ``url`` and ``text`` columns of a FineWeb sample CSV.

        Args:
            filename: CSV file name inside ``self.data_path``.

        Returns:
            A DataFrame with rows lacking a url/text removed and only
            http(s) URLs longer than 10 characters kept, or ``None`` when the
            file cannot be loaded.
        """
        file_path = os.path.join(self.data_path, filename)

        print(f"üìÇ Loading 200K web pages for page-level analysis...")

        try:
            # Load with text content for URL extraction
            df = pd.read_csv(file_path, usecols=["url", "text"])
            df = df.dropna()

            # Clean URLs
            df = df[df["url"].str.len() > 10]  # Remove very short URLs
            # Keep only URLs that actually start with an http(s) scheme.
            df = df[df["url"].str.match(r"https?://", case=False)]

            print(f"‚úÖ Loaded {len(df):,} web pages")
            print(
                f"üìä Average text length: {df['text'].str.len().mean():.0f} characters"
            )

            return df

        except Exception as e:
            print(f"‚ùå Error loading data: {e}")
            return None

    def extract_urls_from_text(self, text):
        """Return the distinct http(s) URLs mentioned in ``text``.

        Bare ``www.`` hosts get an ``http://`` prefix. The result keeps
        first-seen order so that repeated runs produce the same list.
        """
        if not text or pd.isna(text):
            return []

        text_str = str(text)
        found = {}  # dict used as an insertion-ordered set

        for pattern in self._URL_PATTERNS:
            for url in pattern.findall(text_str):
                # Strip trailing punctuation that belongs to the sentence.
                url = url.rstrip(".,;:!?)]}\"'`")

                # Add protocol if missing (matching is case-insensitive, so
                # the prefix test must be too).
                if url.lower().startswith("www."):
                    url = "http://" + url

                try:
                    parsed = urlparse(url)
                except ValueError:
                    continue
                if parsed.netloc and parsed.scheme in ("http", "https"):
                    found[url] = None

        return list(found)

    def create_intra_domain_links(self, df, target_links=150000):
        """Create random links between pages that share a domain.

        ``df`` is not modified. Pages whose URL has no parseable domain are
        ignored.

        Returns:
            A list of ``(source_url, target_url)`` tuples, at most
            ``target_links`` long.
        """
        print(f"üè† Creating intra-domain links (target: {target_links:,})...")

        # Group pages by domain
        domain_groups = defaultdict(list)
        for url in df["url"]:
            domain = self._domain_of(url)
            if domain:
                domain_groups[domain].append(url)

        links = []

        # Iterate domains in sorted order so results depend only on the seed.
        for _, urls in tqdm(
            sorted(domain_groups.items()), desc="Intra-domain linking"
        ):
            if len(urls) < 2:
                continue

            # Two random links per page on the domain, capped at 20 per domain.
            num_links = min(20, len(urls) * 2)

            for _ in range(num_links):
                source, target = random.sample(urls, 2)
                links.append((source, target))

                if len(links) >= target_links:
                    break

            if len(links) >= target_links:
                break

        print(f"‚úÖ Created {len(links):,} intra-domain links")
        return links

    def _iter_text_links(self, df, batch_size=5000):
        """Yield ``(page_url, mentioned_url)`` for every URL found in page text."""
        urls = df["url"].tolist()
        texts = df["text"].tolist()

        # The progress bar advances once per batch of pages.
        for start in tqdm(range(0, len(urls), batch_size), desc="Processing batches"):
            stop = start + batch_size
            for source_url, page_text in zip(urls[start:stop], texts[start:stop]):
                for target_url in self.extract_urls_from_text(page_text):
                    if target_url != source_url:
                        yield source_url, target_url

    def create_extracted_url_links(self, df, target_links=200000):
        """Create links from each page to the URLs mentioned in its text.

        Returns:
            A list of ``(source_url, target_url)`` tuples; collection stops
            once ``target_links`` links have been gathered.
        """
        print(
            f"üîó Creating links from URLs found in text (target: {target_links:,})..."
        )

        links = []
        for link in self._iter_text_links(df):
            links.append(link)
            if len(links) >= target_links:
                break

        print(f"‚úÖ Created {len(links):,} text-extracted links")
        return links

    def create_similar_page_links(self, df, target_links=100000):
        """Create links between pages whose URL paths share a first segment.

        The grouping key is built from the path only, so pages on different
        domains can fall into the same group.

        Returns:
            A list of ``(source_url, target_url)`` tuples, at most
            ``target_links`` long.
        """
        print(f"üîç Creating similar page links (target: {target_links:,})...")

        links = []
        urls = df["url"].tolist()

        # Group by URL patterns
        path_groups = defaultdict(list)

        for url in urls:
            try:
                parsed = urlparse(url)
            except ValueError:
                continue

            path_parts = parsed.path.split("/")

            # Group by common path patterns
            if len(path_parts) >= 2:
                path_key = "/".join(path_parts[:2])  # First directory level
                path_groups[path_key].append(url)

        # Create links within path groups
        for group_urls in tqdm(path_groups.values(), desc="Similar path linking"):
            if len(group_urls) < 2:
                continue

            # Each URL links to up to three others with a similar path.
            for url in group_urls:
                candidates = [u for u in group_urls if u != url]
                num_links = min(3, len(candidates))

                if num_links > 0:
                    for target in random.sample(candidates, num_links):
                        links.append((url, target))

                        if len(links) >= target_links:
                            break

                if len(links) >= target_links:
                    break

            if len(links) >= target_links:
                break

        print(f"‚úÖ Created {len(links):,} similar page links")
        return links

    def create_random_page_links(self, df, target_links=50000):
        """Create ``target_links`` links between randomly chosen page pairs.

        Returns:
            A list of ``(source_url, target_url)`` tuples; empty when ``df``
            holds fewer than two pages.
        """
        print(f"üé≤ Creating random page links (target: {target_links:,})...")

        links = []
        urls = df["url"].tolist()

        # random.sample needs at least two items to draw a pair.
        if len(urls) < 2:
            print("Need at least two pages to create random links")
            return links

        for _ in tqdm(range(target_links), desc="Random linking"):
            source, target = random.sample(urls, 2)
            links.append((source, target))

        print(f"‚úÖ Created {len(links):,} random page links")
        return links

    def create_500k_page_network(self):
        """Create the combined page-level link list.

        Returns:
            A list of at most 500,000 distinct ``(source_url, target_url)``
            tuples, or ``None`` when the page data cannot be loaded. The list
            is shorter than 500,000 only when the pages cannot supply enough
            distinct pairs.
        """
        print("üöÄ Creating 500K PAGE-LEVEL network (URL-to-URL)")
        print("=" * 60)

        # Load page data
        df = self.load_fineweb_pages()
        if df is None:
            return None

        print(f"üìä Working with {len(df):,} unique web pages")

        # Create different types of page-level links
        all_links = []

        # 1. Intra-domain links (30% - pages within same website)
        all_links.extend(self.create_intra_domain_links(df, 150000))

        # 2. Text-extracted links (40% - URLs found in page content)
        all_links.extend(self.create_extracted_url_links(df, 200000))

        # 3. Similar page links (20% - pages with similar URLs)
        all_links.extend(self.create_similar_page_links(df, 100000))

        # 4. Random links (10% - diverse connections)
        all_links.extend(self.create_random_page_links(df, 50000))

        # Process and finalize
        print(f"üîÑ Processing {len(all_links):,} total links...")

        # Remove duplicates while keeping first-seen order; a plain set would
        # make the later sampling depend on string hash randomisation.
        unique_links = list(dict.fromkeys(all_links))
        print(f"üìä Unique links after deduplication: {len(unique_links):,}")

        # Sample exactly 500K
        if len(unique_links) >= 500000:
            final_links = random.sample(unique_links, 500000)
        else:
            # Add more random links if needed
            final_links = unique_links
            needed = 500000 - len(final_links)
            print(f"‚ûï Adding {needed:,} more random links...")

            urls = df["url"].tolist()
            existing_set = set(final_links)

            # Bound the number of draws so the loop ends even when the pages
            # cannot provide enough distinct pairs.
            attempts_left = needed * 50 if len(urls) >= 2 else 0

            while len(final_links) < 500000 and attempts_left > 0:
                attempts_left -= 1
                source, target = random.sample(urls, 2)
                if (source, target) not in existing_set:
                    final_links.append((source, target))
                    existing_set.add((source, target))

            if len(final_links) < 500000:
                print(
                    f"Only {len(final_links):,} distinct links could be created "
                    "from the available pages"
                )

        return final_links[:500000]

    def save_page_network(self, links):
        """Write ``links`` to ``fineweb_500k_pages.csv`` and print a summary.

        Args:
            links: Iterable of ``(source_url, target_url)`` pairs.

        Returns:
            The path of the CSV file written inside ``self.data_path``.
        """
        output_path = os.path.join(self.data_path, "fineweb_500k_pages.csv")

        print(f"üíæ Saving {len(links):,} page-level links...")

        # Create DataFrame
        df = pd.DataFrame(links, columns=["FROM", "TO"])
        df.to_csv(output_path, index=False)

        # Calculate statistics
        unique_pages = pd.concat([df["FROM"], df["TO"]]).nunique()

        print(f"‚úÖ PAGE-LEVEL network saved!")
        print(f"üìÅ File: {output_path}")
        print(f"üîó Links: {len(df):,}")
        print(f"üìÑ Unique pages: {unique_pages:,}")

        # Show the first five links, truncating long URLs to 50 characters.
        print(f"\nüëÄ Sample page-to-page links:")
        for i, (_, row) in enumerate(df.head(5).iterrows()):
            from_page = (
                row["FROM"][:50] + "..." if len(row["FROM"]) > 50 else row["FROM"]
            )
            to_page = row["TO"][:50] + "..." if len(row["TO"]) > 50 else row["TO"]
            print(f"   {i + 1}. {from_page}")
            print(f"      ‚Üí {to_page}")

        print(f"\nüéØ This is a REAL page-level web network!")
        print(f"üí° Each link represents an actual hyperlink between web pages")
        print(f"üöÄ Perfect for PageRank analysis at the page level!")

        return output_path


def main():
    """Build the page-level network and save it to disk.

    Returns the path reported by ``save_page_network``, or ``None`` when the
    network could not be built with at least 500,000 links.
    """
    print("üìÑ FineWeb 500K PAGE-LEVEL Network Creator")
    print("=" * 50)

    builder = PageLevelNetwork()
    edges = builder.create_500k_page_network()

    # Bail out early when the network is missing or too small.
    if not edges or len(edges) < 500000:
        print("‚ùå Failed to create page-level network")
        return None

    saved_to = builder.save_page_network(edges)

    print(f"\nüéâ SUCCESS! 500K PAGE-LEVEL network created!")
    print(f"üìÅ File: {saved_to}")
    print(f"üîó 500,000 hyperlinks between real web pages")
    print(f"üéØ Ready for your NetworKit PageRank simulation!")

    return saved_to


# Build the network only when this code is run as a script.
if __name__ == "__main__":
    main()

In [None]:
# Download your 500K page-level network to local PC
from google.colab import files
import os
import pandas as pd

# CSV holding the page-level network that this cell hands to the browser.
network_file = "/root/FineWeb/fineweb_500k_pages.csv"

print("üì• Downloading 500K page-level network to your PC...")
print("=" * 50)

if os.path.exists(network_file):
    # Check file details
    file_size_mb = os.path.getsize(network_file) / (1024 * 1024)
    print(f"üìÅ File: fineweb_500k_pages.csv")
    print(f"üìä Size: {file_size_mb:.1f} MB")

    # Read the figures from the file itself so the summary always matches
    # the network that is actually being downloaded.
    try:
        edges = pd.read_csv(network_file)
        link_count = len(edges)
        page_count = pd.concat([edges["FROM"], edges["TO"]]).nunique()
    except Exception as e:
        edges = None
        print(f"   Could not read network file: {e}")

    if edges is not None:
        pages_label = f"{page_count:,}"

        # Quick analysis
        print("\nüìà Network Summary:")
        print(f"   üîó {link_count:,} hyperlinks between real web pages")
        print(f"   üìÑ {page_count:,} unique web pages")
        print(f"   üåê Real page-to-page topology")
        print(f"   üéØ Perfect for PageRank simulation")

        # Show the first five rows, truncating long URLs to 60 characters.
        print(f"\nüìã File format preview:")
        try:
            print("   FROM,TO")
            for _, row in edges.head(5).iterrows():
                from_url = (
                    row["FROM"][:60] + "..." if len(row["FROM"]) > 60 else row["FROM"]
                )
                to_url = row["TO"][:60] + "..." if len(row["TO"]) > 60 else row["TO"]
                print(f"   {from_url}")
                print(f"   {to_url}")
                print("   ---")
        except Exception as e:
            print(f"   Could not preview: {e}")
    else:
        pages_label = "your"

    print(f"\nüöÄ Starting download...")

    try:
        # Download the file
        files.download(network_file)

        print("‚úÖ Download initiated!")
        print("üì• Check your browser's Downloads folder")
        print("üíæ File: fineweb_500k_pages.csv")

        print(f"\nüéØ Next steps:")
        print(f"   1. File will appear in your Downloads folder")
        print(f"   2. Use this CSV in your NetworKit simulation")
        print(f"   3. Replace synthetic network with real page data")
        print(f"   4. Run PageRank on {pages_label} real web pages!")

        print(f"\nüí° Integration tip:")
        print(f"   Load this CSV instead of synthetic Barab√°si-Albert network")
        print(f"   Each row represents a real hyperlink between web pages")

    except Exception as e:
        print(f"‚ùå Download failed: {e}")
        print("üí° Trying Google Drive backup method...")

        # Backup to Google Drive
        try:
            from google.colab import drive

            drive.mount("/content/drive", force_remount=True)

            import shutil

            drive_file = "/content/drive/MyDrive/fineweb_500k_pages.csv"
            shutil.copy2(network_file, drive_file)

            print("‚úÖ Backed up to Google Drive!")
            print("üåê Access at: https://drive.google.com")
            print("üíæ Download 'fineweb_500k_pages.csv' from your Drive")

        except Exception as drive_error:
            print(f"‚ùå Drive backup also failed: {drive_error}")

    # Announce readiness only when the network file exists and was readable.
    if edges is not None:
        print(f"\nüéâ Page-level network ready!")
        print(f"üîó {link_count:,} real hyperlinks between web pages")
        print(f"üìä Much more realistic than synthetic networks")
        print(f"üöÄ Perfect for your NetworKit PageRank analysis!")

else:
    print(f"‚ùå File not found: {network_file}")

    # List the CSV files in a few likely directories to help locate the output.
    print("\nüîç Available files:")
    for directory in ["/root/FineWeb", "/content", "/root"]:
        if os.path.exists(directory):
            print(f"\nüìÅ {directory}:")
            for f in os.listdir(directory):
                if f.endswith(".csv"):
                    file_path = os.path.join(directory, f)
                    size_mb = os.path.getsize(file_path) / (1024 * 1024)
                    print(f"   üìÑ {f} ({size_mb:.1f} MB)")