# Idea Farm - End-to-End Tuning Playground 🚜

Fine-tune both the **Content Extraction** logic and the **AI Prompts** entirely in the browser.

### Workflow
1. **Setup**: Install libraries and authenticate.
2. **Extraction Logic**: Edit the web scraping code (Trafilatura/YouTube).
3. **Prompt Template**: Edit the Gemini prompt.
4. **Test**: Provide a URL or Text, run the pipeline, and see the results.

In [None]:
# @title 1. Setup & Auth
# Authenticate with Google Cloud
# (google.colab is imported here, so this cell is written to run inside Colab.)
from google.colab import auth
auth.authenticate_user()

# Install Dependencies
# The leading "!" is an IPython shell escape, not Python syntax; this line only
# works when the cell is executed by a notebook kernel. --upgrade pulls the
# newest release of each package and --quiet suppresses most pip output.
!pip install google-cloud-aiplatform trafilatura youtube-transcript-api requests --upgrade --quiet

# Imports come after the install step so freshly installed packages are importable.
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
import json
import requests
import trafilatura
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import logging

# Initialize Project
# The "@param" markers expose these two values as editable Colab form fields.
PROJECT_ID = "idea-farm-70752" # @param {type:"string"}
LOCATION = "us-central1" # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION)
# Module-level model handle; the "Run Pipeline" cell calls model.generate_content().
model = GenerativeModel("gemini-2.5-flash")
print(f"‚úÖ Connected to {PROJECT_ID} in {LOCATION}")

# Configure basic logging
# INFO level on the root logger, so INFO-and-above records from libraries are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
# @title 2. Content Extraction Logic
# This code mirrors 'functions/services/content_extractor.py'.
# Edit this cell to improve how we scrape text!

def extract_content(url: str) -> str | None:
    """
    Extracts content from a URL.
    Detects if it's a YouTube video or a regular web page.
    """
    if not url:
        return None

    try:
        # Check for YouTube
        video_id = _get_youtube_video_id(url)
        if video_id:
            print(f"üé• Detected YouTube video: {video_id}")
            return _get_youtube_transcript(video_id)
        
        # Default to web page extraction
        print(f"üåê Extracting web page: {url}")
        return _extract_web_page(url)
    except Exception as e:
        print(f"‚ùå Extraction failed for {url}: {e}")
        return None

def _get_youtube_video_id(url: str) -> str | None:
    """Parses YouTube video ID from URL."""
    parsed = urlparse(url)
    if parsed.hostname in ('youtu.be', 'www.youtu.be'):
        return parsed.path[1:]
    if parsed.hostname in ('youtube.com', 'www.youtube.com'):
        if parsed.path == '/watch':
            return parse_qs(parsed.query).get('v', [None])[0]
    return None

def _get_youtube_transcript(video_id: str) -> str | None:
    """Fetch the transcript of a YouTube video as a single block of text.

    Returns the transcript prefixed with ``"YouTube Transcript:\\n\\n"``, or
    None when it cannot be retrieved (the reason is printed). The return
    annotation reflects that None outcome.

    Two generations of the youtube-transcript-api interface are supported:
    newer releases expose an instance method ``fetch()`` whose result offers
    ``to_raw_data()``, while older ones only have the static
    ``get_transcript()``. Both produce a list of dicts with a ``"text"`` key.
    The setup cell installs the package with ``--upgrade``, so the newer
    interface is tried first.
    """
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        else:
            # Legacy static API, kept for environments pinned to an old release.
            entries = YouTubeTranscriptApi.get_transcript(video_id)

        # Combine the caption snippets into one space-separated string.
        full_text = " ".join(entry["text"] for entry in entries)
        return f"YouTube Transcript:\n\n{full_text}"
    except Exception as e:
        # Best effort: missing/disabled captions should not abort the pipeline.
        print(f"‚ö†Ô∏è Could not get transcript for {video_id}: {e}")
        return None

def _extract_web_page(url: str, timeout: float = 10) -> str:
    """
    Extracts main text content from a web page using Trafilatura.

    The page is first downloaded with ``requests`` using a browser-like
    User-Agent to bypass basic bot filters; if that request fails,
    ``trafilatura.fetch_url`` is tried as a fallback.

    Args:
        url: Address of the page to download.
        timeout: Seconds allowed for the ``requests`` download (default 10).

    Returns:
        The extracted main text (comments excluded).

    Raises:
        ValueError: If nothing could be downloaded, or Trafilatura extracted
            no text from the downloaded HTML.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/"
    }

    downloaded = None
    try:
        print(f"Fetching URL: {url}")
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # When the server declares no charset, requests decodes text/* bodies
        # as ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
        # from the body in that case.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = response.apparent_encoding
        downloaded = response.text
        print(f"Downloaded RAW content length: {len(downloaded) if downloaded else 0}")
    except requests.RequestException as e:
        # Network/HTTP failures only; programming errors are left to surface.
        print(f"Requests failed. Retrying with Trafilatura fetch... ({e})")
        downloaded = trafilatura.fetch_url(url)

    if not downloaded:
        raise ValueError("Failed to fetch URL content")

    text = trafilatura.extract(downloaded, include_comments=False)
    if not text:
        raise ValueError("No text extracted (empty result)")

    print(f"‚úÖ Extracted Text Length: {len(text)} chars")
    return text

# Status line shown in the cell output once the definitions above have executed.
print("‚úÖ Extraction Logic Loaded")

In [None]:
# @title 3. Prompt Template
# Edit the Gemini prompt below. Keep {content} placeholder.

# Prompt sent to Gemini. It is rendered with str.format(content=...), so:
#   * {content} is the only placeholder and must be kept;
#   * literal braces in the JSON example are doubled ({{ }}) to survive format().
# Everything inside the triple quotes is sent to the model verbatim, so notes
# for maintainers belong out here, not inside the string.
prompt_template = """
Analyze the following text and provide a structured JSON response.

Text:
{content}

Output Format (JSON):
{{
    "overview": "A concise paragraph summary (3-5 sentences) suitable for quick reading.",
    "detailedAnalysis": "A comprehensive, 1-2 page deep dive into the content. Use Markdown formatting (## Headers, - bullets, **bold**). Highlight key insights, arguments, and context.",
    "topic": "Suggested Category",
    "suggestedLinks": [
        {{ "title": "Link Title", "url": "https://example.com", "description": "Why relevant" }}
    ]
}}
"""
print("üìù Prompt Template Updated")

In [None]:
# @title 4. Run Pipeline
# Enter a URL OR paste raw text.
# Raw text, when provided, takes precedence over the URL.

target_url = "https://example.com" # @param {type:"string"}
raw_text_override = "" # @param {type:"string"}

# Maximum number of characters of source text injected into the prompt.
MAX_INPUT_CHARS = 15000


def _strip_code_fence(text: str) -> str:
    """Remove a single wrapping Markdown fence (``` or ```json) around *text*.

    Only an outer fence is removed. Fences that appear inside the payload,
    such as Markdown code blocks within "detailedAnalysis", are left intact.
    """
    body = text.strip()
    if body.startswith("```"):
        body = body[3:].removeprefix("json").removesuffix("```")
    return body.strip()


final_input_text = ""
# Ignore surrounding whitespace so a blank-looking URL field counts as empty.
url_to_process = target_url.strip()

if raw_text_override.strip():
    print("üìÑ Using Raw Text Override")
    final_input_text = raw_text_override
elif url_to_process:
    print(f"üîó Processing URL: {url_to_process}")
    extracted = extract_content(url_to_process)
    if extracted:
        final_input_text = extracted
    else:
        print("‚ùå Extraction failed. Stopping.")
else:
    print("‚ùå Please provide a URL or Text.")

if final_input_text:
    print("\n‚è≥ Generating Summary with Gemini...")
    result_text = None
    try:
        # Inject the (truncated) content. It is passed as a format argument,
        # so braces inside the content are not interpreted as placeholders.
        final_prompt = prompt_template.format(content=final_input_text[:MAX_INPUT_CHARS])

        responses = model.generate_content(
            final_prompt,
            generation_config=GenerationConfig(
                temperature=0.2,
                max_output_tokens=8192,
                top_p=0.8,
                top_k=40,
                response_mime_type="application/json"
            ),
            stream=False
        )

        # Parse and Display
        result_text = responses.text
        result_json = json.loads(_strip_code_fence(result_text))

        print("\n‚úÖ Generation Complete!\n")
        print(json.dumps(result_json, indent=2))

    except json.JSONDecodeError as e:
        print(f"‚ùå AI Generation Error: {e}")
        # Show the unparsed output so the prompt can be tuned against it.
        print(result_text)
    except Exception as e:
        print(f"‚ùå AI Generation Error: {e}")