# In [None]:
"""
Gemini Query Fanout - Google Colab Version
This script generates 10 semantically related questions from a keyword using Gemini,
then uses Google embeddings and cosine similarity to rank them and return the top N most relevant questions.
"""

import json
import requests
import numpy as np
from google.colab import userdata


def get_gemini_api_key():
    """
    Read the Gemini API key stored under the ``gemini_api`` Colab secret.

    Returns:
        Whatever ``userdata.get("gemini_api")`` yields, or None when the
        lookup raises (a hint on adding the secret is printed in that case)
    """
    try:
        return userdata.get("gemini_api")
    except Exception as e:
        # Any lookup failure is reported and turned into a None result so
        # callers can simply test the return value.
        print(f"‚ùå Error getting API key from Colab secrets: {e}")
        hint = "Please add 'gemini_api' to your Colab secrets (üîë icon in the left sidebar)"
        print(hint)
    return None


def get_gemini_embeddings(texts, api_key):
    """
    Get embeddings from Google Gemini API for a list of texts

    One ``embedContent`` request is sent per text, in input order. The call
    is all-or-nothing: the first request that fails, or whose response lacks
    ``embedding.values``, aborts the loop and None is returned.

    Args:
        texts: List of text strings to embed
        api_key: Gemini API key

    Returns:
        List with the ``embedding.values`` entry of each response (same order
        as ``texts``; an empty list for empty input), or None if failed
    """
    print(f"üîç Getting Gemini embeddings for {len(texts)} texts...")

    # Request settings do not depend on the text, so build them once.
    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent"
    # The key travels in a header instead of the query string: the error
    # text printed below can contain the request URL, which would otherwise
    # expose the key in the notebook output.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    embeddings = []
    for i, text in enumerate(texts):
        data = {"model": "gemini-embedding-001", "content": {"parts": [{"text": text}]}}

        try:
            response = requests.post(url, headers=headers, json=data, timeout=60)
            response.raise_for_status()
            result = response.json()

            if "embedding" in result and "values" in result["embedding"]:
                embedding = result["embedding"]["values"]
                embeddings.append(embedding)
                print(
                    f"   ‚úÖ Text {i+1}/{len(texts)}: Got embedding with {len(embedding)} dimensions"
                )
            else:
                print(f"   ‚ùå Text {i+1}/{len(texts)}: Invalid response structure")
                return None
        except Exception as e:
            # Deliberately broad: any failure (network, HTTP status, JSON
            # decoding, unexpected payload type) means "no embeddings".
            print(f"   ‚ùå Text {i+1}/{len(texts)}: Error getting embedding: {e}")
            return None

    # Every iteration either appended an embedding or returned early, so at
    # this point there is exactly one embedding per input text.
    print(f"‚úÖ Successfully got embeddings for all {len(texts)} texts")
    return embeddings


def custom_cosine_similarity(a, b):
    """
    Cosine similarity of two vectors, computed with NumPy

    Args:
        a, b: Input vectors (converted to float64 arrays)

    Returns:
        Cosine similarity clipped to [-1.0, 1.0]; 0.0 when either vector has
        zero norm or when any step of the computation raises
    """
    try:
        vec_a, vec_b = (np.asarray(v, dtype=np.float64) for v in (a, b))
        numerator = np.dot(vec_a, vec_b)
        magnitudes = [np.linalg.norm(vec_a), np.linalg.norm(vec_b)]

        # The cosine is undefined for a zero-length vector; report "no similarity".
        if any(m == 0 for m in magnitudes):
            return 0.0

        # Clipping absorbs floating-point drift just outside [-1, 1].
        return np.clip(numerator / (magnitudes[0] * magnitudes[1]), -1.0, 1.0)
    except Exception:
        return 0.0


def calculate_cosine_similarities(query_embedding, question_embeddings):
    """
    Score every question embedding against the query embedding

    Args:
        query_embedding: Embedding vector for the original keyword
        question_embeddings: List of embedding vectors for generated questions

    Returns:
        NumPy array with one cosine similarity per question (same order), or
        None if anything raised while scoring or printing the summary
    """
    try:
        print(
            f"üßÆ Calculating cosine similarities for {len(question_embeddings)} questions..."
        )

        similarities = np.array(
            [
                custom_cosine_similarity(query_embedding, candidate)
                for candidate in question_embeddings
            ]
        )

        # Summary statistics are printed for the user; they do not alter the result.
        print(f"   ‚úÖ Calculated {len(similarities)} similarities")
        print(
            f"   üìä Similarity range: {similarities.min():.3f} to {similarities.max():.3f}"
        )
        print(f"   üìä Mean similarity: {similarities.mean():.3f}")
    except Exception as e:
        print(f"   ‚ùå Error calculating similarities: {e}")
        return None
    return similarities


def _strip_code_fence(text):
    """Return text without a surrounding Markdown code fence (``` or ```json)."""
    text = text.strip()
    if text.startswith("```json"):
        text = text[7:]
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _extract_response_text(result):
    """Return the text of the first part of the first candidate, or None if absent."""
    if not isinstance(result, dict):
        return None
    candidates = result.get("candidates") or []
    if not candidates:
        return None
    parts = (candidates[0].get("content") or {}).get("parts") or []
    if not parts:
        return None
    return parts[0].get("text")


def _rank_questions(keyword, questions, api_key, top_n):
    """Return the top_n questions most similar to keyword, or the first top_n unranked on failure."""
    print("\nüéØ Ranking questions using Google embeddings and cosine similarity...")

    # The keyword is embedded together with the questions; index 0 is the query.
    all_texts = [keyword] + questions
    embeddings = get_gemini_embeddings(all_texts, api_key)
    if not embeddings or len(embeddings) != len(all_texts):
        print(f"‚ùå Failed to get embeddings, returning top {top_n} questions unranked")
        return questions[:top_n]

    similarities = calculate_cosine_similarities(embeddings[0], embeddings[1:])
    if similarities is None:
        print(
            f"‚ùå Failed to calculate similarities, returning top {top_n} questions unranked"
        )
        return questions[:top_n]

    # sorted() is stable, so equally scored questions keep their generated order.
    ranked = sorted(
        zip(similarities, questions), key=lambda pair: pair[0], reverse=True
    )
    top_ranked = ranked[:top_n]

    print(f"\nüèÜ Top {top_n} ranked questions by similarity:")
    for i, (sim, question) in enumerate(top_ranked):
        print(f"   {i+1}. [similarity: {sim:.3f}] {question}")

    return [question for _, question in top_ranked]


def query_fanout(keyword, language, top_n=5):
    """
    Generate 10 semantically related questions using Gemini
    (model ``gemini-3-flash-preview``), then rank them using embeddings and
    cosine similarity to return top N

    Progress and errors are reported with print(); failures never raise out
    of the HTTP/parsing section but are turned into a None result.

    Args:
        keyword: The keyword to analyze
        language: Language for the query fanout process
        top_n: Number of top questions to return (clamped to 1-10, default 5)

    Returns:
        List of top N questions ranked by relevance (the first N questions in
        generated order if embedding or scoring fails), or None if the API
        key is missing, the request fails, or the response cannot be parsed
    """
    # Validate top_n
    top_n = max(1, min(10, top_n))

    api_key = get_gemini_api_key()
    if not api_key:
        return None

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent"

    system_prompt = """You are an advanced AI search assistant. Your task is to use the "query fan-out" technique to anticipate a user's complete informational need from a single keyword.

You will generate a list of 10 highly semantically related and comprehensive short questions that a user might have based on this keyword.

To generate these questions, follow this process:

1. Analyze the Core Keyword: Identify the central subject and any implied context of <keyword>.

2. Fan-Out by Intent: Brainstorm questions based on different user goals. Consider if the user might be trying to:
   - Learn (What is...?)
   - Compare (X vs. Y)
   - Find (Where can I...?)
   - Troubleshoot (How to fix...?)
   - And other potential intents

3. Fan-Out by Sub-Topic: Break <keyword> into its essential components or related facets.

4. Anticipate Next Steps: Think about what a user would logically ask after getting a basic answer to their initial query about <keyword>.

5. Synthesize: Formulate 10 distinct, insightful short questions based on your analysis.

After completing your analysis, provide your output in a structured JSON format.

The JSON should contain an array named "questions" with 10 string elements, each representing one of your generated short semantically related question.

Your final output should look like this:

{
  "questions": [
    "Question 1",
    "Question 2",
    "Question 3",
    "Question 4",
    "Question 5",
    "Question 6",
    "Question 7",
    "Question 8",
    "Question 9",
    "Question 10"
  ]
}

Remember, your final output should only include the JSON structure.

Do not include your inner monologue or any other text in the final output."""

    user_prompt = f"""The keyword you will analyze is:
<keyword>
{keyword}
</keyword>

Language of keyword, operation and output language:
<language>
{language}
</language>"""

    payload = {
        "contents": [{"parts": [{"text": system_prompt}, {"text": user_prompt}]}],
        "generationConfig": {
            "temperature": 0.7,
            "topK": 40,
            "topP": 0.95,
            "maxOutputTokens": 2048,
        },
    }

    # The key is sent as a header instead of a URL query parameter: the
    # request error printed below can include the request URL, which would
    # otherwise expose the key in the notebook output.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    try:
        print(f"üîç Generating query fanout for keyword: '{keyword}' in {language}...")
        print(f"üéØ Will return top {top_n} results")
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()

        text_response = _extract_response_text(response.json())
        if text_response is None:
            print("‚ùå Error: Invalid response structure from Gemini API")
            return None

        # The model may wrap its JSON in a Markdown code fence despite the prompt.
        text_response = _strip_code_fence(text_response)

        try:
            questions_data = json.loads(text_response)
        except json.JSONDecodeError as e:
            print(f"‚ùå Error parsing JSON response: {e}")
            print(f"Response text: {text_response[:500]}...")
            return None

        # Only a JSON object holding a list under "questions" is usable; a bare
        # string or array would make a plain `in` test misleading.
        questions = (
            questions_data.get("questions")
            if isinstance(questions_data, dict)
            else None
        )
        if not isinstance(questions, list):
            print("‚ùå Error: Response doesn't contain 'questions' field")
            return None

        print(f"‚úÖ Successfully generated {len(questions)} questions")
        return _rank_questions(keyword, questions, api_key, top_n)

    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error calling Gemini API: {e}")
        return None
    except Exception as e:
        print(f"‚ùå Unexpected error: {e}")
        return None


def main():
    """Prompt for keyword, language and result count, then print the ranked questions."""
    banner = "=" * 60

    print(banner)
    print("ü§ñ Gemini Query Fanout - Powered by Gemini")
    print("üéØ Returns Top N Questions Ranked by Cosine Similarity")
    print(banner)
    print()

    keyword = input("Enter keyword: ").strip()
    if not keyword:
        print("‚ùå Keyword cannot be empty!")
        return

    language = input("Enter language (e.g., English, Polish, Spanish): ").strip()
    if not language:
        print("‚ùå Language cannot be empty!")
        return

    # Start from the default; it is only replaced when the user typed a valid integer.
    top_n = 5
    top_n_input = input("Enter number of results to return (1-10, default 5): ").strip()
    if top_n_input:
        try:
            top_n = max(1, min(10, int(top_n_input)))
        except ValueError:
            print("‚ö†Ô∏è Invalid number, using default (5)")

    print()

    questions = query_fanout(keyword, language, top_n)

    # None and an empty list are both treated as failure.
    if not questions:
        print(
            "‚ùå Failed to generate questions. Please check your API key and try again."
        )
        return

    print()
    print(banner)
    print(f"üìù Top {len(questions)} Questions (Ranked by Relevance):")
    print(banner)
    for i, question in enumerate(questions, 1):
        print(f"{i}. {question}")
    print()

    print(banner)
    print("üìã JSON Output:")
    print(banner)
    print(json.dumps({"questions": questions}, indent=2, ensure_ascii=False))


# Script entry point: start the interactive prompt when this module is run
# as __main__.
if __name__ == "__main__":
    main()