In [None]:
# Read the Gemini API key from Colab's user secrets (google.colab.userdata) so the key
# itself never appears in the notebook source. `key` is passed to genai.Client below.
from google.colab import userdata
key=userdata.get('GOOGLE_FREE_API_KEY')



In [None]:
from google import genai
from google.genai import types

# Create the Gemini client that the later triplet-generation cells reuse.
client = genai.Client(api_key=key)

# Build the request configuration step by step: a thinking budget of 0 disables thinking.
no_thinking = types.ThinkingConfig(thinking_budget=0)
request_config = types.GenerateContentConfig(thinking_config=no_thinking)

# Send a single prompt and print the text of the reply.
response = client.models.generate_content(
    model="gemini-2.5-flash-lite",
    contents="How does AI work?",
    config=request_config,
)
print(response.text)

That's a fantastic question, and the answer is multifaceted, evolving, and can get quite technical. However, I can give you a comprehensive, yet understandable, overview of how AI works.

At its core, **Artificial Intelligence (AI) aims to create systems that can perform tasks that typically require human intelligence.** This includes things like learning, problem-solving, decision-making, understanding language, and recognizing patterns.

Instead of being explicitly programmed for every single scenario, AI systems are designed to **learn from data and adapt their behavior.** This is the fundamental difference between traditional programming and AI.

Here's a breakdown of the key components and concepts:

**1. Data: The Fuel of AI**

*   **AI systems are trained on vast amounts of data.** This data can be anything: text, images, audio, videos, numbers, sensor readings, etc.
*   **The quality and quantity of data are crucial.** More data, and cleaner, more relevant data, generally leads

In [None]:
#https://www.fia.com/sites/default/files/fia_2025_formula_1_sporting_regulations_-_issue_1_-_2024-07-31.pdf
!pip install pypdf
import requests
from pypdf import PdfReader
import io

# 1. Define the URL and Filenames
# NOTE: Replace this URL with the current, specific regulations PDF you need.
pdf_url = "https://www.fia.com/sites/default/files/fia_2025_formula_1_sporting_regulations_-_issue_1_-_2024-07-31.pdf"
local_pdf_filename = "f1_regulations.pdf"
output_txt_filename = "f1_regulations_text.txt"
# Upper bound (in seconds) on how long the download may wait for the server.
download_timeout_seconds = 60
# Marker appended after the text of every page that yielded any text.
page_separator = "\n\n--- PAGE BREAK ---\n\n"

# 2. Download the PDF to the Colab environment
print(f"Downloading PDF from: {pdf_url}")
try:
    # requests has no default timeout; without one a stalled connection would
    # block this cell indefinitely.
    response = requests.get(pdf_url, timeout=download_timeout_seconds)
    response.raise_for_status() # Raise an exception for bad status codes

    # Save the binary content to a file in the Colab instance
    with open(local_pdf_filename, 'wb') as f:
        f.write(response.content)

    print(f"Successfully downloaded: {local_pdf_filename}")

    # 3. Extract Text from the PDF and Save as TXT
    print("Extracting text from PDF...")

    # Initialize PDF Reader object using the downloaded file
    reader = PdfReader(local_pdf_filename)

    # Collect the per-page text in a list and join once at the end instead of
    # growing one string page by page.
    page_texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            # Pages without extractable text are skipped entirely (no separator).
            page_texts.append(text + page_separator)

    # full_text is the variable the sentence-chunking cell reads.
    full_text = "".join(page_texts)

    # Save the extracted text to a .txt file
    with open(output_txt_filename, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(f"Successfully extracted and saved text to: {output_txt_filename}")

except requests.exceptions.HTTPError as e:
    print(f"Error downloading the file: The URL may be invalid or restricted. HTTP Error: {e}")
except requests.exceptions.RequestException as e:
    # Connection failures and timeouts; HTTPError is a subclass, so it is handled above.
    print(f"Error downloading the file: network problem or timeout. Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.3.0-py3-none-any.whl (328 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.3.0
Downloading PDF from: https://www.fia.com/sites/default/files/fia_2025_formula_1_sporting_regulations_-_issue_1_-_2024-07-31.pdf
Successfully downloaded: f1_regulations.pdf
Extracting text from PDF...
Successfully extracted and saved text to: f1_regulations_text.txt


In [None]:
# Install the Gemini SDK (google-genai) and NLTK into the notebook runtime.
!pip install google-genai nltk

# Download the NLTK tokenizer data used for sentence splitting (sent_tokenize is
# called in the chunking cell). Both 'punkt' and 'punkt_tab' are fetched; presumably
# which one is actually needed depends on the installed NLTK version — TODO confirm.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os
from google import genai
from nltk.tokenize import sent_tokenize
# NOTE: in the cells shown, model_name is reassigned before any Gemini call reads it,
# so this value does not affect the sentence chunking done in this cell.
model_name = 'gemini-2.5-flash' # Fast and capable model for this task



def chunk_text_by_sentences(text, sentences_per_chunk=3):
    """Split text into chunks of a fixed number of consecutive sentences.

    Args:
        text: The text to split. Sentence boundaries are found with NLTK's
            ``sent_tokenize``.
        sentences_per_chunk: How many sentences go into each chunk. Must be
            at least 1. The last chunk holds fewer sentences when the total
            is not a multiple of this value.

    Returns:
        A list of strings; each string is the sentences of one chunk joined
        with a single space. The list is empty if the tokenizer returns no
        sentences.

    Raises:
        ValueError: If ``sentences_per_chunk`` is smaller than 1.
    """
    # Validate before tokenizing: a step of 0 makes range() fail with an
    # unrelated message, and a negative step silently yields no chunks at all.
    if sentences_per_chunk < 1:
        raise ValueError(
            f"sentences_per_chunk must be at least 1, got {sentences_per_chunk!r}"
        )

    # Split the text into individual sentences
    sentences = sent_tokenize(text)

    # Group consecutive sentences into chunks
    return [
        " ".join(sentences[start:start + sentences_per_chunk])
        for start in range(0, len(sentences), sentences_per_chunk)
    ]

# 1. Generate the Anchor Chunks
# full_text is the PDF text assigned by the extraction cell; every three consecutive
# sentences are joined into one anchor chunk for the triplet generation cells.
anchor_chunks = chunk_text_by_sentences(full_text, sentences_per_chunk=3)
print(f"Generated {len(anchor_chunks)} Anchor Chunks.")

# Example of the first chunk (uncomment to inspect it)
# print("\nFirst Anchor Chunk:")
# print(anchor_chunks[0])

Generated 583 Anchor Chunks.


In [None]:
# Model read by generate_triplet_data_with_progress in this cell; this assignment
# replaces the value set in the sentence-chunking cell.
model_name = 'gemini-2.5-flash-lite'
import tqdm
from tqdm.notebook import tqdm
import re
def _parse_triplet_list(raw_text):
    """Recover the (positive, negative) strings from the model's list-formatted reply.

    The reply is expected to contain a two-element Python list of strings,
    possibly surrounded by lead-in prose or markdown code fences.

    Args:
        raw_text: The text of the model response.

    Returns:
        A ``(positive, negative)`` tuple of stripped strings.

    Raises:
        ValueError: If the reply is empty or no two-string list can be
            recovered from it.
    """
    import ast  # only needed by this helper

    if not raw_text:
        raise ValueError("Model returned no text.")

    # Flatten line breaks so a sentence wrapped across lines is still a valid literal.
    flat_text = raw_text.strip().replace('\n', ' ').replace('\r', '')

    # Keep only the outermost [...] span; this drops code fences and lead-in prose.
    start = flat_text.find('[')
    end = flat_text.rfind(']')
    if start != -1 and end > start:
        try:
            # literal_eval only accepts Python literals, so model output is never executed.
            # It understands escaped quotes and apostrophes inside double-quoted
            # strings, which a character-class regex cannot.
            parsed = ast.literal_eval(flat_text[start:end + 1])
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        else:
            is_string_pair = (
                isinstance(parsed, (list, tuple))
                and len(parsed) == 2
                and all(isinstance(item, str) for item in parsed)
            )
            if not is_string_pair:
                # A well-formed literal of the wrong shape is rejected outright
                # rather than being re-interpreted by the fallback below.
                raise ValueError("Model output did not match expected list format.")
            return parsed[0].strip(), parsed[1].strip()

    # Fallback for replies that are not valid literals, e.g. an unescaped
    # apostrophe inside a single-quoted string: split on the quote-comma-quote
    # boundary between the two items.
    match = re.search(r"\[\s*['\"](.*?)['\"]\s*,\s*['\"](.*)['\"]\s*\]", flat_text)
    if match:
        return match.group(1).strip(), match.group(2).strip()

    raise ValueError("Model output did not match expected list format.")


def generate_triplet_data_with_progress(anchor_chunks):
    """Generate synthetic Anchor-Positive-Negative triplets with a progress bar.

    For every anchor chunk the Gemini model named by the global ``model_name``
    is asked (through the global ``client``) for a paraphrase (positive) and a
    hard negative sentence, formatted as a two-element Python list.

    Args:
        anchor_chunks: Sequence of anchor text chunks (``len()`` is taken for
            the progress bar total).

    Returns:
        A list of dicts with the keys ``'anchor'``, ``'positive'`` and
        ``'negative'``. Chunks whose request or parsing fails are reported via
        ``tqdm.write`` and left out of the result.
    """
    triplets = []

    # 1. Start the progress bar using tqdm
    for i, anchor in tqdm(enumerate(anchor_chunks), total=len(anchor_chunks), desc="Generating Triplets"):

        # --- Skip Condition (Example: Skip short or incomplete chunks) ---
        #if len(anchor.split()) < 20 or "must submit their financial reports" in anchor:
        #    print(f"\nSkipping short/irrelevant chunk {i+1}: '{anchor[:30]}...'")
        #    continue

        # --- 2. Construct the Prompt for Hard Negative Generation ---
        # Requesting an 'off-topic but related' hard negative
        prompt = f"""
        Given the following text chunk (Anchor) from the F1 Regulations:

        ANCHOR: "{anchor}"

        Please generate two separate outputs, formatted as a Python list:
        1. A detailed **paraphrase** of the ANCHOR. This will be the POSITIVE sample.
        2. A single, high-quality **hard negative** sentence. This sentence must be **highly relevant** to Formula 1, motor racing, or technical sport design (e.g., mentioning aerodynamics or engineering) but must be **semantically non-matching** and **NOT** a paraphrase of the Anchor.

        Output format MUST be a list containing exactly two strings:
        ['[POSITIVE SENTENCE]', '[HARD NEGATIVE SENTENCE]']
        """

        try:
            # --- 3. Call the Gemini API ---
            response = client.models.generate_content(
                model=model_name,
                contents=prompt,
                config=genai.types.GenerateContentConfig(
                    temperature=0.3 # Slightly higher temperature for better hard negative
                )
            )

            # --- 4. Parse and Validate the Output ---
            positive_text, negative_text = _parse_triplet_list(response.text)

            if not positive_text or not negative_text:
                raise ValueError("One of the generated strings was empty.")

            # --- 5. Store the Triplet ---
            triplets.append({
                'anchor': anchor,
                'positive': positive_text,
                'negative': negative_text
            })

        except Exception as e:
            # Best effort: report the failing chunk and carry on with the next one.
            tqdm.write(f"\nError processing chunk {i+1}. Skipping. Error: {e}")

    return triplets

# Build the triplet dataset: one model request per anchor chunk.
triplet_data = generate_triplet_data_with_progress(anchor_chunks)

# Preview the first two triplets, each field cut to 80 characters.
print("\n--- Generated Hard Triplet Dataset Sample ---")
preview_triplets = triplet_data[:2]
for triplet in preview_triplets:
    print(f"Anchor:    {triplet['anchor'][:80]}...")
    print(f"Positive:  {triplet['positive'][:80]}...")
    print(f"Negative:  {triplet['negative'][:80]}...")
    print("-" * 20)

Generating Triplets:   0%|          | 0/583 [00:00<?, ?it/s]


Error processing chunk 2. Skipping. Error: Model output did not match expected list format.

Error processing chunk 4. Skipping. Error: Model output did not match expected list format.

Error processing chunk 5. Skipping. Error: Model output did not match expected list format.

Error processing chunk 6. Skipping. Error: Model output did not match expected list format.


KeyboardInterrupt: 

In [None]:
triplet_data

NameError: name 'triplet_data' is not defined

In [None]:
from google.genai.types import GenerateContentConfig, Part, Type
# NOTE: generate_triplet_data_with_progress (defined later in this cell) builds its own
# local `output_schema` and passes that one to the API, so this module-level copy is
# not the schema the requests use.
output_schema = {
    "type": "object",
    "properties": {
        "positive": {"type": "string", "description": "The detailed paraphrase of the ANCHOR."},
        "negative": {"type": "string", "description": "The hard negative sentence, highly relevant but semantically non-matching."}
    },
    "required": ["positive", "negative"]
}

import google.genai as genai
from google.genai.types import GenerateContentConfig, Type
import tqdm
from tqdm.notebook import tqdm
import json
import re

model_name = 'gemini-2.5-flash-lite' # Flash-Lite variant is selected here; the original note recommends a 'flash' model for this task for speed/cost

# No example data is defined in this cell: anchor_chunks comes from the sentence-chunking cell.

def _is_rate_limit_error(error):
    """Return True when the exception looks like an HTTP 429 / RESOURCE_EXHAUSTED reply."""
    return getattr(error, "code", None) == 429 or "RESOURCE_EXHAUSTED" in str(error)


def _send_with_rate_limit_retry(send_request, max_retries, retry_delay_seconds, label):
    """Call send_request(), pausing and retrying while the API reports rate limiting.

    Errors that are not rate-limit errors, and the rate-limit error of the
    final attempt, are re-raised unchanged.
    """
    import time  # only needed by this helper

    attempt = 0
    while True:
        try:
            return send_request()
        except Exception as error:
            if attempt >= max_retries or not _is_rate_limit_error(error):
                raise
            # Double the pause on every retry so a per-minute quota can reset.
            wait_seconds = retry_delay_seconds * (2 ** attempt)
            attempt += 1
            tqdm.write(
                f"\nRate limit hit on {label}. Waiting {wait_seconds:.0f}s "
                f"before retry {attempt}/{max_retries}."
            )
            time.sleep(wait_seconds)


def generate_triplet_data_with_progress(anchor_chunks, max_retries=3, retry_delay_seconds=15.0):
    """Generate synthetic Anchor-Positive-Negative triplets with a progress bar.

    For every anchor chunk the Gemini model named by the global ``model_name``
    is asked (through the global ``client``) for a paraphrase (positive) and a
    hard negative sentence. A JSON response schema is sent with each request so
    the reply can be parsed with ``json.loads``.

    Args:
        anchor_chunks: Sequence of anchor text chunks (``len()`` is taken for
            the progress bar total).
        max_retries: How many times a rate-limited request (HTTP 429 /
            RESOURCE_EXHAUSTED) is retried before giving up. ``0`` disables
            retrying.
        retry_delay_seconds: Pause before the first retry; it doubles on each
            further retry.

    Returns:
        A list of dicts with the keys ``'anchor'``, ``'positive'`` and
        ``'negative'``. Chunks that fail for other reasons are reported via
        ``tqdm.write`` and skipped. If rate limiting persists after all
        retries, generation stops early and the triplets collected so far are
        returned, because the remaining requests would be rejected as well.
    """
    triplets = []

    # 1. Define the Output Schema for JSON
    # This instructs the model to return a valid JSON object with these keys/types.
    output_schema = {
        "type": "object",
        "properties": {
            "positive": {
                "type": "string",
                "description": "A detailed paraphrase of the ANCHOR text."
            },
            "negative": {
                "type": "string",
                "description": "A single, high-quality hard negative sentence. Must be highly relevant to F1/racing but semantically non-matching to the Anchor."
            }
        },
        "required": ["positive", "negative"]
    }

    # 2. Start the progress bar using tqdm
    for i, anchor in tqdm(enumerate(anchor_chunks), total=len(anchor_chunks), desc="Generating Triplets"):

        # --- Construct the Prompt for Hard Negative Generation ---
        # The prompt is simpler now as the schema handles the structure
        prompt = f"""
        Given the following text chunk (Anchor) from the F1 Regulations:

        ANCHOR: "{anchor}"

        Please generate two separate outputs:
        1. A detailed **paraphrase** of the ANCHOR. This will be the POSITIVE sample.
        2. A single, high-quality **hard negative** sentence. This sentence must be **highly relevant** to Formula 1, motor racing, or technical sport design (e.g., mentioning aerodynamics or engineering) but must be **semantically non-matching** and **NOT** a paraphrase of the Anchor.

        Your entire response MUST be a valid JSON object matching the provided schema.
        """

        try:
            # --- 3. Call the Gemini API with JSON Config (retrying when rate limited) ---
            response = _send_with_rate_limit_retry(
                lambda: client.models.generate_content(
                    model=model_name,
                    contents=prompt,
                    config=GenerateContentConfig(
                        temperature=0.3, # Slightly higher temperature for better hard negative
                        # Enforce JSON output format
                        response_mime_type="application/json",
                        response_schema=output_schema
                    )
                ),
                max_retries,
                retry_delay_seconds,
                f"chunk {i+1}",
            )

            # --- 4. Parse the JSON Output ---
            # response.text is the same accessor the first Gemini cell prints; guard
            # against a reply without text before handing it to json.loads.
            json_string = response.text
            if not json_string:
                raise ValueError("Model returned no text.")
            data = json.loads(json_string)

            positive_text = data.get('positive', '').strip()
            negative_text = data.get('negative', '').strip()

            if not positive_text or not negative_text:
                # This should rarely trigger with schema enforcement, but is a final check
                raise ValueError("Parsed JSON did not contain both required fields or they were empty.")

            # --- 5. Store the Triplet ---
            triplets.append({
                'anchor': anchor,
                'positive': positive_text,
                'negative': negative_text
            })

        except Exception as e:
            tqdm.write(f"\nError processing chunk {i+1}. Skipping. Error: {e}")
            if _is_rate_limit_error(e):
                # Every further request would be rejected too; keep what we have.
                tqdm.write("Rate limit persisted after all retries. Stopping early with the triplets collected so far.")
                break

    return triplets

# Build the triplet dataset: one model request per anchor chunk.
triplet_data = generate_triplet_data_with_progress(anchor_chunks)

# Preview at most five triplets, each field cut to 80 characters.
print("\n--- Generated Hard Triplet Dataset Sample ---")
preview_triplets = triplet_data[:5]
for triplet in preview_triplets:
    print(f"**Anchor:** {triplet['anchor'][:80]}...")
    print(f"**Positive:** {triplet['positive'][:80]}...")
    print(f"**Negative:** {triplet['negative'][:80]}...")
    print("-" * 20)

Generating Triplets:   0%|          | 0/583 [00:00<?, ?it/s]


Error processing chunk 1. Skipping. Error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15, model: gemini-2.5-flash-lite\nPlease retry in 12.555274479s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'l

KeyboardInterrupt: 