In [1]:
import os
import sys
import json
import csv
import traceback
import google.generativeai as genai
import json
import time
from tqdm import tqdm  # for progress bar

In [2]:
# Mount the user's Google Drive at /content/drive. This is Colab-only: the
# google.colab package is not available in a plain Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Working folder on the mounted Drive: the seed CSV is read from here and the
# generated JSON files are written here.
folder_path = '/content/drive/MyDrive/ProblemGeneratorBaseline'

# Create the directory if it doesn't exist.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_path, exist_ok=True)

# Change the current working directory to the folder path
# This ensures that CSV reading and JSON writing happen in the correct location
os.chdir(folder_path)
print(f"✓ Changed working directory to: {os.getcwd()}")

✓ Changed working directory to: /content/drive/MyDrive/ProblemGeneratorBaseline


In [4]:
# ============================================================================
# 1. INSTALLATIONS, API KEY, SETTING LLM MODEL
# ============================================================================

In [22]:
import google.generativeai as genai
from google.colab import userdata

# Resolve the Gemini API key WITHOUT embedding it in the notebook.
# Lookup order:
#   1. Colab Secrets entry named GOOGLE_API_KEY (key icon in the left sidebar;
#      the argument to userdata.get() is the secret's NAME, not its value)
#   2. GOOGLE_API_KEY environment variable
#
# SECURITY: earlier revisions of this cell contained literal API keys. Any key
# that was ever saved in a notebook file must be treated as leaked and revoked.
API_KEY_NAME = 'GOOGLE_API_KEY'

try:
    api_key = userdata.get(API_KEY_NAME)
except Exception:
    # Secret not defined, or notebook access to it not granted: this is a
    # deliberate best-effort lookup, so fall through to the environment.
    api_key = None

if not api_key:
    api_key = os.environ.get(API_KEY_NAME)

if not api_key:
    raise RuntimeError(
        "Missing GOOGLE_API_KEY. Set it in Colab Secrets or as an environment "
        "variable before running."
    )

genai.configure(api_key=api_key)
llm = genai.GenerativeModel('gemini-2.5-flash')

print("✓ Gemini API configured successfully")

✓ Gemini API configured successfully


In [6]:
# ============================================================================
# 2. INPUT - CSV ITERATION FUNCTION
# ============================================================================

In [7]:
def iterate_csv_pairs(csv_filename, required_columns=('question', 'solution'),
                      encoding='utf-8', start_index=1):
    """
    Generator that yields rows from a CSV file as dictionaries with normalized keys.

    File lookup: an absolute ``csv_filename`` is used as-is; otherwise the
    current working directory, ``../seed_problems/`` and ``../`` are tried in
    that order and the first existing path wins.

    Normalization applied to every data row:
      * header names are stripped, lower-cased and freed of a leading UTF-8
        BOM (Excel's "CSV UTF-8" export prepends one, which would otherwise
        hide the first column and cause every row to be skipped)
      * string values are stripped
      * rows where any of ``required_columns`` is missing/empty are skipped
        with a warning

    Every yielded row additionally carries:
      * ``Pair_Number``: the integer in the ``pair_number`` column when it is
        present and parseable, otherwise the running row counter
      * ``source_problem_ID``: the ``source_problem_id`` column when present,
        otherwise ``"<chapter>_R<n>"`` (chapter taken from ``chapter_name`` or
        ``chapter``) or ``"CSV_R<n>"`` when no chapter is available

    Args:
        csv_filename: File name or absolute path of the CSV.
        required_columns: Lower-case column names that must be non-empty.
        encoding: Text encoding used to open the file.
        start_index: Initial value of the running row counter; the counter
            advances for every data row, skipped or not.

    Yields:
        dict: one normalized row per usable CSV line.

    Raises:
        FileNotFoundError: if none of the candidate paths exists. Because this
            is a generator, the error surfaces on the first iteration.
    """
    def _normalize_key(key):
        # DictReader uses None as the key for surplus fields; leave it alone.
        if not key:
            return key
        return key.lstrip('\ufeff').strip().lower()

    # In Colab/Jupyter, __file__ is not defined. We use the current working directory.
    current_dir = os.getcwd()

    if os.path.isabs(csv_filename):
        candidates = [csv_filename]
    else:
        candidates = [
            os.path.join(current_dir, csv_filename),
            # Keep these just in case structure varies, relative to CWD
            os.path.join(current_dir, '..', 'seed_problems', csv_filename),
            os.path.join(current_dir, '..', csv_filename),
        ]

    csv_path = next((c for c in candidates if os.path.exists(c)), None)
    if not csv_path:
        raise FileNotFoundError(f"CSV file not found. Tried: {candidates}")

    print(f"✓ Found CSV file at: {csv_path}")

    pair_counter = start_index
    with open(csv_path, 'r', encoding=encoding, newline='') as fh:
        reader = csv.DictReader(fh)

        # Surface a header problem once instead of one warning per row only.
        header = {_normalize_key(name) for name in (reader.fieldnames or [])}
        absent = [c for c in required_columns if c not in header]
        if absent:
            print(f"⚠ CSV header has no column(s) {absent}; rows will be skipped")

        for raw_row in reader:
            row = {_normalize_key(k): (v.strip() if isinstance(v, str) else v)
                   for k, v in raw_row.items()}

            missing = [c for c in required_columns if not row.get(c)]
            if missing:
                print(f"⚠ Skipping row {pair_counter}: missing {missing}")
                pair_counter += 1
                continue

            # Prefer an explicit pair number from the file; fall back to the
            # running counter when it is absent or not an integer.
            raw_pair_number = row.get('pair_number')
            try:
                pn = int(raw_pair_number) if raw_pair_number else pair_counter
            except (TypeError, ValueError):
                pn = pair_counter

            row['Pair_Number'] = pn

            if not row.get('source_problem_id'):
                chapter = row.get('chapter_name') or row.get('chapter') or ''
                row['source_problem_ID'] = f"{chapter}_R{pn}" if chapter else f"CSV_R{pn}"
            else:
                row['source_problem_ID'] = row.get('source_problem_id')

            yield row
            pair_counter += 1

In [8]:
# ============================================================================
# 3. THE SYSTEM PROMPT
# ============================================================================


In [9]:
# Prompt template sent to the generator model. It is filled in with
# str.format(question=..., solution=...), so {question} and {solution} are
# substitution fields and every literal JSON brace is written doubled
# ({{ and }}). Keep new literal braces doubled when editing the text.
dataset_generation_prompt = """
You are an expert Physics Problem Generator. Your task is to take a single "Seed Problem" and generate 5 distinct, high-quality variations.

INPUT:
- Seed Question: {question}
- Seed Solution: {solution}

### PROCESS GUIDELINES
1. **Analyze the Physics**: Identify the core formulas and logic used in the seed solution.
2. **Variable Boundaries**: For every variable involved, mentally establish realistic physical ranges (e.g., if the problem involves a car, mass should be 800-2000kg, not 5kg).
3. **Scenario Diversity**: Generate 5 completely different real-world settings (e.g., Car on ramp -> Satellite in orbit -> Electron in field -> Stone in sling, etc.) that utilize the *exact same* mathematical relationship.
4. **Calculation Rigor**: You must perform the actual calculation to get the `numerical_answer`. Do not guess.

### OUTPUT REQUIREMENTS
- Generate a strictly valid JSON object.
- The JSON must contain a list of 5 variations.
- Each variation must have:
    - `problem_text`: The full word problem.
    - `numerical_answer`: The calculated float value (rounded to 2 decimal places).
- **DO NOT** include the Python code or formula derivation in the JSON output.

### CONSTRAINTS
- **Difficulty**: Maintain the cognitive level of the seed problem (e.g., JEE Mains).
- **Clarity**: Ensure the problem explicitly states what needs to be found.
- **Uniqueness**: The 5 variations must not resemble each other in wording or setting.

### FORMAT
Response must be a SINGLE JSON object:
{{
  "variations": [
    {{
      "problem_text": "Text of variation 1...",
      "numerical_answer": 12.5
    }},
    {{
      "problem_text": "Text of variation 2...",
      "numerical_answer": 0.88
    }},
    ... (3 more variations)
  ]
}}
"""

In [10]:
# ============================================================================
# 5. PARSING THE LLM OUTPUT
# ============================================================================


In [11]:
def llm_op_to_json(op_call):
    """
    Clean an LLM text response and parse it as JSON.
    Handles cases where LLM includes explanatory text before/after JSON.

    Strategies, tried in order (the first that parses wins):
      1. contents of a ```json fenced block
      2. contents of a generic ``` fenced block at the start of the text
      3. the span from the first '{' to the last '}'
      4. the whole (remaining) text
      5. the first JSON object decodable from any '{' in the full response;
         this rescues replies whose prose contains stray braces before the
         real payload. It may return a nested object when the outer one is
         malformed, so callers must still validate the expected keys.

    Args:
        op_call: A response object exposing ``.text``; any other object is
            converted with ``str()``.

    Returns:
        The parsed JSON value, or ``None`` when the text could not be read or
        no strategy produced valid JSON (diagnostics are printed).
    """
    # hasattr() would only swallow AttributeError; a `.text` accessor that
    # raises something else (e.g. ValueError for a response with no usable
    # content) must not crash the caller's processing loop.
    try:
        s = op_call.text
    except AttributeError:
        s = str(op_call)
    except Exception as e:
        print("=" * 73)
        print(f"Could not read text from LLM response: {e}")
        print("=" * 73)
        return None

    if not isinstance(s, str):
        s = str(s)

    raw = s.strip()
    cleaned = raw

    # Strategy 1: Try to find JSON within markdown code fences
    if '```json' in cleaned:
        start_idx = cleaned.find('```json') + len('```json')
        end_idx = cleaned.find('```', start_idx)
        if end_idx != -1:
            cleaned = cleaned[start_idx:end_idx].strip()
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError:
                pass  # Fall through to next strategy

    # Strategy 2: Try to find JSON within generic code fences
    if cleaned.startswith('```') and '```' in cleaned[3:]:
        start_idx = cleaned.find('\n', 0) + 1  # Skip first ```
        end_idx = cleaned.find('```', start_idx)
        if end_idx != -1:
            cleaned = cleaned[start_idx:end_idx].strip()
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError:
                pass  # Fall through to next strategy

    # Strategy 3: Find JSON by looking for outermost { } brackets
    first_brace = cleaned.find('{')
    last_brace = cleaned.rfind('}')

    if first_brace != -1 and last_brace != -1 and last_brace > first_brace:
        json_candidate = cleaned[first_brace:last_brace + 1]
        try:
            return json.loads(json_candidate)
        except json.JSONDecodeError:
            pass  # Fall through to next strategy

    # Strategy 4: Try parsing the whole thing (original approach)
    parse_error = None
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        parse_error = e
    except Exception as e:
        print("=" * 73)
        print(f"Unexpected error parsing LLM output: {e}")
        traceback.print_exc()
        print("=" * 73)
        return None

    # Strategy 5: scan the full response for the first decodable JSON object.
    # Only reached when every strategy above failed.
    decoder = json.JSONDecoder()
    pos = raw.find('{')
    while pos != -1:
        try:
            parsed, _end = decoder.raw_decode(raw, pos)
            return parsed
        except json.JSONDecodeError:
            pos = raw.find('{', pos + 1)

    print("=" * 73)
    print(f"JSONDecodeError: {parse_error}")
    print("Attempted all parsing strategies. Showing response preview:")
    print(f"{cleaned[:500]}...")
    if len(cleaned) > 500:
        print(f"\n... [truncated {len(cleaned) - 500} more characters]")
    print("=" * 73)
    return None


In [12]:
# ============================================================================
# 6. SAVING THE OUTPUT TO A FILE
# ============================================================================


In [13]:
def atomic_write_json(path, data):
    """
    Write data to path atomically using a .tmp file and os.replace.

    The JSON is first written to ``path + '.tmp'`` and then moved over
    ``path``, so an existing ``path`` is only ever replaced by a fully
    written file. If serialization or the replace fails, the temporary file
    is removed and the original exception is re-raised; ``path`` keeps its
    previous contents.

    Args:
        path: Destination file path (str).
        data: JSON-serializable object.

    Raises:
        TypeError / ValueError: if ``data`` cannot be serialized.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    except BaseException:
        # BaseException so a notebook interrupt (KeyboardInterrupt) also
        # cleans up; a stale half-written .tmp file must not be left behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

In [14]:
# ============================================================================
# MAIN PROCESSING LOOP
# ============================================================================


In [24]:
import os
import json
import traceback
import time

def _load_existing_records(success_file_path):
    """Return the record list already stored at `success_file_path`, or [] if absent/unusable."""
    if not os.path.exists(success_file_path):
        return []
    try:
        with open(success_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"⚠ Could not read existing file: {e}")
        return []
    if isinstance(data, list):
        return data
    print("⚠ Existing file is not a list; will be overwritten.")
    return []


def _generate_with_retry(prompt, max_retries):
    """Call the global `llm` up to `max_retries` times; return (response, last_error)."""
    last_error = None
    for attempt in range(max_retries):
        try:
            print(f"→ Calling LLM (Attempt {attempt+1})...")
            response = llm.generate_content(prompt)
            print(f"✓ LLM response received")
            return response, None
        except Exception as e:
            last_error = e
            if attempt == max_retries - 1:
                print(f"✗ LLM generation failed after {max_retries} attempts: {e}")
                traceback.print_exc()
            else:
                print(f"⚠ Retry {attempt + 1}/{max_retries} after error: {e}")
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
    return None, last_error


def _write_failure_log(failure_file_path, failed_rows):
    """Persist the failure entries collected during a run (no-op when there are none)."""
    if not failed_rows:
        return
    try:
        atomic_write_json(failure_file_path, failed_rows)
        print(f"\n⚠ Validation complete. Saved {len(failed_rows)} failed rows to '{failure_file_path}'")
    except Exception as e:
        print(f"✗ Could not save failure log: {e}")


def main(csv_filename="9.Centre of Mass.csv", batches_per_problem=2,
         variations_per_batch=5, max_retries=3):
    """
    Generate problem variations for every seed row of a CSV and save them as JSON.

    For each row yielded by ``iterate_csv_pairs`` the LLM is called once per
    batch; the parsed variations are numbered ``1..batches*per_batch``, given
    the signature ``"<source_problem_ID>_<Pair_Number>_v<n>"`` and appended to
    ``<csv base name>_generated_problems.json`` in the current working
    directory. The file is rewritten after every batch, so an interrupted run
    can be resumed: rows that already have the target number of records are
    skipped, and so are individual batches whose signatures are all present
    (no LLM call is spent on output that would be discarded as duplicate).
    Failures are collected in ``<csv base name>_failures.json``, which is
    written even when the run is interrupted by an exception.

    Relies on notebook globals: ``llm``, ``dataset_generation_prompt``,
    ``iterate_csv_pairs``, ``llm_op_to_json`` and ``atomic_write_json``.

    Args:
        csv_filename: Seed CSV to process.
        batches_per_problem: Number of LLM calls per seed row.
        variations_per_batch: Variations expected per call; must match the
            count requested in ``dataset_generation_prompt``. Extra items in
            a response are dropped so batch numbering never overlaps.
        max_retries: Attempts per LLM call before the batch is logged as failed.
    """
    # Fail fast, before any work, if the API setup cell has not been run.
    if 'llm' not in globals():
        print("✗ 'llm' model is not defined. Please run the API setup cell first.")
        return

    base_name = os.path.splitext(os.path.basename(csv_filename))[0]

    # Use current working directory instead of __file__
    current_dir = os.getcwd()
    success_file_path = os.path.join(current_dir, f"{base_name}_generated_problems.json")
    failure_file_path = os.path.join(current_dir, f"{base_name}_failures.json")

    existing_records = _load_existing_records(success_file_path)

    # Signatures already on disk (for deduplication)
    existing_signatures = {r.get('signature') for r in existing_records
                           if isinstance(r, dict) and r.get('signature')}

    # Count existing variations per seed problem to handle skipping
    # Key: (source_problem_ID, Pair_Number)
    existing_counts = {}
    for r in existing_records:
        if not isinstance(r, dict):
            continue  # tolerate foreign entries in a hand-edited file
        key = (r.get('source_problem_ID'), r.get('Pair_Number'))
        existing_counts[key] = existing_counts.get(key, 0) + 1

    target_per_problem = batches_per_problem * variations_per_batch
    failed_rows = []

    print(f"\n{'='*73}")
    print(f"Starting processing of '{csv_filename}'")
    print(f"Found {len(existing_records)} existing records")
    print(f"Output file: {success_file_path}")
    print(f"{'='*73}\n")

    try:
        # Process each CSV row
        for row_index, row in enumerate(iterate_csv_pairs(csv_filename), start=1):
            print(f"\n{'='*73}")
            print(f"Processing Row {row_index}")
            print(f"Pair_Number: {row.get('Pair_Number')}")
            print(f"source_problem_ID: {row.get('source_problem_ID')}")

            question = row.get('question')
            solution = row.get('solution')
            Pair_Number = row.get('Pair_Number')
            source_problem_ID = row.get('source_problem_ID')

            # Skip rows that already have the full set of variations
            row_key = (source_problem_ID, Pair_Number)
            current_count = existing_counts.get(row_key, 0)

            if current_count >= target_per_problem:
                print(f"⊘ Skipping - already have {current_count} variations for this problem.")
                print(f"{'='*73}")
                continue

            print(f"→ Goal: Generate {target_per_problem} variations (Running {batches_per_problem} batches)")
            print(f"{'='*73}")

            # The prompt is identical for every batch and retry of this row.
            prompt = dataset_generation_prompt.format(question=question, solution=solution)

            for batch_idx in range(batches_per_problem):
                print(f"\n-- Batch {batch_idx + 1}/{batches_per_problem} --")

                # Variation numbers owned by this batch, e.g. batch 0: 1-5, batch 1: 6-10
                start_var_num = (batch_idx * variations_per_batch) + 1
                batch_signatures = [
                    f"{source_problem_ID}_{Pair_Number}_v{n}"
                    for n in range(start_var_num, start_var_num + variations_per_batch)
                ]
                if all(sig in existing_signatures for sig in batch_signatures):
                    print("⊘ Skipping batch - all of its variations are already saved.")
                    continue

                # 4. THE LLM CALL WITH RETRY LOGIC
                op_call, llm_error = _generate_with_retry(prompt, max_retries)
                if op_call is None:
                    failed_rows.append({
                        'row_index': row_index,
                        'batch': batch_idx,
                        'source_problem_ID': source_problem_ID,
                        'error': str(llm_error),
                        'reason': 'Max retries exceeded'
                    })
                    continue  # Skip to next batch if this one failed completely

                # Parse LLM output
                op_data = llm_op_to_json(op_call)

                # Validate LLM output: must be an object holding a list of variations
                if not isinstance(op_data, dict) or not isinstance(op_data.get('variations'), list):
                    print("✗ Invalid response structure (missing 'variations')")
                    try:
                        raw_response = str(op_call.text)
                    except Exception:
                        # `.text` may be missing or may itself raise
                        raw_response = str(op_call)
                    failed_rows.append({
                        'row_index': row_index,
                        'batch': batch_idx,
                        'source_problem_ID': source_problem_ID,
                        'error': 'Invalid JSON structure',
                        'raw_response': raw_response
                    })
                    continue

                variations = op_data['variations']
                if len(variations) != variations_per_batch:
                    print(f"⚠ Warning: Expected {variations_per_batch} variations, got {len(variations)}")
                # Extra items would collide with the next batch's numbering.
                variations = variations[:variations_per_batch]

                print(f"✓ Parsed {len(variations)} variations")

                per_batch_successful = []
                for idx, variation in enumerate(variations):
                    if not isinstance(variation, dict):
                        print(f"⚠ Skipping malformed variation at position {idx + 1}")
                        continue

                    current_var_num = start_var_num + idx
                    per_batch_successful.append({
                        # Unique signature per variation, used for deduplication
                        'signature': f"{source_problem_ID}_{Pair_Number}_v{current_var_num}",
                        'source_problem_ID': source_problem_ID,
                        'Pair_Number': Pair_Number,
                        'variation_number': current_var_num,
                        'problem_text': variation.get('problem_text', ''),
                        'numerical_answer': variation.get('numerical_answer', None)
                    })

                # Save incrementally
                try:
                    saved_count = 0
                    for rec in per_batch_successful:
                        sig = rec.get('signature')
                        if sig and sig not in existing_signatures:
                            existing_records.append(rec)
                            existing_signatures.add(sig)
                            saved_count += 1

                    if saved_count > 0:
                        atomic_write_json(success_file_path, existing_records)
                        print(f"✓ Saved {saved_count} new records to file (total: {len(existing_records)})")
                    else:
                        print("✓ No new records to save (duplicates detected)")

                except Exception as e:
                    print(f"✗ Could not save file: {e}")
                    traceback.print_exc()

                # Rate limiting pause between batches
                time.sleep(2)
    finally:
        # Write the failure log even if the run is interrupted part-way.
        _write_failure_log(failure_file_path, failed_rows)

    print(f"\n{'='*73}")
    print(f"PROCESSING COMPLETE")
    print(f"Total records saved: {len(existing_records)}")
    print(f"Output file: {success_file_path}")
    print(f"{'='*73}\n")

# Entry point: executing this cell runs the whole pipeline (in a notebook
# kernel __name__ is "__main__", so the guard passes). The guard keeps main()
# from running if this code is ever imported as a module instead.
if __name__ == "__main__":
    main()


Starting processing of '9.Centre of Mass.csv'
Found 0 existing records
Output file: /content/drive/MyDrive/ProblemGeneratorBaseline/9.Centre of Mass_generated_problems.json

✓ Found CSV file at: /content/drive/MyDrive/ProblemGeneratorBaseline/9.Centre of Mass.csv

Processing Row 1
Pair_Number: 1
source_problem_ID: Centre of Mass_R1
→ Goal: Generate 10 variations (Running 2 batches)

-- Batch 1/2 --
→ Calling LLM (Attempt 1)...
✓ LLM response received
✓ Parsed 5 variations
✓ Saved 5 new records to file (total: 5)

-- Batch 2/2 --
→ Calling LLM (Attempt 1)...
✓ LLM response received
✓ Parsed 5 variations
✓ Saved 5 new records to file (total: 10)

Processing Row 2
Pair_Number: 2
source_problem_ID: Centre of Mass_R2
→ Goal: Generate 10 variations (Running 2 batches)

-- Batch 1/2 --
→ Calling LLM (Attempt 1)...
✓ LLM response received
✓ Parsed 5 variations
✓ Saved 5 new records to file (total: 15)

-- Batch 2/2 --
→ Calling LLM (Attempt 1)...
✓ LLM response received
✓ Parsed 5 variations
