In [None]:

import os
import google.generativeai as genai
import time
import csv
import logging
import json
from tqdm import tqdm

# Setup logging
# Every record at INFO or above is written both to bias_analysis.log and to
# the console stream, using one shared timestamped format.
logging.basicConfig(
    handlers=[
        logging.FileHandler("bias_analysis.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

# Configuration
# Prefer a key supplied through the environment so the secret does not have to
# be written into the source; the literal placeholder is kept as the fallback.
# NOTE(review): the variable name GEMINI_API_KEY is a convention chosen here —
# confirm it matches how the key is exported in your environment.
API_KEY = os.environ.get("GEMINI_API_KEY", 'API_KEY')
MODEL_NAME = "gemini-2.0-flash"  # Verify correct model name
INPUT_CSV = "/kaggle/input/machine-transulation/enhanced_bias_report_all_languages.csv"
OUTPUT_CSV = "output_test.csv"  # Destination file for the annotated rows
MAX_RETRIES = 3  # Attempts per row before giving up on the model call
RETRY_DELAY = 2  # seconds
TEST_ROWS = None  # Max number of rows to process; None means no limit

# Initialize GenAI
# Register the API key with the SDK first, then build the single shared model
# handle that analyze_translation() sends its prompts to.
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(model_name=MODEL_NAME)

def _strip_code_fence(text):
    """Return the payload inside the first Markdown code fence, or text as-is."""
    # Try the explicit "```json" fence first, then fall back to a bare fence.
    for fence in ("```json", "```"):
        if fence in text:
            return text.split(fence, 1)[1].split("```", 1)[0].strip()
    return text


def analyze_translation(reference, translation, bias_flags):
    """Analyze a translation for biases using the Gemini model.

    Builds a prompt from the three arguments, sends it to the module-level
    ``model`` and parses the reply as JSON. The call is attempted up to
    MAX_RETRIES times, pausing RETRY_DELAY seconds between attempts.

    Args:
        reference: Reference text the translation is compared against.
        translation: Translated text to inspect.
        bias_flags: Bias types to focus on; interpolated into the prompt as-is.

    Returns:
        The parsed JSON reply from the model. If every attempt fails, a
        fallback dict with ``bias_detected`` False and empty
        ``detected_biases`` / ``reasons`` lists.
    """
    prompt = f"""
Analyze this translation for potential biases. Compare with the reference text.
Focus on bias types {bias_flags}.

Reference: {reference}
Translation: {translation}

Return JSON format with:
1. 'bias_detected' (boolean)
2. 'detected_biases' (list of bias types if any)
3. 'reasons' (detailed explanations for each detected bias)

Example response:
{{
  "bias_detected": true,
  "detected_biases": ["cultural", "gender"],
  "reasons": [
    "Cultural bias: Assumes Western context...",
    "Gender bias: Uses masculine pronouns..."
  ]
}}
"""

    for attempt in range(MAX_RETRIES):
        try:
            response = model.generate_content(prompt)
            response_text = (response.text or "").strip()
            if not response_text:
                # Treat an empty reply like any other failure so it is logged
                # and followed by the usual delay instead of retrying silently.
                raise ValueError("Empty response from model")

            # The model may wrap its JSON in a Markdown code fence.
            return json.loads(_strip_code_fence(response_text))
        except Exception as e:
            # Broad on purpose: API errors and malformed JSON are both retried.
            logger.warning(f"Attempt {attempt+1} failed: {str(e)}")
            # No point sleeping when there is no further attempt to wait for.
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    logger.error("Max retries reached for entry")
    return {"bias_detected": False, "detected_biases": [], "reasons": []}

def process_csv():
    """Annotate rows of INPUT_CSV with a bias analysis and write OUTPUT_CSV.

    Each input row is copied to the output with an extra 'analysis' column
    holding the JSON-encoded result of analyze_translation() for the row's
    'Reference', 'Translation' and 'bias_flags' columns. If a row cannot be
    processed, the column holds a JSON object with an "error" message instead.

    When TEST_ROWS is None every row is processed; otherwise processing stops
    after the first TEST_ROWS rows.
    """
    # newline='' on both files lets the csv module handle embedded line breaks
    # inside quoted fields itself, as the csv documentation recommends.
    with open(INPUT_CSV, 'r', encoding='utf-8', newline='') as infile, \
         open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        # fieldnames is None for an empty input file; fall back to no columns.
        fieldnames = list(reader.fieldnames or [])
        # Avoid a duplicated header if the input already has this column.
        if 'analysis' not in fieldnames:
            fieldnames.append('analysis')
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        # total=None (no limit) leaves the progress bar open-ended.
        progress = tqdm(enumerate(reader), total=TEST_ROWS,
                        desc="Analyzing translations")
        for i, row in progress:
            if TEST_ROWS is not None and i >= TEST_ROWS:
                break

            try:
                analysis = analyze_translation(
                    row['Reference'],
                    row['Translation'],
                    row['bias_flags']
                )

                # Convert analysis dict to JSON string
                row['analysis'] = json.dumps(analysis, ensure_ascii=False)

            except Exception as e:
                # Record the failure in the output and carry on with the
                # remaining rows rather than aborting the whole run.
                logger.error(f"Error processing row {i+1}: {str(e)}")
                row['analysis'] = json.dumps({"error": str(e)})

            writer.writerow(row)
            # Runs are long; flush so finished rows survive an interruption.
            outfile.flush()

if __name__ == "__main__":
    # Script entry point: run the analysis, then report where results went.
    process_csv()
    logger.info(f"Test analysis complete. Results saved to {OUTPUT_CSV}")
    # Only mention a row limit when one is configured; with TEST_ROWS unset
    # the old message read "Processed first None rows only".
    if TEST_ROWS is None:
        logger.info("Processed all rows")
    else:
        logger.info(f"Processed first {TEST_ROWS} rows only")

Analyzing translations: 1439it [33:09,  1.38s/it]
