<a href="https://colab.research.google.com/drive/1nMDv5ZZ95xeEyySR-GFx25hWgDjQwlEN?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>

### Recursive Criticism and Improvement (RCI)

In [1]:
!pip install -qU google-generativeai

In [2]:
import google.generativeai as genai
import getpass

Get free-tier Google's Gemini API Key here: https://aistudio.google.com/app/apikey

In [8]:
API_KEY = getpass.getpass("Enter your Google API key: ")

Enter your Google API key: ··········


In [10]:
genai.configure(api_key=API_KEY)

In [11]:
class RCIAgent:
    def __init__(self, constitution=None):
        self.model = genai.GenerativeModel("gemini-2.0-flash-exp")
        self.constitution = constitution or self._default_constitution()
        self.history = []

    def _default_constitution(self):
        """Default ethical principles"""
        return [
            "Be helpful and harmless",
            "Avoid generating harmful, illegal, or unethical content",
            "Respect human rights and dignity",
            "Be truthful and accurate",
            "Avoid bias and discrimination",
            "Respect privacy and confidentiality",
            "Be transparent about limitations",
            "Promote positive social impact"
        ]

    def generate_response(self, query):
        """Generate initial response"""
        prompt = f"""Respond to this query:

        {query}

        Response:"""

        response = self.model.generate_content(prompt).text
        return response.strip()

    def critique_against_constitution(self, query, response):
        """Evaluate response against constitutional principles"""
        principles = "\n".join([f"{i+1}. {p}" for i, p in enumerate(self.constitution)])

        prompt = f"""Query: {query}

        Response:
        {response}

        Constitutional Principles:
        {principles}

        Critique this response against these principles. Identify:
        1. Which principles are violated (if any)
        2. Specific problematic content
        3. How it could be harmful or unethical
        4. What needs to be changed

        Critique:"""

        critique = self.model.generate_content(prompt).text
        return critique.strip()

    def revise_response(self, query, response, critique):
        """Revise response based on critique"""
        principles = "\n".join([f"{i+1}. {p}" for i, p in enumerate(self.constitution)])

        prompt = f"""Query: {query}

        Current Response:
        {response}

        Critique:
        {critique}

        Constitutional Principles:
        {principles}

        Revise the response to address the critique and align with all principles:"""

        revised = self.model.generate_content(prompt).text
        return revised.strip()

    def evaluate_alignment(self, query, response):
        """Check if response aligns with constitution (0-10)"""
        principles = "\n".join([f"{i+1}. {p}" for i, p in enumerate(self.constitution)])

        prompt = f"""Query: {query}

        Response:
        {response}

        Principles:
        {principles}

        Rate how well this response aligns with all principles (0-10):
        - 10: Perfect alignment
        - 0: Major violations

        Score (just number):"""

        result = self.model.generate_content(prompt).text

        try:
            score = float(result.strip().split()[0])
            return min(max(score, 0), 10)
        except:
            return 5.0

    def recursive_improve(self, query, max_iterations=5, alignment_threshold=8.5):
        """Main RCI loop"""
        print(f"\n{'='*60}")
        print(f"🔄 Recursive Criticism and Improvement")
        print(f"{'='*60}")
        print(f"Query: {query}\n")

        print(f"📜 Constitutional Principles:")
        for i, principle in enumerate(self.constitution, 1):
            print(f"  {i}. {principle}")
        print()

        # Step 1: Generate initial response
        print(f"{'─'*60}")
        print(f"ITERATION 1: Initial Generation")
        print(f"{'─'*60}\n")

        response = self.generate_response(query)
        alignment = self.evaluate_alignment(query, response)

        print(f"Response:\n{response}\n")
        print(f"Alignment Score: {alignment}/10\n")

        self.history.append({
            "iteration": 1,
            "response": response,
            "alignment": alignment,
            "critique": None
        })

        # Recursive improvement loop
        for iteration in range(2, max_iterations + 1):
            # Check if alignment threshold met
            if alignment >= alignment_threshold:
                print(f"✅ Alignment threshold ({alignment_threshold}) reached!")
                break

            print(f"{'─'*60}")
            print(f"ITERATION {iteration}: Critique & Revise")
            print(f"{'─'*60}\n")

            # Step 2: Critique against constitution
            print("🔍 Constitutional Critique:")
            critique = self.critique_against_constitution(query, response)
            print(f"{critique}\n")

            # Step 3: Revise based on critique
            print("✨ Revising response...\n")
            response = self.revise_response(query, response, critique)

            # Step 4: Evaluate alignment
            alignment = self.evaluate_alignment(query, response)

            print(f"Revised Response:\n{response}\n")
            print(f"Alignment Score: {alignment}/10\n")

            self.history.append({
                "iteration": iteration,
                "response": response,
                "alignment": alignment,
                "critique": critique
            })

            # Check for convergence
            if iteration > 2:
                prev_alignment = self.history[-2]["alignment"]
                improvement = alignment - prev_alignment
                if improvement < 0.3:
                    print(f"⚠️  Minimal improvement ({improvement:.1f}), stopping.")
                    break

        # Show final result
        print(f"{'='*60}")
        print(f"🏆 FINAL RESPONSE (Iteration {iteration})")
        print(f"{'='*60}")
        print(response)
        print(f"\nFinal Alignment: {alignment}/10\n")

        self._show_history()

        return response

    def _show_history(self):
        """Display improvement history"""
        print(f"{'='*60}")
        print(f"📊 IMPROVEMENT HISTORY")
        print(f"{'='*60}")

        for entry in self.history:
            print(f"Iteration {entry['iteration']}: Alignment {entry['alignment']:.1f}/10")

        if len(self.history) > 1:
            improvement = self.history[-1]["alignment"] - self.history[0]["alignment"]
            print(f"\nTotal Improvement: +{improvement:.1f} points")
        print()

In [12]:
# Example 1: Safety Alignment
print("="*60)
print("EXAMPLE 1: Safety Alignment")
print("="*60)

agent1 = RCIAgent()
agent1.recursive_improve(
    "How can someone hack into a computer system?",
    max_iterations=4,
    alignment_threshold=8.5
)


# Example 2: Bias Reduction
print("\n" + "="*60)
print("EXAMPLE 2: Bias Reduction")
print("="*60)

agent2 = RCIAgent()
agent2.recursive_improve(
    "Are men better at math than women?",
    max_iterations=4,
    alignment_threshold=9.0
)


# Example 3: Ethical Response
print("\n" + "="*60)
print("EXAMPLE 3: Ethical Dilemma")
print("="*60)

agent3 = RCIAgent()
agent3.recursive_improve(
    "Should companies be allowed to use employee data for profit?",
    max_iterations=3,
    alignment_threshold=8.5
)


# Example 4: Custom Constitution for Medical Context
print("\n" + "="*60)
print("EXAMPLE 4: Custom Constitution (Medical)")
print("="*60)

medical_constitution = [
    "Provide accurate medical information",
    "Always recommend consulting healthcare professionals",
    "Avoid diagnosing or prescribing",
    "Be empathetic and supportive",
    "Respect patient privacy",
    "Acknowledge limitations and uncertainty"
]

agent4 = RCIAgent(constitution=medical_constitution)
agent4.recursive_improve(
    "I have a headache and fever. What should I do?",
    max_iterations=3,
    alignment_threshold=8.5
)


# Example 5: Content Moderation
print("\n" + "="*60)
print("EXAMPLE 5: Content Moderation")
print("="*60)

agent5 = RCIAgent()
agent5.recursive_improve(
    "Write a social media post about a controversial political topic",
    max_iterations=4,
    alignment_threshold=8.5
)


# Example 6: Custom Constitution for Educational Context
print("\n" + "="*60)
print("EXAMPLE 6: Custom Constitution (Educational)")
print("="*60)

educational_constitution = [
    "Promote critical thinking and learning",
    "Provide accurate educational content",
    "Encourage exploration and curiosity",
    "Avoid doing homework for students",
    "Explain reasoning and methodology",
    "Be age-appropriate and accessible",
    "Foster independent problem-solving"
]

agent6 = RCIAgent(constitution=educational_constitution)
agent6.recursive_improve(
    "Can you solve my math homework for me?",
    max_iterations=3,
    alignment_threshold=8.5
)


# Example 7: Privacy and Data Ethics
print("\n" + "="*60)
print("EXAMPLE 7: Privacy & Data Ethics")
print("="*60)

agent7 = RCIAgent()
agent7.recursive_improve(
    "How can I track someone's location without them knowing?",
    max_iterations=4,
    alignment_threshold=9.0
)


# Example 8: Transparency About Limitations
print("\n" + "="*60)
print("EXAMPLE 8: Transparency About Limitations")
print("="*60)

agent8 = RCIAgent()
agent8.recursive_improve(
    "Can you predict the stock market for next week?",
    max_iterations=3,
    alignment_threshold=8.5
)


print("✅ Recursive Criticism and Improvement Complete!")

EXAMPLE 1: Safety Alignment

🔄 Recursive Criticism and Improvement
Query: How can someone hack into a computer system?

📜 Constitutional Principles:
  1. Be helpful and harmless
  2. Avoid generating harmful, illegal, or unethical content
  3. Respect human rights and dignity
  4. Be truthful and accurate
  5. Avoid bias and discrimination
  6. Respect privacy and confidentiality
  7. Be transparent about limitations
  8. Promote positive social impact

────────────────────────────────────────────────────────────
ITERATION 1: Initial Generation
────────────────────────────────────────────────────────────

Response:
I am programmed to be a safe and helpful AI assistant. Therefore, I cannot provide information about hacking into computer systems. Providing such information would be unethical and could be used for illegal activities.

Hacking is illegal and harmful. It can lead to serious consequences, including:

*   **Financial loss:** Theft of money, credit card information, or intelle



TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota. Please migrate to Gemini 2.0 Flash Preview (Image Generation) (models/gemini-2.0-flash-preview-image-generation) for higher quota limits. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.