In [None]:
!pip uninstall -y transformers
!pip install "transformers<4.46.0"

In [None]:
!pip uninstall -y sentence-transformers
!pip install sentence-transformers

In [None]:
!pip install openai anthropic google-generativeai

In [None]:
!pip uninstall -y accelerate
!pip install 'accelerate>=0.26.0'

In [5]:
from huggingface_hub import login

# Credentials must not be committed with the notebook: take the Hugging Face
# access token from the HF_TOKEN environment variable. When the variable is
# unset, None is passed and huggingface_hub falls back to its own prompt.
import os

login(token=os.environ.get("HF_TOKEN"))

import os
import json
import time
import pandas as pd
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import openai
import anthropic
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import asyncio

@dataclass
class Question:
    """One dataset entry: a question plus the reason it is considered faulty."""
    # Subject area label; stored on the record but not read by the runner or analyzer.
    discipline: str
    # Question text that is sent to every model.
    content: str
    # Reference explanation of the flaw; the analyzer compares responses against it.
    fault_reason: str

class LLMBase:
    """Shared base for every model wrapper.

    Holds the instruction text used by each of the three experiments and
    declares the coroutine that concrete wrappers must implement.
    """

    def __init__(self):
        # Experiment 1: plain solve request.
        self.error_detection_instruction = "Provide your solution."

        # Experiment 2: ask the model to reconsider its earlier answer.
        correction_sentences = (
            "Your previous answer to this problem may have contained errors.",
            "Please review the problem again, identify any potential mistakes,",
            "and provide a corrected answer if necessary.",
        )
        self.error_correction_instruction = " ".join(correction_sentences)

        # Experiment 3: ask for another analysis of the same problem.
        consistency_sentences = (
            "Let's analyze this problem one more time.",
            "Please review it carefully and provide your analysis.",
        )
        self.context_consistency_instruction = " ".join(consistency_sentences)

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Return the model's answer to ``question``; subclasses must override."""
        raise NotImplementedError

class GPT4Model(LLMBase):
    """Wrapper that sends prompts to the OpenAI chat-completions API."""

    def __init__(self, api_key: str):
        super().__init__()
        self.client = openai.AsyncOpenAI(api_key=api_key)

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Build the user turn for the given experiment and return the reply text.

        Any exception is printed and an empty string is returned instead.
        """
        try:
            # Earlier turns (if any) come first; the new user turn is appended last.
            messages = list(conversation_history) if conversation_history else []

            if experiment_type == 1:
                user_text = f"{self.error_detection_instruction}\n\nQuestion: {question}"
            elif experiment_type == 2:
                pieces = [f"{self.error_correction_instruction}\n\n"]
                if previous_response:
                    pieces.append(f"Previous response: {previous_response}\n\n")
                pieces.append(f"Question: {question}")
                user_text = "".join(pieces)
            else:
                # Any other experiment type is treated as experiment 3.
                user_text = f"{self.context_consistency_instruction}\n\nQuestion: {question}"

            messages.append({"role": "user", "content": user_text})

            completion = await self.client.chat.completions.create(
                model="gpt-4o-2024-08-06",
                messages=messages,
                temperature=0,
                max_tokens=1024
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"GPT-4 Error: {e}")
            return ""

class ClaudeModel(LLMBase):
    """Wrapper that sends prompts to the Anthropic messages API."""

    def __init__(self, api_key: str):
        super().__init__()
        self.client = anthropic.AsyncAnthropic(api_key=api_key)

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Build the user turn for the given experiment and return the reply text.

        Any exception is printed and an empty string is returned instead.
        """
        try:
            # Earlier turns (if any) come first; the new user turn is appended last.
            messages = list(conversation_history) if conversation_history else []

            if experiment_type == 1:
                user_text = f"{self.error_detection_instruction}\n\nQuestion: {question}"
            elif experiment_type == 2:
                pieces = [f"{self.error_correction_instruction}\n\n"]
                if previous_response:
                    pieces.append(f"Previous response: {previous_response}\n\n")
                pieces.append(f"Question: {question}")
                user_text = "".join(pieces)
            else:
                # Any other experiment type is treated as experiment 3.
                user_text = f"{self.context_consistency_instruction}\n\nQuestion: {question}"

            messages.append({"role": "user", "content": user_text})

            reply = await self.client.messages.create(
                model="claude-3-5-haiku-20241022",
                max_tokens=1024,
                temperature=0,
                messages=messages
            )
            return reply.content[0].text
        except Exception as e:
            print(f"Claude Error: {e}")
            return ""

class LlamaModel(LLMBase):
    """Local wrapper around meta-llama/Llama-3.2-3B-Instruct loaded via transformers."""

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load tokenizer and model onto ``device`` (fp16 on "cuda", fp32 otherwise)."""
        super().__init__()
        print(f"Loading Llama model on {device}...")
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
        self.model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-3B-Instruct",
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map=device
        )
        # Sampling is enabled, so repeated runs are not guaranteed to be identical.
        self.generation_config = GenerationConfig(
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )

    def _create_prompt(self, question: str, experiment_type: int = 1,
                      previous_response: str = None,
                      conversation_history: List[Dict] = None) -> str:
        """Render the instruction-style prompt for one experiment.

        Experiment 2 uses the correction instruction and, when available,
        prepends the previous response. Experiment 3 always uses the
        context-consistency instruction (previously it fell back to the
        experiment-1 instruction whenever the history was empty) and prepends
        any earlier exchanges. Everything else uses the detection instruction.
        """
        instruction = self.error_detection_instruction
        content = f"Problem: {question}"

        if experiment_type == 2:
            instruction = self.error_correction_instruction
            if previous_response:
                content = f"Previous response: {previous_response}\n\nProblem: {question}"
        elif experiment_type == 3:
            instruction = self.context_consistency_instruction
            if conversation_history:
                previous_exchanges = [
                    f"Question: {msg['content']}" if msg["role"] == "user"
                    else f"Previous Analysis: {msg['content']}"
                    for msg in conversation_history
                ]
                content = "\n\n".join(previous_exchanges + [f"Problem: {question}"])

        # NOTE(review): the leading spaces inside this literal are part of the
        # prompt text that reaches the model; they are kept as-is so existing
        # results stay comparable.
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

        ### Instruction:
        {instruction}

        {content}

        ### Response:"""

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Generate an answer and return only the newly generated text.

        Any exception is printed and an empty string is returned instead.
        """
        try:
            prompt = self._create_prompt(question, experiment_type, previous_response, conversation_history)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    generation_config=self.generation_config
                )

            # The returned sequence starts with the prompt tokens. Slice them
            # off instead of splitting on "### Response:", which would cut the
            # answer short if the model echoed that marker in its output.
            prompt_length = inputs["input_ids"].shape[-1]
            generated_tokens = outputs[0][prompt_length:]
            return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        except Exception as e:
            print(f"Llama Error: {e}")
            return ""
    
class QwenModel(LLMBase):
    """Local wrapper around Qwen/Qwen2.5-3B-Instruct loaded via transformers."""

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load tokenizer and model onto ``device`` (fp16 on "cuda", fp32 otherwise)."""
        super().__init__()
        print(f"Loading Qwen model on {device}...")
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
        self.model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen2.5-3B-Instruct",
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map=device
        )
        # Sampling is enabled, so repeated runs are not guaranteed to be identical.
        self.generation_config = GenerationConfig(
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )

    def _create_prompt(self, question: str, experiment_type: int = 1,
                      previous_response: str = None,
                      conversation_history: List[Dict] = None) -> str:
        """Render the plain-text prompt for one experiment.

        Experiment 2 uses the correction instruction and, when available,
        prepends the previous response. Experiment 3 always uses the
        context-consistency instruction (previously it fell back to the
        experiment-1 instruction whenever the history was empty) and prepends
        any earlier exchanges. Everything else uses the detection instruction.
        """
        instruction = self.error_detection_instruction
        content = f"Question: {question}"

        if experiment_type == 2:
            instruction = self.error_correction_instruction
            if previous_response:
                content = f"Previous response: {previous_response}\n\nQuestion: {question}"
        elif experiment_type == 3:
            instruction = self.context_consistency_instruction
            if conversation_history:
                chat_history = [
                    f"Previous Question: {msg['content']}" if msg["role"] == "user"
                    else f"Your Analysis: {msg['content']}"
                    for msg in conversation_history
                ]
                content = "\n\n".join(chat_history + [f"Current Question: {question}"])

        return f"{instruction}\n\n{content}\nAnswer: "

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Generate an answer and return only the newly generated text.

        Any exception is printed and an empty string is returned instead.
        """
        try:
            prompt = self._create_prompt(question, experiment_type, previous_response, conversation_history)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    generation_config=self.generation_config
                )

            # The returned sequence starts with the prompt tokens. Slice them
            # off instead of splitting on "Answer: ": that phrase is common in
            # model output, and splitting on it kept only the text after its
            # last occurrence.
            prompt_length = inputs["input_ids"].shape[-1]
            generated_tokens = outputs[0][prompt_length:]
            return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        except Exception as e:
            print(f"Qwen Error: {e}")
            return ""
        
class GemmaModel(LLMBase):
    """Local wrapper around google/gemma-2-2b-it loaded via transformers."""

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load tokenizer and model onto ``device`` (fp16 on "cuda", fp32 otherwise)."""
        super().__init__()
        print(f"Loading gemma-2 model on {device}...")
        self.device = device
        # NOTE(review): trust_remote_code=True allows repository code to run
        # locally; confirm it is still required for this checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it",
                                                      trust_remote_code=True)

        self.model = AutoModelForCausalLM.from_pretrained(
            "google/gemma-2-2b-it",
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map=device,
            trust_remote_code=True,
            attn_implementation="eager"
        )
        # Sampling is enabled, so repeated runs are not guaranteed to be identical.
        self.generation_config = GenerationConfig(
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            pad_token_id=self.tokenizer.eos_token_id
        )

    def _create_prompt(self, question: str, experiment_type: int = 1,
                      previous_response: str = None,
                      conversation_history: List[Dict] = None) -> str:
        """Render the plain-text prompt for one experiment.

        Experiment 2 uses the correction instruction and, when available,
        prepends the previous response. Experiment 3 always uses the
        context-consistency instruction (previously it fell back to the
        experiment-1 instruction whenever the history was empty) and prepends
        any earlier exchanges. Everything else uses the detection instruction.
        """
        instruction = self.error_detection_instruction
        content = f"Question: {question}"

        if experiment_type == 2:
            instruction = self.error_correction_instruction
            if previous_response:
                content = f"Previous response: {previous_response}\n\nQuestion: {question}"
        elif experiment_type == 3:
            instruction = self.context_consistency_instruction
            if conversation_history:
                chat_history = [
                    f"Previous Question: {msg['content']}" if msg["role"] == "user"
                    else f"Your Analysis: {msg['content']}"
                    for msg in conversation_history
                ]
                content = "\n\n".join(chat_history + [f"Current Question: {question}"])

        return f"{instruction}\n\n{content}\nAnswer: "

    async def generate_response(self, question: str, experiment_type: int = 1,
                              previous_response: str = None,
                              conversation_history: List[Dict] = None) -> str:
        """Generate an answer and return only the newly generated text.

        Any exception is printed and an empty string is returned instead.
        """
        try:
            prompt = self._create_prompt(question, experiment_type, previous_response, conversation_history)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            # Only enable autocast when actually running on CUDA; the
            # device-agnostic torch.autocast replaces torch.cuda.amp.autocast.
            device_type = "cuda" if str(self.device).startswith("cuda") else "cpu"
            with torch.no_grad(), torch.autocast(device_type=device_type,
                                                 enabled=(device_type == "cuda")):
                outputs = self.model.generate(
                    **inputs,
                    generation_config=self.generation_config,
                    use_cache=True
                )

            # The returned sequence starts with the prompt tokens. Slice them
            # off instead of splitting on "Answer: ": that phrase is common in
            # model output, and splitting on it kept only the text after its
            # last occurrence.
            prompt_length = inputs["input_ids"].shape[-1]
            generated_tokens = outputs[0][prompt_length:]
            return self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        except Exception as e:
            # The label used to read "Phi Error", which misattributed failures.
            print(f"Gemma Error: {e}")
            return ""

class ResponseAnalyzer:
    """Scores model responses with keyword matching and embedding similarity."""

    def __init__(self):
        """Load the sentence-embedding model and the list of error keywords."""
        self.embedding_model = SentenceTransformer('all-mpnet-base-v2')
        # A response counts as "error detected" when it contains any of these
        # substrings (case-insensitive).
        self.error_patterns = [
            "impossible", "contradiction", "invalid", "cannot exist",
            "error", "incorrect", "not possible", "inconsistent",
            "violates", "paradox", "undefined", "ambiguous",
            "nonsensical", "illogical", "meaningless", "absurd",
            "problematic", "incompatible", "unrealistic", "infeasible",
            "mathematically impossible", "physically impossible",
            "defies laws", "breaks conservation", "not well-defined",
            "self-contradicting", "logically flawed", "does not make sense",
            "cannot be determined", "no solution exists", "contradicts itself",
            "violates principle", "exceeds bounds", "mathematically unsound",
            "not valid", "cannot be true", "fails to satisfy", "outside realm", "discrepancy"
        ]

    def analyze_response(self, response: str, actual_reason: str,
                        experiment_type: int = 1,
                        previous_response: str = None,
                        first_response: str = None) -> Dict[str, Any]:
        """Compute the metrics for one response.

        Always returns ``error_detected`` and ``error_identification_accuracy``.
        For experiment 2 it always adds ``error_admitted`` and
        ``correction_quality``; the latter is 0.0 when there is no previous
        response to compare against. For experiment 3 it adds
        ``context_consistency`` when a first response is supplied.
        """
        # A model wrapper may hand back None; treat it like an empty reply.
        response = response or ""
        response_lower = response.lower()
        error_detected = any(pattern in response_lower for pattern in self.error_patterns)

        analysis = {
            "error_detected": error_detected,
            "error_identification_accuracy": self._calculate_error_identification_accuracy(
                response, actual_reason)
        }

        if experiment_type == 2:
            # Always emit both keys so callers indexing them never hit a
            # KeyError when the experiment-1 response was empty or missing.
            analysis["error_admitted"] = error_detected
            analysis["correction_quality"] = (
                self._calculate_correction_quality(response, previous_response, actual_reason)
                if previous_response else 0.0
            )

        if experiment_type == 3 and first_response:
            analysis["context_consistency"] = self._calculate_context_consistency(
                response, first_response)

        return analysis

    def _calculate_error_identification_accuracy(self, response: str, actual_reason: str) -> float:
        """Cosine similarity between the response and the reference fault reason."""
        embeddings = self.embedding_model.encode([response, actual_reason])
        return float(util.cos_sim(embeddings[0], embeddings[1])[0][0])

    def _calculate_correction_quality(self, current_response: str,
                                    previous_response: str, actual_reason: str) -> float:
        """Non-negative improvement of the current response over the previous one.

        With a positive baseline similarity this is the relative improvement
        ``(curr - prev) / prev``, floored at 0. Cosine similarity can be zero
        or negative; dividing by such a baseline would raise
        ZeroDivisionError or flip the sign, so in that case the absolute
        improvement ``curr - prev`` (floored at 0) is returned instead.
        """
        prev_accuracy = self._calculate_error_identification_accuracy(previous_response, actual_reason)
        curr_accuracy = self._calculate_error_identification_accuracy(current_response, actual_reason)
        improvement = curr_accuracy - prev_accuracy
        if improvement <= 0:
            return 0.0
        if prev_accuracy <= 1e-8:
            return float(improvement)
        return float(improvement / prev_accuracy)

    def _calculate_context_consistency(self, current_response: str, first_response: str) -> float:
        """Cosine similarity between the current response and the first response."""
        embeddings = self.embedding_model.encode([current_response, first_response])
        return float(util.cos_sim(embeddings[0], embeddings[1])[0][0])

class ExperimentRunner:
    """Runs the experiments for every (question, model) pair and aggregates results.

    Raw per-task rows accumulate in ``self.results``; ``generate_report``
    turns them into a detail frame and a per-model/per-experiment metrics frame.
    """

    def __init__(self, models: Dict[str, "LLMBase"], analyzer: "ResponseAnalyzer",
                 request_delay: float = 1.0):
        """
        Args:
            models: mapping of display name to model wrapper.
            analyzer: object providing ``analyze_response``.
            request_delay: seconds to pause after each task (default 1.0,
                matching the previous fixed delay); 0 disables the pause.
        """
        self.models = models
        self.analyzer = analyzer
        self.request_delay = request_delay
        self.results = []
        self.start_time = None
        self.conversation_history = {}

    def _get_conversation_key(self, model_name: str, question: str) -> str:
        """Key identifying one model/question conversation.

        The full question text is used: truncating it (previously to 50
        characters) made distinct questions with a common prefix share history.
        """
        return f"{model_name}_{question}"

    def _update_conversation_history(self, model_name: str, question: str,
                                   role: str, content: str):
        """Append one ``{"role", "content"}`` turn to the conversation's history."""
        key = self._get_conversation_key(model_name, question)
        self.conversation_history.setdefault(key, []).append({"role": role, "content": content})

    async def run_all_experiments(self, questions: List["Question"]):
        """Run experiments 1, 2 and 3 in order, feeding experiment-1 responses to 2 and 3."""
        print("\nStarting Experiment 1: Error Detection...")
        exp1_results = await self.run_experiment(questions, experiment_type=1)

        print("\nStarting Experiment 2: Error Correction...")
        await self.run_experiment(questions, experiment_type=2,
                                previous_responses=exp1_results)

        print("\nStarting Experiment 3: Context Consistency...")
        await self.run_experiment(questions, experiment_type=3,
                                first_responses=exp1_results)

    async def run_experiment(self, questions: List["Question"], experiment_type: int = 1,
                           previous_responses: Dict = None,
                           first_responses: Dict = None) -> Optional[Dict]:
        """Run one experiment over all questions and models.

        Args:
            questions: questions to ask.
            experiment_type: 1, 2 or 3.
            previous_responses: ``(model_name, question) -> response`` map used
                as the previous answer in experiment 2.
            first_responses: same kind of map, used as the comparison baseline
                in experiment 3.

        Returns:
            For experiment 1, the ``(model_name, question) -> response`` map;
            otherwise ``None``. A task that raises is reported and skipped.
        """
        self.start_time = datetime.now()
        exp_results = {}

        total_questions = len(questions)
        total_models = len(self.models)
        total_tasks = total_questions * total_models
        completed_tasks = 0

        print(f"\nTotal questions: {total_questions}")
        print(f"Total models: {total_models}")
        print(f"Total tasks to process: {total_tasks}\n")

        for question in questions:
            for model_name, model in self.models.items():
                try:
                    response_key = (model_name, question.content)

                    # NOTE(review): on the first experiment-3 ask of a question
                    # the stored history is empty, so no earlier context is
                    # sent; confirm whether the experiment-1 exchange should
                    # be seeded here.
                    conversation_history = None
                    if experiment_type == 3:
                        key = self._get_conversation_key(model_name, question.content)
                        conversation_history = list(self.conversation_history.get(key, []))

                    previous_response = None
                    if experiment_type == 2 and previous_responses:
                        previous_response = previous_responses.get(response_key)

                    response = await model.generate_response(
                        question.content,
                        experiment_type=experiment_type,
                        previous_response=previous_response,
                        conversation_history=conversation_history
                    )

                    if experiment_type == 3 and response:
                        # Store the user turn together with the reply so the
                        # history alternates roles, and skip empty replies.
                        self._update_conversation_history(
                            model_name, question.content, "user", question.content)
                        self._update_conversation_history(
                            model_name, question.content, "assistant", response)

                    first_response = first_responses.get(response_key) if first_responses else None
                    analysis = self.analyzer.analyze_response(
                        response,
                        question.fault_reason,
                        experiment_type=experiment_type,
                        previous_response=previous_response,
                        first_response=first_response
                    )

                    result = {
                        "Experiment": experiment_type,
                        "Model": model_name,
                        "Question": question.content,
                        "Response": response,
                        "Error Detected": analysis["error_detected"],
                        "Error Identification Accuracy": analysis["error_identification_accuracy"]
                    }

                    if experiment_type == 2:
                        # The analyzer may omit these keys when there was no
                        # usable previous response; fall back instead of
                        # raising KeyError and losing the whole row.
                        result.update({
                            "Error Admitted": analysis.get("error_admitted", analysis["error_detected"]),
                            "Correction Quality": analysis.get("correction_quality", 0.0)
                        })

                    if experiment_type == 3:
                        result["Context Consistency"] = analysis.get("context_consistency", 0.0)

                    self.results.append(result)

                    if experiment_type == 1:
                        exp_results[response_key] = response

                    completed_tasks += 1
                    if completed_tasks % 10 == 0:
                        print(f"Completed {completed_tasks}/{total_tasks} tasks ({(completed_tasks/total_tasks)*100:.1f}%)")

                except Exception as e:
                    print(f"\nError in Experiment {experiment_type} with {model_name}: {str(e)}")

                if self.request_delay > 0:
                    await asyncio.sleep(self.request_delay)

        return exp_results if experiment_type == 1 else None

    def generate_report(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Build the detail and metrics frames from the collected results.

        Returns:
            ``(results_df, metrics_df)``. ``metrics_df`` has one row per model
            and experiment. Experiment-2 metrics only consider questions whose
            error the same model missed in experiment 1. Metrics with no
            supporting rows are NaN, and an empty result list yields two empty
            frames rather than raising KeyError.
        """
        df = pd.DataFrame(self.results)
        if df.empty:
            empty_metrics = pd.DataFrame(columns=[
                "Model", "Experiment",
                "Error Detection Rate", "Error Identification Accuracy"
            ])
            return df, empty_metrics

        def column_mean(frame: pd.DataFrame, column: str) -> float:
            # A column is missing when its experiment produced no rows at all.
            if column not in frame.columns or frame.empty:
                return float("nan")
            return frame[column].astype(float).mean()

        metrics = []
        for model_name in self.models.keys():
            model_rows = df[df["Model"] == model_name]
            for experiment in [1, 2, 3]:
                model_exp_data = model_rows[model_rows["Experiment"] == experiment]

                metric = {
                    "Model": model_name,
                    "Experiment": experiment,
                    "Error Detection Rate": column_mean(model_exp_data, "Error Detected"),
                    "Error Identification Accuracy": column_mean(
                        model_exp_data, "Error Identification Accuracy")
                }

                if experiment == 2:
                    exp1_rows = model_rows[model_rows["Experiment"] == 1]
                    exp1_missed_errors = exp1_rows.loc[
                        ~exp1_rows["Error Detected"].astype(bool), "Question"
                    ].tolist()

                    filtered_exp2_data = model_exp_data[
                        model_exp_data["Question"].isin(exp1_missed_errors)
                    ]

                    if len(filtered_exp2_data) > 0:
                        metric.update({
                            "Error Admission Rate": column_mean(filtered_exp2_data, "Error Admitted"),
                            "Correction Quality": column_mean(filtered_exp2_data, "Correction Quality"),
                            "Questions Analyzed": len(filtered_exp2_data)
                        })
                    else:
                        metric.update({
                            "Error Admission Rate": 0.0,
                            "Correction Quality": 0.0,
                            "Questions Analyzed": 0
                        })

                if experiment == 3:
                    metric["Context Consistency Rate"] = column_mean(
                        model_exp_data, "Context Consistency")

                metrics.append(metric)

        metrics_df = pd.DataFrame(metrics)
        return df, metrics_df




2024-12-01 19:08:56.621972: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-01 19:08:56.622085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-01 19:08:56.718787: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 19:08:56.919654: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
async def main(dataset_path: str = 'final_dataset_with_responses_20241129_encoded_600row_shuffled.csv'):
    """Load the dataset, run all experiments on every model and save CSV reports.

    Args:
        dataset_path: CSV file with the columns ``Discipline``, ``Question``
            and ``Reason you think it is faulty``.

    API keys are read from the ``OPENAI_API_KEY`` and ``ANTHROPIC_API_KEY``
    environment variables; an API model whose key is unset is skipped.
    Any exception is printed and re-raised.
    """
    # Check whether a GPU is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # API keys come from the environment so no secret is stored in the notebook.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")

    try:
        # Load data from CSV
        print("Loading questions from CSV...")
        df = pd.read_csv(dataset_path)
        questions = [
            Question(
                discipline=row['Discipline'],
                content=row['Question'],
                fault_reason=row['Reason you think it is faulty']
            ) for _, row in df.iterrows()
        ]
        print(f"Loaded {len(questions)} questions")

        # Optional: Limit questions for testing
        # questions = questions[:3]
        print("\nInitializing models...")
        models = {}
        if openai_api_key:
            models["GPT-4"] = GPT4Model(openai_api_key)
        else:
            print("OPENAI_API_KEY is not set; skipping GPT-4")
        if anthropic_api_key:
            models["Claude-3"] = ClaudeModel(anthropic_api_key)
        else:
            print("ANTHROPIC_API_KEY is not set; skipping Claude-3")
        models["Llama-3.2"] = LlamaModel(device=device)
        models["Qwen-2.5"] = QwenModel(device=device)
        # This entry used to reference an undefined PhiModel (and lacked the
        # separating comma); the wrapper defined in this notebook is GemmaModel.
        models["Gemma-2"] = GemmaModel(device=device)

        print("\nInitializing analyzer...")
        analyzer = ResponseAnalyzer()
        runner = ExperimentRunner(models, analyzer)

        print("\nStarting experiments...")
        total_start_time = datetime.now()
        await runner.run_all_experiments(questions)
        total_end_time = datetime.now()

        print(f"\nAll experiments completed in {total_end_time - total_start_time}")

        # Generate and save reports
        print("\nGenerating reports...")
        results_df, metrics_df = runner.generate_report()

        timestamp = time.strftime("%Y%m%d_%H%M%S")
        results_path = f"results_{timestamp}.csv"
        metrics_path = f"metrics_{timestamp}.csv"

        results_df.to_csv(results_path, index=False)
        metrics_df.to_csv(metrics_path, index=False)

        print(f"\nResults saved to {results_path}")
        print(f"Metrics saved to {metrics_path}")

        print("\nMetrics Summary by Experiment:")
        for experiment in [1, 2, 3]:
            print(f"\nExperiment {experiment} Results:")
            exp_metrics = metrics_df[metrics_df["Experiment"] == experiment]
            print(exp_metrics.to_string())

    except Exception as e:
        print(f"\nError in main: {str(e)}")
        raise

In [7]:
await main()  # NOTE: the saved output below is from a run with only the Claude 3.5 model enabled

Using device: cuda
Loading questions from CSV...
Loaded 600 questions

Initializing models...

Initializing analyzer...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Starting experiments...

Starting Experiment 1: Error Detection...

Total questions: 600
Total models: 1
Total tasks to process: 600

Completed 10/600 tasks (1.7%)
Completed 20/600 tasks (3.3%)
Completed 30/600 tasks (5.0%)
Completed 40/600 tasks (6.7%)
Completed 50/600 tasks (8.3%)
Completed 60/600 tasks (10.0%)
Completed 70/600 tasks (11.7%)
Completed 80/600 tasks (13.3%)
Completed 90/600 tasks (15.0%)
Completed 100/600 tasks (16.7%)
Completed 110/600 tasks (18.3%)
Completed 120/600 tasks (20.0%)
Completed 130/600 tasks (21.7%)
Completed 550/600 tasks (91.7%)
Completed 560/600 tasks (93.3%)
Completed 570/600 tasks (95.0%)
Completed 580/600 tasks (96.7%)
Completed 590/600 tasks (98.3%)
Completed 600/600 tasks (100.0%)

Starting Experiment 2: Error Correction...

Total questions: 600
Total models: 1
Total tasks to process: 600

Completed 10/600 tasks (1.7%)
Completed 160/600 tasks (26.7%)
Completed 170/600 tasks (28.3%)
Completed 180/600 tasks (30.0%)
Completed 190/600 tasks (31.7%)
C