In [None]:
! pip install openai langchain_community langchain_huggingface

In [None]:
! pip install torch torchvision
! pip install pandas

# Emotion Classification using RAG and Reasoning
## Overview
This notebook implements an **emotion classification** pipeline using a **Retrieval-Augmented Generation (RAG)** approach and in-context learning. The goal is to classify the emotions present in a text and assign an intensity level to each of them.

## Steps in the Notebook:
1. **Initialize OpenAI API and Setup Cache**  
2. **Load Sentence Transformer Embeddings for Vector Storage**  
3. **Retrieve Similar Training Examples from FAISS Vector Store**  
4. **Construct a System Prompt for Emotion Analysis**  
5. **Use OpenAI Model to Predict Emotions and Justifications**  
6. **Batch Processing of CSV Files for Emotion Classification**  
7. **Save Results and Run Interactive Chat Mode (Optional)**  

## Dependencies:
- `OpenAI` for accessing a **deepseek-chat** model  
- `HuggingFace` embeddings for **vector search**  
- `FAISS` for efficient **similarity search**  
- `Pandas` for **data manipulation**  

---

In [None]:
# ==============================================
# Emotion Classification using RAG and OpenAI
# ==============================================

# Import necessary libraries
import os
import json
import hashlib
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import ast
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import torch

# --------------------------
# 1. Initialize OpenAI Client
# --------------------------
# Read the DeepSeek API key from the environment so it never has to be pasted
# into (and saved with) the notebook. An unset variable yields '', which is the
# value this cell used before.
api_key = os.environ.get('DEEPSEEK_API_KEY', '')
if not api_key:
    print("Warning: DEEPSEEK_API_KEY is not set; API requests will fail to authenticate.")
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# ------------------------------
# 2. Setup Cache Directory
# ------------------------------
# Directory that holds one JSON file per cached model response
CACHE_DIR = Path('prompt_cache')
CACHE_DIR.mkdir(exist_ok=True)

# Run the embedding model on the GPU when CUDA is usable, otherwise on the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# ------------------------------
# 3. Load Sentence Transformer Embeddings
# ------------------------------
# Multilingual sentence-embedding model, placed on the device chosen above
embeddings = HuggingFaceEmbeddings(
    model_kwargs={'device': device},
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# ------------------------------
# 4. Caching Mechanism
# ------------------------------
def get_cache_key(text, system_prompt):
    """Derive the cache file stem for a (text, system prompt) pair.

    The two values are joined with a ``||`` separator, encoded as UTF-8 and
    hashed with MD5.

    Returns:
        str: The 32-character hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update(f"{text}||{system_prompt}".encode())
    return hasher.hexdigest()

def get_cached_response(cache_key):
    """Return the cached response stored under ``cache_key``, or None.

    Args:
        cache_key: File stem of the cache entry inside ``CACHE_DIR``.

    Returns:
        The decoded JSON value, or None when no entry exists or the entry
        cannot be decoded.
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A truncated or corrupt file (e.g. from an interrupted run) is treated
        # as a cache miss so the entry gets regenerated instead of failing on
        # every later lookup.
        return None

def cache_response(cache_key, response):
    """Store ``response`` as JSON under ``cache_key`` inside ``CACHE_DIR``.

    Args:
        cache_key: File stem of the cache entry.
        response: JSON-serialisable value to store.

    Raises:
        TypeError: If ``response`` is not JSON-serialisable (nothing is written).
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    # Serialise before touching the disk so a failure cannot leave a partial file.
    payload = json.dumps(response)
    tmp_file = CACHE_DIR / f"{cache_key}.json.tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        f.write(payload)
    # Swap the finished file into place in one step; an interrupted run leaves
    # at most a stray .tmp file, never a half-written cache entry.
    tmp_file.replace(cache_file)

# ------------------------------
# 5. Emotion Intensity Levels
# ------------------------------
# Intensity labels keyed by numeric score: the position of each label in the
# tuple is its score (0-3). A fifth level ("very high" = 4) is not enabled.
EMOTION_LEVELS = dict(enumerate(("none", "low", "moderate", "high")))

# ------------------------------
# 6. Create FAISS Vector Store
# ------------------------------
def create_vector_store(csv_path):
    """Build a FAISS index over the ``text`` column of a labelled CSV.

    Args:
        csv_path: Path of a CSV containing a ``text`` column and one column
            per emotion (joy, fear, anger, sadness, disgust, surprise).

    Returns:
        The store produced by ``FAISS.from_texts``; every entry carries its
        emotion values under the ``emotions`` metadata key.
    """
    frame = pd.read_csv(csv_path)
    emotion_names = ('joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise')

    # One metadata record per row, in the same order as the texts
    metadatas = [
        {'emotions': {name: record[name] for name in emotion_names}}
        for _, record in frame.iterrows()
    ]

    return FAISS.from_texts(frame['text'].tolist(), embeddings, metadatas=metadatas)

# ------------------------------
# 7. Retrieve Similar Examples
# ------------------------------
def get_similar_examples(vector_store, query_text, k=5):
    """Format the ``k`` closest stored examples as few-shot prompt text.

    Each hit contributes an ``Input i:`` line with the stored text and an
    ``Output i:`` line listing every emotion with its intensity label
    (``unknown`` when the stored value has no entry in ``EMOTION_LEVELS``).
    Numbering starts at 0.

    Returns:
        str: The concatenated lines, or an empty string when there are no hits.
    """
    hits = vector_store.similarity_search_with_score(query_text, k=k)

    lines = []
    for position, (document, _score) in enumerate(hits):
        # Translate the stored numeric scores into their textual labels
        labelled = [
            f"{name}: {EMOTION_LEVELS.get(level, 'unknown')}"
            for name, level in document.metadata['emotions'].items()
        ]
        lines.append(f"Input {position}: {document.page_content}\n")
        lines.append(f"Output {position}: [{', '.join(labelled)}]\n")

    return ''.join(lines)

# ------------------------------
# 8. Construct System Prompt
# ------------------------------
def get_combined_prompt(examples):
    """Return the system prompt: fixed task instructions followed by ``examples``.

    The instructions ask for an ``Explanation:`` section and a
    ``Final Classification:`` list; ``examples`` is appended verbatim after
    the last instruction line.
    """
    instructions = """You are an emotion classification expert. Your task has two parts:

1. First, analyze the text and explain why certain emotions are present or absent, and provide an intensity level (from none to high) for each emotion.
   Consider these emotions: joy, fear, anger, sadness, disgust, surprise

   Your analysis should:
   - Provide specific evidence from the text
   - Consider both explicit words and contextual implications
   - Be objective and clear
   - Assign an intensity level for each emotion: none, low, moderate, high

2. Then, provide your final classification by listing all detected emotions along with their intensity levels.
   Use only these emotions: joy, fear, anger, sadness, disgust, surprise, and the intensity levels: none, low, moderate, high.

IMPORTANT:
- The examples below are provided to help guide your reasoning. They contain insights and annotations from experts who labeled the dataset. Pay close attention to how emotions and their intensities were derived in these examples, and use this understanding to inform your own analysis.

Format your response EXACTLY as follows:

Explanation:
(Your detailed analysis here, including why each emotion is assigned its specific intensity level)

Final Classification:
[emotion1: intensity1, emotion2: intensity2, ...]

Here are some similar examples to help guide your analysis:
"""
    return f"{instructions}{examples}"

# ------------------------------
# 9. Process Emotions using RAG
# ------------------------------
def process_emotions(text, vector_store, system_prompt=None):
    """Classify the emotions in ``text`` with the chat model; return labels and reasoning.

    Args:
        text: The text to analyse.
        vector_store: Store handed to ``get_similar_examples`` to fetch
            few-shot examples. Only consulted when ``system_prompt`` is None.
        system_prompt: Optional ready-made system prompt. When omitted, one is
            built by ``get_combined_prompt`` from the retrieved examples.

    Returns:
        dict: ``{'emotions': {emotion: intensity}, 'reasoning': str}``.
        ``emotions`` is ``{'none': 'none'}`` when nothing could be parsed.
        This function does not raise: on any exception the same fallback is
        returned with ``reasoning`` set to ``"Error: <message>"``.
    """
    try:
        if system_prompt is None:
            # Retrieval is only needed when the prompt has to be built here.
            examples = get_similar_examples(vector_store, text)
            system_prompt = get_combined_prompt(examples)

        cache_key = get_cache_key(text, system_prompt)
        cached_response = get_cached_response(cache_key)

        if cached_response:
            return cached_response

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Analyze this text and provide both the evidence analysis and final classification: {text}"}
            ],
            max_tokens=1000,
            temperature=0
        )

        # The message content may be None; treat that as an empty answer.
        response_text = (response.choices[0].message.content or "").strip()

        # Split on the LAST marker so an explanation that itself mentions
        # "Final Classification:" does not invalidate the whole answer.
        explanation, marker, classification = response_text.rpartition("Final Classification:")
        if not marker:
            # Unusable answers are returned but deliberately not cached.
            return {'emotions': {'none': 'none'}, 'reasoning': response_text}

        reasoning = explanation.replace("Explanation:", "").strip()
        pred_emotions = _parse_emotion_list(classification) or {'none': 'none'}

        result = {'emotions': pred_emotions, 'reasoning': reasoning}
        cache_response(cache_key, result)
        return result

    except Exception as e:
        # Best effort: one failing text must not abort a whole batch run.
        return {'emotions': {'none': 'none'}, 'reasoning': f"Error: {str(e)}"}


def _parse_emotion_list(classification):
    """Parse ``[emotion: intensity, ...]`` from model output into a lower-cased dict.

    Items without a colon (for example a literal ``...``) are skipped instead
    of aborting the whole parse. Returns an empty dict when no bracketed list
    is found.
    """
    start = classification.find('[')
    end = classification.find(']', start + 1)
    if start == -1 or end == -1:
        return {}

    parsed = {}
    for item in classification[start + 1:end].split(','):
        emotion, colon, intensity = item.partition(':')
        emotion = emotion.strip().lower()
        if colon and emotion:
            parsed[emotion] = intensity.strip().lower()
    return parsed

# ------------------------------
# 10. Batch Processing for CSV
# ------------------------------
def classify_emotions_batch(input_csv, output_csv, training_csv):
    """Classify every row of ``input_csv`` and write the annotated copy to ``output_csv``.

    Args:
        input_csv: Path of a CSV with a ``text`` column.
        output_csv: Destination path for the results.
        training_csv: Labelled CSV used to build the retrieval vector store.

    The output holds the input columns plus one integer column per emotion
    (0-3, with 0 for missing or unrecognised intensities) and a ``reasoning``
    column. Any exception is caught and printed rather than raised.
    """
    label_names = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise']
    score_of = {'none': 0, 'low': 1, 'moderate': 2, 'high': 3}

    try:
        results = pd.read_csv(input_csv).copy()

        print("Creating vector store from training data...")
        store = create_vector_store(training_csv)

        # Start every row at "none" with an empty explanation
        for name in label_names:
            results[name] = 0
        results['reasoning'] = ''

        print("Processing texts...")
        for row_id, record in tqdm(results.iterrows(), total=len(results)):
            outcome = process_emotions(record['text'], store)
            predicted = outcome.get('emotions', {'none': 'none'})

            # Labels outside the six known emotions are ignored
            for name, level in predicted.items():
                if name in label_names:
                    results.at[row_id, name] = score_of.get(level, 0)

            results.at[row_id, 'reasoning'] = outcome.get('reasoning', '')

        results.to_csv(output_csv)
        print(f"Results saved to {output_csv}")

    except Exception as e:
        print(f"Error: {e}")

# ------------------------------
# 11. Run Classification
# ------------------------------
# Paths for the `chn` files of track B: the training file feeds the vector
# store, the dev file is classified and the result is written next to it.
training_csv = './track_b/train/chn.csv'
test_file = './track_b/dev/chn.csv'
test_output = './track_b/dev/chn_result.csv'

print(f"Processing test file: {test_file}")
classify_emotions_batch(test_file, test_output, training_csv)


# Emotion Classification using Reasoning
## Overview
This notebook implements an emotion classification pipeline using in-context learning. The goal is to classify emotions in text and assign intensity levels.

In [None]:
# ==============================================
# Emotion Classification using RAG and OpenAI
# ==============================================

# Import necessary libraries
import os
import json
import hashlib
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import ast
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import torch

# --------------------------
# 1. Initialize OpenAI Client
# --------------------------
# Read the DeepSeek API key from the environment so it never has to be pasted
# into (and saved with) the notebook. An unset variable yields '', which is the
# value this cell used before.
api_key = os.environ.get('DEEPSEEK_API_KEY', '')
if not api_key:
    print("Warning: DEEPSEEK_API_KEY is not set; API requests will fail to authenticate.")
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# ------------------------------
# 2. Setup Cache Directory
# ------------------------------
# Directory that holds one JSON file per cached model response
CACHE_DIR = Path('prompt_cache')
CACHE_DIR.mkdir(exist_ok=True)

# Run the embedding model on the GPU when CUDA is usable, otherwise on the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# ------------------------------
# 3. Load Sentence Transformer Embeddings
# ------------------------------
# Multilingual sentence-embedding model, placed on the device chosen above
embeddings = HuggingFaceEmbeddings(
    model_kwargs={'device': device},
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# ------------------------------
# 4. Caching Mechanism
# ------------------------------
def get_cache_key(text, system_prompt):
    """Derive the cache file stem for a (text, system prompt) pair.

    The two values are joined with a ``||`` separator, encoded as UTF-8 and
    hashed with MD5.

    Returns:
        str: The 32-character hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update(f"{text}||{system_prompt}".encode())
    return hasher.hexdigest()

def get_cached_response(cache_key):
    """Return the cached response stored under ``cache_key``, or None.

    Args:
        cache_key: File stem of the cache entry inside ``CACHE_DIR``.

    Returns:
        The decoded JSON value, or None when no entry exists or the entry
        cannot be decoded.
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A truncated or corrupt file (e.g. from an interrupted run) is treated
        # as a cache miss so the entry gets regenerated instead of failing on
        # every later lookup.
        return None

def cache_response(cache_key, response):
    """Store ``response`` as JSON under ``cache_key`` inside ``CACHE_DIR``.

    Args:
        cache_key: File stem of the cache entry.
        response: JSON-serialisable value to store.

    Raises:
        TypeError: If ``response`` is not JSON-serialisable (nothing is written).
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    # Serialise before touching the disk so a failure cannot leave a partial file.
    payload = json.dumps(response)
    tmp_file = CACHE_DIR / f"{cache_key}.json.tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        f.write(payload)
    # Swap the finished file into place in one step; an interrupted run leaves
    # at most a stray .tmp file, never a half-written cache entry.
    tmp_file.replace(cache_file)

# ------------------------------
# 5. Emotion Intensity Levels
# ------------------------------
# Intensity labels keyed by numeric score: the position of each label in the
# tuple is its score (0-3). A fifth level ("very high" = 4) is not enabled.
EMOTION_LEVELS = dict(enumerate(("none", "low", "moderate", "high")))

# ------------------------------
# 6. Create FAISS Vector Store
# ------------------------------
def create_vector_store(csv_path):
    """Build a FAISS index over the ``text`` column of a labelled CSV.

    Args:
        csv_path: Path of a CSV containing a ``text`` column and one column
            per emotion (joy, fear, anger, sadness, disgust, surprise).

    Returns:
        The store produced by ``FAISS.from_texts``; every entry carries its
        emotion values under the ``emotions`` metadata key.
    """
    frame = pd.read_csv(csv_path)
    emotion_names = ('joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise')

    # One metadata record per row, in the same order as the texts
    metadatas = [
        {'emotions': {name: record[name] for name in emotion_names}}
        for _, record in frame.iterrows()
    ]

    return FAISS.from_texts(frame['text'].tolist(), embeddings, metadatas=metadatas)

# ------------------------------
# 7. Retrieve Similar Examples
# ------------------------------
def get_similar_examples(vector_store, query_text, k=5):
    """Format the ``k`` closest stored examples as few-shot prompt text.

    Each hit contributes an ``Input i:`` line with the stored text and an
    ``Output i:`` line listing every emotion with its intensity label
    (``unknown`` when the stored value has no entry in ``EMOTION_LEVELS``).
    Numbering starts at 0.

    Returns:
        str: The concatenated lines, or an empty string when there are no hits.
    """
    hits = vector_store.similarity_search_with_score(query_text, k=k)

    lines = []
    for position, (document, _score) in enumerate(hits):
        # Translate the stored numeric scores into their textual labels
        labelled = [
            f"{name}: {EMOTION_LEVELS.get(level, 'unknown')}"
            for name, level in document.metadata['emotions'].items()
        ]
        lines.append(f"Input {position}: {document.page_content}\n")
        lines.append(f"Output {position}: [{', '.join(labelled)}]\n")

    return ''.join(lines)

# ------------------------------
# 8. Construct System Prompt
# ------------------------------
def get_combined_prompt(examples):
    """Return the fixed system prompt for reasoning-based classification.

    ``examples`` is accepted so the signature matches its caller, but it is
    not inserted into the prompt: the returned text is the same for every
    argument.

    NOTE(review): the prompt still tells the model that "the examples below
    are provided" although none follow - confirm whether that wording is
    intended for this no-retrieval variant.
    """
    prompt = """You are an emotion classification expert. Your task has two parts:

1. First, analyze the text and explain why certain emotions are present or absent, and provide an intensity level (from none to high) for each emotion.
   Consider these emotions: joy, fear, anger, sadness, disgust, surprise

   Your analysis should:
   - Provide specific evidence from the text
   - Consider both explicit words and contextual implications
   - Be objective and clear
   - Assign an intensity level for each emotion: none, low, moderate, high

2. Then, provide your final classification by listing all detected emotions along with their intensity levels.
   Use only these emotions: joy, fear, anger, sadness, disgust, surprise, and the intensity levels: none, low, moderate, high.

IMPORTANT:
- The examples below are provided to help guide your reasoning. They contain insights and annotations from experts who labeled the dataset. Pay close attention to how emotions and their intensities were derived in these examples, and use this understanding to inform your own analysis.

Format your response EXACTLY as follows:

Explanation:
(Your detailed analysis here, including why each emotion is assigned its specific intensity level)

Final Classification:
[emotion1: intensity1, emotion2: intensity2, ...]
"""
    return prompt

# ------------------------------
# 9. Process Emotions using RAG
# ------------------------------
def process_emotions(text, vector_store, system_prompt=None):
    """Classify the emotions in ``text`` with the chat model; return labels and reasoning.

    Args:
        text: The text to analyse.
        vector_store: Store handed to ``get_similar_examples``. Only consulted
            when ``system_prompt`` is None.
        system_prompt: Optional ready-made system prompt. When omitted, the
            retrieved examples are passed to ``get_combined_prompt`` to obtain one.

    Returns:
        dict: ``{'emotions': {emotion: intensity}, 'reasoning': str}``.
        ``emotions`` is ``{'none': 'none'}`` when nothing could be parsed.
        This function does not raise: on any exception the same fallback is
        returned with ``reasoning`` set to ``"Error: <message>"``.
    """
    try:
        if system_prompt is None:
            # Retrieval is only needed when the prompt has to be built here.
            examples = get_similar_examples(vector_store, text)
            system_prompt = get_combined_prompt(examples)

        cache_key = get_cache_key(text, system_prompt)
        cached_response = get_cached_response(cache_key)

        if cached_response:
            return cached_response

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Analyze this text and provide both the evidence analysis and final classification: {text}"}
            ],
            max_tokens=1000,
            temperature=0
        )

        # The message content may be None; treat that as an empty answer.
        response_text = (response.choices[0].message.content or "").strip()

        # Split on the LAST marker so an explanation that itself mentions
        # "Final Classification:" does not invalidate the whole answer.
        explanation, marker, classification = response_text.rpartition("Final Classification:")
        if not marker:
            # Unusable answers are returned but deliberately not cached.
            return {'emotions': {'none': 'none'}, 'reasoning': response_text}

        reasoning = explanation.replace("Explanation:", "").strip()
        pred_emotions = _parse_emotion_list(classification) or {'none': 'none'}

        result = {'emotions': pred_emotions, 'reasoning': reasoning}
        cache_response(cache_key, result)
        return result

    except Exception as e:
        # Best effort: one failing text must not abort a whole batch run.
        return {'emotions': {'none': 'none'}, 'reasoning': f"Error: {str(e)}"}


def _parse_emotion_list(classification):
    """Parse ``[emotion: intensity, ...]`` from model output into a lower-cased dict.

    Items without a colon (for example a literal ``...``) are skipped instead
    of aborting the whole parse. Returns an empty dict when no bracketed list
    is found.
    """
    start = classification.find('[')
    end = classification.find(']', start + 1)
    if start == -1 or end == -1:
        return {}

    parsed = {}
    for item in classification[start + 1:end].split(','):
        emotion, colon, intensity = item.partition(':')
        emotion = emotion.strip().lower()
        if colon and emotion:
            parsed[emotion] = intensity.strip().lower()
    return parsed

# ------------------------------
# 10. Batch Processing for CSV
# ------------------------------
def classify_emotions_batch(input_csv, output_csv, training_csv):
    """Classify every row of ``input_csv`` and write the annotated copy to ``output_csv``.

    Args:
        input_csv: Path of a CSV with a ``text`` column.
        output_csv: Destination path for the results.
        training_csv: Labelled CSV used to build the vector store that is
            handed to ``process_emotions``.

    The output holds the input columns plus one integer column per emotion
    (0-3, with 0 for missing or unrecognised intensities) and a ``reasoning``
    column. Any exception is caught and printed rather than raised.
    """
    label_names = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise']
    score_of = {'none': 0, 'low': 1, 'moderate': 2, 'high': 3}

    try:
        results = pd.read_csv(input_csv).copy()

        print("Creating vector store from training data...")
        store = create_vector_store(training_csv)

        # Start every row at "none" with an empty explanation
        for name in label_names:
            results[name] = 0
        results['reasoning'] = ''

        print("Processing texts...")
        for row_id, record in tqdm(results.iterrows(), total=len(results)):
            outcome = process_emotions(record['text'], store)
            predicted = outcome.get('emotions', {'none': 'none'})

            # Labels outside the six known emotions are ignored
            for name, level in predicted.items():
                if name in label_names:
                    results.at[row_id, name] = score_of.get(level, 0)

            results.at[row_id, 'reasoning'] = outcome.get('reasoning', '')

        results.to_csv(output_csv)
        print(f"Results saved to {output_csv}")

    except Exception as e:
        print(f"Error: {e}")

# ------------------------------
# 11. Run Classification
# ------------------------------
# Paths for the `chn` files of track B: the training file is used to build the
# vector store, the dev file is classified and the result is written next to it.
training_csv = './track_b/train/chn.csv'
test_file = './track_b/dev/chn.csv'
test_output = './track_b/dev/chn_result.csv'

print(f"Processing test file: {test_file}")
classify_emotions_batch(test_file, test_output, training_csv)


# Emotion Classification using Prompting (Baseline)
## Overview
This notebook implements an **emotion classification** pipeline using prompting. It takes text as input, analyzes the emotions present, and assigns an intensity level to each detected emotion.


In [None]:
# ==============================================
# 1. Import Required Libraries
# ==============================================

import os
import json
import hashlib
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import ast
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import torch

# ==============================================
# 2. Initialize OpenAI Client
# ==============================================

# SECURITY: a secret key used to be hardcoded on this line. It must be treated
# as leaked and revoked with the provider. The key is now read from the
# environment so it is never stored in the notebook.
api_key = os.environ.get('DEEPSEEK_API_KEY', '')
if not api_key:
    print("Warning: DEEPSEEK_API_KEY is not set; API requests will fail to authenticate.")

# OpenAI-compatible client pointed at the DeepSeek API endpoint
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

# ==============================================
# 3. Setup Cache for Storing API Responses
# ==============================================

# Cache directory: created on first run, reused afterwards
CACHE_DIR = Path('prompt_cache')
CACHE_DIR.mkdir(exist_ok=True)

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# ==============================================
# 4. Load Sentence Transformer Embeddings
# ==============================================

# Multilingual sentence-embedding model, placed on the device chosen above
embeddings = HuggingFaceEmbeddings(
    model_kwargs={'device': device},
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# ==============================================
# 5. Caching Functions to Store and Retrieve API Responses
# ==============================================

def get_cache_key(text, system_prompt):
    """Derive the cache file stem for a (text, system prompt) pair.

    The two values are joined with a ``||`` separator, encoded as UTF-8 and
    hashed with MD5.

    Returns:
        str: The 32-character hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update(f"{text}||{system_prompt}".encode())
    return hasher.hexdigest()

def get_cached_response(cache_key):
    """Return the cached response stored under ``cache_key``, or None.

    Args:
        cache_key: File stem of the cache entry inside ``CACHE_DIR``.

    Returns:
        The decoded JSON value, or None when no entry exists or the entry
        cannot be decoded.
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None  # No cached response exists
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A truncated or corrupt file (e.g. from an interrupted run) is treated
        # as a cache miss so the entry gets regenerated instead of failing on
        # every later lookup.
        return None

def cache_response(cache_key, response):
    """Store ``response`` as JSON under ``cache_key`` inside ``CACHE_DIR``.

    Args:
        cache_key: File stem of the cache entry.
        response: JSON-serialisable value to store.

    Raises:
        TypeError: If ``response`` is not JSON-serialisable (nothing is written).
    """
    cache_file = CACHE_DIR / f"{cache_key}.json"
    # Serialise before touching the disk so a failure cannot leave a partial file.
    payload = json.dumps(response)
    tmp_file = CACHE_DIR / f"{cache_key}.json.tmp"
    with open(tmp_file, 'w', encoding='utf-8') as f:
        f.write(payload)
    # Swap the finished file into place in one step; an interrupted run leaves
    # at most a stray .tmp file, never a half-written cache entry.
    tmp_file.replace(cache_file)

# ==============================================
# 6. Define Emotion Intensity Levels
# ==============================================

# Intensity labels keyed by numeric score: the position of each label in the
# tuple is its score (0-3). A fifth level ("very high" = 4) is not enabled.
EMOTION_LEVELS = dict(enumerate(("none", "low", "moderate", "high")))

# ==============================================
# 7. Create a FAISS Vector Store from CSV Data
# ==============================================

def create_vector_store(csv_path):
    """Build a FAISS index over the ``text`` column of a labelled CSV.

    Args:
        csv_path: Path of a CSV containing a ``text`` column and one column
            per emotion (joy, fear, anger, sadness, disgust, surprise).

    Returns:
        The store produced by ``FAISS.from_texts``; every entry carries its
        emotion values under the ``emotions`` metadata key.
    """
    frame = pd.read_csv(csv_path)
    emotion_names = ('joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise')

    # One metadata record (the emotion labels) per row, in the same order as the texts
    metadatas = [
        {'emotions': {name: record[name] for name in emotion_names}}
        for _, record in frame.iterrows()
    ]

    return FAISS.from_texts(frame['text'].tolist(), embeddings, metadatas=metadatas)

# ==============================================
# 8. Construct System Prompt for Emotion Analysis
# ==============================================

def get_combined_prompt():
    """Return the fixed system prompt of the prompt-only baseline.

    The prompt names the six emotions and the four intensity levels and asks
    for a single ``Final Classification:`` line followed by a bracketed
    ``emotion: intensity`` list.
    """
    baseline_prompt = """You are an emotion classification expert. Your task is to classify the emotions present in the given text and assign an intensity level (from none to high) for each emotion.

Consider these emotions: joy, fear, anger, sadness, disgust, surprise.

Assign an intensity level for each emotion from the following options: none, low, moderate, high.

Format your response EXACTLY as follows:

Final Classification:
[joy: intensity, fear: intensity, anger: intensity, sadness: intensity, disgust: intensity, surprise: intensity]
"""
    return baseline_prompt

# ==============================================
# 9. Batch Processing for Emotion Classification
# ==============================================

def classify_emotions_batch(input_csv, output_csv, training_csv):
    """Classify every row of ``input_csv`` with the prompt-only baseline and save the scores.

    Args:
        input_csv: Path of a CSV with a ``text`` column.
        output_csv: Destination path for the annotated copy.
        training_csv: Not used by this baseline (no examples are retrieved);
            kept so the call signature stays unchanged.

    The output holds the input columns plus one integer column per emotion
    (0-3, with 0 for missing or unrecognised intensities). A row that fails
    is reported and left at 0; any other exception is caught and printed
    rather than raised.
    """
    try:
        df = pd.read_csv(input_csv)
        df_copy = df.copy()

        emotion_columns = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'surprise']
        for emotion in emotion_columns:
            df_copy[emotion] = 0  # Initialize emotion columns

        # Map intensities to numeric values
        intensity_map = {'none': 0, 'low': 1, 'moderate': 2, 'high': 3}

        # Build the prompt once; it is also part of the cache key so that a
        # changed prompt (or entries written by an older parser under a
        # different key) is never served from the cache.
        system_prompt = get_combined_prompt()

        print("Processing texts...")
        for idx, row in tqdm(df_copy.iterrows(), total=len(df_copy)):
            try:
                text = row['text']

                # Generate cache key to avoid redundant API calls
                cache_key = get_cache_key(text, system_prompt)
                cached_response = get_cached_response(cache_key)

                if cached_response:
                    pred_emotions = cached_response
                else:
                    response = client.chat.completions.create(
                        model="deepseek-chat",
                        messages=[
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": f"Classify emotions in: {text}"}
                        ],
                        max_tokens=1000,
                        temperature=0
                    )

                    # The message content may be None; treat that as empty.
                    response_text = (response.choices[0].message.content or "").strip()
                    # Keep only what follows the (last) marker; without a
                    # marker the whole answer is searched for the list.
                    _, _, classification = response_text.rpartition("Final Classification:")
                    pred_emotions = _parse_emotion_list(classification)
                    if pred_emotions:
                        # Unparseable answers are not cached so a rerun can retry.
                        cache_response(cache_key, pred_emotions)

                # Update emotion intensity columns with numeric values
                for emotion, intensity in pred_emotions.items():
                    if emotion in emotion_columns:
                        df_copy.at[idx, emotion] = intensity_map.get(intensity, 0)

            except Exception as e:
                print(f"Error processing row {idx}: {e}")
                for emotion in emotion_columns:
                    df_copy.at[idx, emotion] = 0  # Default to 0 for all emotions

        df_copy.to_csv(output_csv)
        print(f"Results saved to {output_csv}")

    except Exception as e:
        print(f"Error: {e}")


def _parse_emotion_list(classification):
    """Parse ``[emotion: intensity, ...]`` from model output into a lower-cased dict.

    Items without a colon (for example a literal ``...``) are skipped instead
    of aborting the whole parse. Returns an empty dict when no bracketed list
    is found.
    """
    start = classification.find('[')
    end = classification.find(']', start + 1)
    if start == -1 or end == -1:
        return {}

    parsed = {}
    for item in classification[start + 1:end].split(','):
        emotion, colon, intensity = item.partition(':')
        emotion = emotion.strip().lower()
        if colon and emotion:
            parsed[emotion] = intensity.strip().lower()
    return parsed

# ==============================================
# 10. Run Emotion Classification on Test Data
# ==============================================

# Paths for the `esp` files of track B: the dev file is classified and the
# result is written next to it; the training path is passed through as well.
training_csv = './track_b/train/esp.csv'
test_file = './track_b/dev/esp.csv'
test_output = './track_b/dev/esp_result_without_exm.csv'

print(f"Processing test file: {test_file}")
classify_emotions_batch(test_file, test_output, training_csv)
