In [None]:
pip install transformers datasets torch scikit-learn pandas numpy nltk
pip install openai

In [None]:
import os

import openai

# Take the key from the OPENAI_API_KEY environment variable so the secret is
# not stored in the notebook; the literal is only a placeholder fallback and
# should never be replaced with a real key in a shared file.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxxxxxxx")

# List all your Assistants (useful for looking up the assistant ID to use below)
assistants = openai.beta.assistants.list()
print(assistants)

In [None]:
import json
import os
import time

import openai
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Set OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into the notebook; the literal is only a placeholder fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxxxxxx")

# Define your custom assistant ID (the assistant every run below is sent to)
CUSTOM_ASSISTANT_ID = "xxxxxxxx"

# Read both training workbooks (paths are relative to the working directory)
file1 = "Confidence Score_V1.xlsx"
file2 = "Confidence Score_V2.xlsx"
df1, df2 = (pd.read_excel(workbook) for workbook in (file1, file2))

# Stack the two sheets into one frame with a fresh 0..n-1 index
training_data = pd.concat([df1, df2], ignore_index=True)

# Show the available columns so the text columns used below can be verified
print("Training Data Columns:", training_data.columns)

# Step 1: Combine relevant columns into a single text field for embeddings
def combine_text_columns(row):
    """Merge the question, company response and analyst comment into one string.

    Args:
        row: A record supporting ``.get(key, default)`` (a DataFrame row or a
            dict) with the optional keys ``Question_Text``,
            ``Company_Response`` and ``Analyst_Comments``.

    Returns:
        A single ``"label: value | label: value | label: value"`` string used
        as the text to embed. Absent keys and missing values (``None``/NaN)
        are rendered as empty strings.
    """
    def field(name):
        value = row.get(name, '')
        # Blank spreadsheet cells arrive as NaN; without this check they would
        # be embedded as the literal text "nan" while absent columns give "".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return value

    return (
        f"Question_Text: {field('Question_Text')}"
        f" | Company Response: {field('Company_Response')}"
        f" | Analyst Comment: {field('Analyst_Comments')}"
    )

# Build the per-row text that will be embedded, then store it as a new column
combined_text_series = training_data.apply(combine_text_columns, axis=1)
training_data["combined_text"] = combined_text_series

# Step 2: Compute embeddings for training data
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input passed to the embeddings API.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai.embeddings.create(input=text, model=model)
    first_item = response.data[0]
    return first_item.embedding

# Embed every combined_text value once up front (one API call per row)
print("Precomputing embeddings for training data...")
combined_texts = training_data["combined_text"]
training_data["embedding"] = combined_texts.apply(lambda text: get_embedding(str(text)))

# Function to find relevant training data for a given question
def get_relevant_data(question, training_data, top_n=10):
    """Return the ``top_n`` training rows most similar to ``question``.

    Args:
        question: Text to embed and compare against the training rows.
        training_data: DataFrame with an ``embedding`` column holding one
            embedding vector per row.
        top_n: Maximum number of rows to return.

    Returns:
        The rows of ``training_data`` sorted by descending cosine similarity,
        truncated to ``top_n``; the result includes a ``similarity`` column.

    Side effects:
        Writes/overwrites the ``similarity`` column on ``training_data``
        itself (kept for compatibility with existing callers).
    """
    question_embedding = get_embedding(question)

    # Score every row in a single call instead of one sklearn call per row.
    stored_embeddings = training_data["embedding"].tolist()
    if stored_embeddings:
        scores = cosine_similarity([question_embedding], stored_embeddings)[0]
    else:
        # sklearn rejects an empty matrix; an empty frame simply has no scores.
        scores = []
    training_data["similarity"] = scores

    # Select the top `N` most relevant rows
    ranked = training_data.sort_values("similarity", ascending=False)
    return ranked.head(top_n)

# Load input questions
questions_df = pd.read_excel("Input_Data_GPT_V1.xlsx")

# Step 3: Create a new thread
# NOTE(review): one thread is shared by every question, so each run also sees
# the earlier questions and answers — confirm this is intended.
thread = openai.beta.threads.create()

RESULT_COLUMN = "Confidence Score & Explanation"
POLL_INTERVAL_SECONDS = 2
RUN_TIMEOUT_SECONDS = 600
# Final run states that will never turn into "completed".
TERMINAL_FAILURE_STATUSES = {"failed", "cancelled", "expired", "incomplete"}

# Create the result column up front with object dtype so it can hold both
# text answers and empty cells for runs that produced no response.
if RESULT_COLUMN not in questions_df.columns:
    questions_df[RESULT_COLUMN] = pd.Series(dtype="object")


def _wait_for_run(thread_id, run_id):
    """Poll a run until it reaches a final state and return the run object.

    Raises:
        RuntimeError: the run is waiting for tool outputs, which this script
            never submits, so it could not finish on its own.
        TimeoutError: the run did not finish within RUN_TIMEOUT_SECONDS.
    """
    deadline = time.monotonic() + RUN_TIMEOUT_SECONDS
    while True:
        run_status = openai.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        status = run_status.status
        if status == "completed" or status in TERMINAL_FAILURE_STATUSES:
            return run_status
        if status == "requires_action":
            raise RuntimeError(
                f"Run {run_id} requires tool outputs, which this script does not provide."
            )
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Run {run_id} did not finish within {RUN_TIMEOUT_SECONDS} seconds "
                f"(last status: {status})."
            )
        time.sleep(POLL_INTERVAL_SECONDS)  # Polling interval


def _latest_assistant_text(thread_id, run_id):
    """Return the text of the newest assistant message produced by ``run_id``.

    Message content is a list of typed parts; only the text parts are joined.
    Returns None when the run produced no assistant text.
    """
    messages = openai.beta.threads.messages.list(thread_id=thread_id)
    for msg in messages.data:
        if msg.role != "assistant":
            continue
        # The thread is shared, so skip replies that belong to earlier runs.
        if getattr(msg, "run_id", None) not in (None, run_id):
            continue
        text_parts = [
            part.text.value
            for part in msg.content
            if getattr(part, "type", None) == "text"
        ]
        if text_parts:
            return "\n".join(text_parts)
    return None


# Step 4: Process each question dynamically
responses = []
for idx, row in questions_df.iterrows():
    question_text = str(row["Question_Text"])
    company_response = str(row["Company_Response"])
    analyst_comments = str(row["Analyst_Comments"])

    user_question = (
        f"Evaluate the confidence score for this ESG-related response.\n\n"
        f"**Question:** {question_text}\n"
        f"**Company Response:** {company_response}\n"
        f"**Analyst Comments:** {analyst_comments}\n\n"
        "Provide ONLY the following:\n"
        "1. Confidence Score (0-100)\n"
        "2. Brief Explanation for the score"
    )

    # Retrieve relevant training data for the question
    print(f"Retrieving relevant training data for Question {idx + 1}...")
    relevant_data = get_relevant_data(user_question, training_data, top_n=10)
    relevant_context = relevant_data["combined_text"].tolist()
    relevant_context_text = "\n\n".join(relevant_context)

    # Combine relevant context with the user question
    full_context = f"{relevant_context_text}\n\n{user_question}"

    # Send to assistant
    openai.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=full_context
    )

    # Run the assistant
    run = openai.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=CUSTOM_ASSISTANT_ID
    )

    # Wait for the run to finish; a failed run is reported and leaves the
    # cell empty instead of blocking the rest of the batch forever.
    finished_run = _wait_for_run(thread.id, run.id)
    if finished_run.status == "completed":
        assistant_response = _latest_assistant_text(thread.id, run.id)
        print(f"Received response for Question {idx + 1}.")
    else:
        assistant_response = None
        print(
            f"Run for Question {idx + 1} ended with status "
            f"'{finished_run.status}'; no response recorded."
        )

    # Save response
    responses.append(assistant_response)
    questions_df.loc[idx, RESULT_COLUMN] = assistant_response

    # Save progress incrementally
    questions_df.to_excel("confidence_scores_dynamic.xlsx", index=False)

# Final save
questions_df.to_excel("confidence_scores_dynamic_final.xlsx", index=False)
print("Processing completed. Results saved to 'confidence_scores_dynamic_final.xlsx'.")
