In [1]:
import pandas as pd
from openai import OpenAI
import os
import json
from dotenv import load_dotenv

# Load environment variables via python-dotenv before the API key is read below.
load_dotenv()

# Run configuration
MAX_ROWS = 500  # upper bound on how many spreadsheet rows get processed
OPENAI_MODEL = 'gpt-4o'
INPUT_FILE = 'dublette_cleaned_results_final.xlsx'
OUTPUT_FILE = 'processed_results_detailed.xlsx'

# Shared OpenAI client; the key is looked up under the OPENAI_APIKEY variable
# (the lookup yields None when that variable is not set).
client = OpenAI(api_key=os.environ.get('OPENAI_APIKEY'))

# Prompt sent to the model for every paper. It is filled with str.format() in
# categorize_paper(): {Title} and {Abstract} are the only placeholders, and the
# doubled braces around the JSON example render as literal braces. The reply is
# expected to be a JSON object with the keys "CATEGORY" and "REASON".
# NOTE(review): the wording contains a few typos ("one the following",
# "addresses on of the points"). Editing them changes the text the model sees,
# so confirm before touching the string.
PROMPT_TEMPLATE = '''
You are assisting in categorizing the potential value of a paper based on its title and abstract. The objective is to identify papers that contribute to one the following research focus areas:

1. **User or Stakeholder Requirements**: Papers should provide insights into the requirements for AI-based knowledge management/retrieval systems. This includes user needs, stakeholder expectations, or technical necessities like data organization, system performance, UX design, or interaction features. The focus should be on systems that handle **internal knowledge sources** and ideally on conversational or retrieval solutions.

2. **Design Principles and Acceptance Requirements**: Papers should discuss design principles, frameworks, or approaches that support successful implementation of such systems. This includes considerations for usability, system integration, or factors impacting user acceptance. But related to the previous point at best

Evaluate the relevance of each paper critically and categorize it into one of the following stages:

- **NOT_RELEVANT**: The paper does not address the research focus areas. Examples include papers about unrelated use cases, outdated methods, or overly narrow topics without transferable insights.
- **MAYBE_RELEVANT**: The paper potentially addresses one or more research focus areas. It warrants a closer review but lacks clear evidence in the title/abstract.
- **RELEVANT**: The paper clearly aligns with one or both focus areas, providing substantial or promising insights.
- **NOT_FOUND**: No abstract was provided or no successful request/response occurred.

Provide your response in plain JSON format to enable direct processing, including a brief justification. For example:

{{
  "CATEGORY": "RELEVANT",
  "REASON": "The abstract highlights user-centered design principles and discusses AI integration in knowledge retrieval systems."
}}

Be critical in your evaluation. If the paper does not explicitly or implicitly addresses on of the points, it should be classified as NOT_RELEVANT. 
  
Title: {Title}
Abstract: {Abstract}
'''



# Function to process each paper
def categorize_paper(title: str, abstract: str):
    """Classify one paper's relevance from its title and abstract.

    The prompt built from PROMPT_TEMPLATE is sent to the chat model and the
    JSON reply is reduced to a ``(category, reason)`` pair.

    Args:
        title: Paper title. A missing value (NaN/None) is sent as an empty
            string instead of the text "nan".
        abstract: Paper abstract. Missing values, blank text and the marker
            'NOT_FOUND' (any case, surrounding whitespace ignored) return
            immediately without calling the API. Non-string cell values are
            converted with ``str()`` rather than raising.

    Returns:
        tuple: ``(category, reason)``. ``category`` is always one of
        'NOT_RELEVANT', 'MAYBE_RELEVANT', 'RELEVANT' or 'NOT_FOUND'.
        'NOT_FOUND' is used when there is no abstract, the request fails, the
        reply is not a JSON object, or the reply names an unknown category.

    Raises:
        Nothing for request or parsing failures; these are reported through
        the returned reason (request errors are also printed).
    """
    valid_categories = ('NOT_RELEVANT', 'MAYBE_RELEVANT', 'RELEVANT', 'NOT_FOUND')
    no_abstract = ('NOT_FOUND', 'No abstract was found / no successful request or response.')

    if pd.isna(abstract):
        return no_abstract
    # Spreadsheet cells are not guaranteed to be strings (e.g. a numeric cell),
    # so convert before using string methods.
    abstract_text = str(abstract).strip()
    if abstract_text == '' or abstract_text.upper() == 'NOT_FOUND':
        return no_abstract

    title_text = '' if pd.isna(title) else title
    prompt = PROMPT_TEMPLATE.format(Title=title_text, Abstract=abstract)

    # Step 1: the request itself. Only failures here are labelled as API errors.
    try:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            n=1,
            temperature=0.0,
            response_format={"type": "json_object"}
        )
        reply = response.choices[0].message.content
    except Exception as e:
        reason = f'Error during API call: {str(e)}'
        print(reason)
        return 'NOT_FOUND', reason

    # Step 2: parsing. An empty reply (None) raises TypeError; a reply cut off
    # by max_tokens raises JSONDecodeError.
    try:
        result = json.loads(reply)
    except (TypeError, json.JSONDecodeError):
        return 'NOT_FOUND', 'Could not parse the response as JSON.'
    if not isinstance(result, dict):
        return 'NOT_FOUND', 'Could not parse the response as JSON.'

    raw_reason = result.get('REASON', '')
    reason = '' if raw_reason is None else str(raw_reason)

    # Step 3: validation. Tolerate case/whitespace drift, reject anything that
    # is not one of the labels the prompt defines.
    raw_category = result.get('CATEGORY', 'NOT_FOUND')
    category = str(raw_category).strip().upper()
    if category not in valid_categories:
        return 'NOT_FOUND', f'Unexpected category {raw_category!r} in response. {reason}'.strip()

    return category, reason


# Main processing function
def process_papers():
    """Classify the first MAX_ROWS papers of INPUT_FILE and write OUTPUT_FILE.

    Reads the spreadsheet, makes sure the result columns 'GPT_STAGING' and
    'REASON' exist, calls categorize_paper() for each row that has no staging
    value yet, and writes the whole frame to OUTPUT_FILE. The input sheet must
    provide the columns 'Title' and 'Abstract'.

    Rows whose 'GPT_STAGING' cell already holds a value are skipped, so a
    sheet that was partially classified before is not sent to the API again.
    The output file is written even if the loop is interrupted, so completed
    classifications are not lost.

    Returns:
        None. Progress is printed and results are written to OUTPUT_FILE.
    """
    # Load the data
    df = pd.read_excel(INPUT_FILE)

    # Check and add columns if they don't already exist
    for position, column in enumerate(('GPT_STAGING', 'REASON')):
        if column not in df.columns:
            df.insert(position, column, '')
        else:
            print(f"Column '{column}' already exists.")
            # An existing but empty column is loaded as float64; cast it so
            # text labels can be written into it.
            df[column] = df[column].astype(object)

    batch = df.head(MAX_ROWS)
    total = len(batch)  # may be smaller than MAX_ROWS for short sheets

    try:
        # Process each paper
        for position, (idx, row) in enumerate(batch.iterrows(), start=1):
            # Skip if GPT_STAGING is already filled with some value
            staged = row['GPT_STAGING']
            if not pd.isna(staged) and str(staged).strip() != '':
                print(f'Skipped paper {position}/{total}: already staged as {staged}')
                continue

            category, reason = categorize_paper(row['Title'], row['Abstract'])
            df.at[idx, 'GPT_STAGING'] = category
            df.at[idx, 'REASON'] = reason
            print(f'Processed paper {position}/{total}: {category}')
    finally:
        # Save the results (also on interruption, to keep finished rows)
        df.to_excel(OUTPUT_FILE, index=False)

    print(f'\nProcessing complete. Results saved to {OUTPUT_FILE}')


In [2]:
# Run the pipeline: read INPUT_FILE, classify up to MAX_ROWS papers via the
# OpenAI API (one request per paper with an abstract), and write OUTPUT_FILE.
process_papers()

Processed paper 1/500: NOT_RELEVANT
Processed paper 2/500: NOT_RELEVANT
Processed paper 3/500: NOT_RELEVANT
Processed paper 4/500: NOT_RELEVANT
Processed paper 5/500: NOT_RELEVANT
Processed paper 6/500: NOT_RELEVANT
Processed paper 7/500: NOT_RELEVANT
Processed paper 8/500: MAYBE_RELEVANT
Processed paper 9/500: NOT_RELEVANT
Processed paper 10/500: NOT_RELEVANT
Processed paper 11/500: MAYBE_RELEVANT
Processed paper 12/500: NOT_RELEVANT
Processed paper 13/500: NOT_RELEVANT
Processed paper 14/500: NOT_RELEVANT
Processed paper 15/500: NOT_RELEVANT
Processed paper 16/500: NOT_RELEVANT
Processed paper 17/500: NOT_RELEVANT
Processed paper 18/500: NOT_RELEVANT
Processed paper 19/500: NOT_RELEVANT
Processed paper 20/500: NOT_RELEVANT
Processed paper 21/500: NOT_RELEVANT
Processed paper 22/500: NOT_RELEVANT
Processed paper 23/500: NOT_RELEVANT
Processed paper 24/500: NOT_RELEVANT
Processed paper 25/500: NOT_RELEVANT
Processed paper 26/500: NOT_RELEVANT
Processed paper 27/500: NOT_RELEVANT
Proces