In [1]:
import pandas as pd
from openai import OpenAI
import os
import json
from dotenv import load_dotenv


# Constants
# Excel workbook with the papers to screen; process_papers() reads its
# 'Title' and 'Abstract' columns.
INPUT_FILE = 'dublette_cleaned_results_final.xlsx'
# Workbook the annotated table is written to by process_papers().
OUTPUT_FILE = 'processed_results.xlsx'
# Chat model name passed to every classification request.
OPENAI_MODEL = 'gpt-4o'
MAX_ROWS = 10  # Number of entries to process
# Must run before the os.getenv() lookup below so that variables defined in a
# .env file (python-dotenv's default source) are visible to it.
load_dotenv()
# Instantiate the OpenAI client
# The key is looked up under OPENAI_APIKEY (no underscore before KEY);
# os.getenv() returns None when that variable is not set.
client = OpenAI(api_key=os.getenv('OPENAI_APIKEY'))  # NOTE(review): with api_key=None the SDK presumably falls back to its own default variable (OPENAI_API_KEY) - confirm

# Prompt Template
# Rendered with str.format() in categorize_paper(): {Title} and {Abstract} are
# the two placeholders. The doubled braces around the JSON example are
# str.format() escapes and come out as single literal braces in the prompt.
PROMPT_TEMPLATE = '''
You are helping to categorize the potential value of a paper by its title + abstract. These are the research questions that potentially shall be answered in the literature review:
Q1: What requirements should AI-based solutions for efficient knowledge management/retrieval in (the corporate context) fulfill? (can be from the users or any stakeholders perspective, but also generally or technically, at best it should be conversational solutions or retrieval from internal knowledge)
Q2: Which design principles can be derived from such requirements to achieve such implementations successfully?

Title: {Title}
Abstract: {Abstract}

Now be critical and try to categorize the paper in one of the following stages:
NOT_RELEVANT -> Paper probably can't answer any of the topics. Or it's a too specific use case that can't be transferred.
MAYBE_RELEVANT -> Paper might answer any of the topics. It's worth taking a closer look.
RELEVANT -> Paper seems to have potential to address any topics of the research.
NOT_FOUND -> No abstract was found / no successful request or response

Please try to think as a researcher; it is okay to be critical and to discard papers as not relevant.

Your answer should be in plain json format to directly process it, like:
{{
"CATEGORY": "RELEVANT",
"REASON": "your reason in one or two sentences"
}}
'''

# Function to process each paper
def categorize_paper(title: str, abstract: str):
    """Classify one paper's relevance from its title and abstract.

    The title and abstract are inserted into ``PROMPT_TEMPLATE`` and sent as a
    single user message to ``OPENAI_MODEL``; the JSON reply is expected to
    carry the keys ``CATEGORY`` and ``REASON``.

    Args:
        title: Paper title, inserted into the prompt as given.
        abstract: Paper abstract. A missing value (``pd.isna``), blank text or
            the literal ``NOT_FOUND`` (any case, surrounding whitespace
            ignored) skips the API call. Non-string cell values are converted
            with ``str()`` for this check instead of raising.

    Returns:
        A ``(category, reason)`` tuple. ``category`` is the reply's
        ``CATEGORY`` value, or ``'NOT_FOUND'`` when there is no usable
        abstract, the request fails, or the reply cannot be parsed as a JSON
        object. ``reason`` is the reply's ``REASON`` value (``''`` if absent)
        or a short description of the failure.

    This function does not raise for request or parsing failures; they are
    reported through the returned tuple so a batch run can continue.
    """
    missing = 'No abstract was found / no successful request or response.'

    # Check for NaN/None first: only then is it safe to look at the text.
    if pd.isna(abstract):
        return 'NOT_FOUND', missing
    # str() guards against numeric or other non-string spreadsheet cells,
    # which have no .strip() and previously aborted the whole run.
    abstract_text = str(abstract).strip()
    if abstract_text == '' or abstract_text.upper() == 'NOT_FOUND':
        return 'NOT_FOUND', missing

    prompt = PROMPT_TEMPLATE.format(Title=title, Abstract=abstract)

    # Stage 1: the request itself. The broad catch is deliberate: any failure
    # here is recorded for this paper only and must not stop the batch.
    try:
        response = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            n=1,
            temperature=0.0,
            response_format={"type": "json_object"}
        )
        reply = response.choices[0].message.content
    except Exception as e:
        reason = f'Error during API call: {str(e)}'
        print(reason)
        return 'NOT_FOUND', reason

    # Stage 2: parsing. Kept separate so a malformed or empty reply is not
    # mislabelled as an API error. TypeError covers a reply that is None.
    try:
        result = json.loads(reply)
    except (json.JSONDecodeError, TypeError):
        return 'NOT_FOUND', 'Could not parse the response as JSON.'

    # Valid JSON that is not an object (list, string, number) has no .get().
    if not isinstance(result, dict):
        return 'NOT_FOUND', 'Could not parse the response as JSON.'

    return result.get('CATEGORY', 'NOT_FOUND'), result.get('REASON', '')

# Main processing function
def process_papers():
    """Classify the first ``MAX_ROWS`` papers of ``INPUT_FILE`` and save the table.

    Reads the workbook, prepends the columns ``GPT_STAGING`` and ``REASON``,
    fills them for the first ``MAX_ROWS`` rows using ``categorize_paper`` and
    writes the complete table to ``OUTPUT_FILE`` without the index. Rows that
    were not processed keep empty strings in the two new columns. One progress
    line is printed per processed paper.

    The input must provide the columns ``Title`` and ``Abstract``.
    NOTE(review): ``DataFrame.insert`` rejects a column name that already
    exists, so feeding a previously produced output file back in will fail -
    confirm whether re-runs on annotated files need to be supported.
    """
    # Load the data
    df = pd.read_excel(INPUT_FILE)

    # Insert new columns at the beginning
    df.insert(0, 'GPT_STAGING', '')
    df.insert(1, 'REASON', '')

    # The workbook may contain fewer than MAX_ROWS rows, so progress is
    # reported against the number of rows actually selected.
    batch = df.head(MAX_ROWS)
    total = len(batch)

    # Process each paper; `position` is a 1-based counter independent of the
    # index labels, which are only used to address the row being written.
    for position, (idx, row) in enumerate(batch.iterrows(), start=1):
        category, reason = categorize_paper(row['Title'], row['Abstract'])
        df.at[idx, 'GPT_STAGING'] = category
        df.at[idx, 'REASON'] = reason
        print(f'Processed paper {position}/{total}: {category}')

    # Save the results
    df.to_excel(OUTPUT_FILE, index=False)
    print(f'\nProcessing complete. Results saved to {OUTPUT_FILE}')


In [2]:
# Process the papers: classify the first MAX_ROWS rows of INPUT_FILE and
# write the annotated table to OUTPUT_FILE (one progress line per paper).
process_papers()

Processed paper 1/10: NOT_RELEVANT
Processed paper 2/10: MAYBE_RELEVANT
Processed paper 3/10: MAYBE_RELEVANT
Processed paper 4/10: NOT_RELEVANT
Processed paper 5/10: NOT_RELEVANT
Processed paper 6/10: NOT_RELEVANT
Processed paper 7/10: MAYBE_RELEVANT
Processed paper 8/10: MAYBE_RELEVANT
Processed paper 9/10: NOT_RELEVANT
Processed paper 10/10: MAYBE_RELEVANT

Processing complete. Results saved to processed_results.xlsx
