##### First, load the XML export. From it we can parse all the metadata of our papers; in particular, we need each paper's title and abstract for the prompt.

**TODO: change the path below to your directory where the xml is stored**

In [60]:

# Location of the XML export that the next cell parses; adjust it to your local directory layout.
xml_file_path = '../data/Task2_3_combined/NBER_working_papers.xml'

In [61]:
from openai import OpenAI
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Parse the XML export; `root` is reused by the extraction cell further down.
tree = ET.parse(xml_file_path)
root = tree.getroot()

# find() returns only the first matching element, or None when the tag is absent,
# so guard the .text access instead of raising AttributeError on an unexpected file.
first_title_node = root.find('.//atl')
first_abstract_node = root.find('.//ab')
title = first_title_node.text if first_title_node is not None else None
abstract = first_abstract_node.text if first_abstract_node is not None else None

##### Below we extract the sections of the xml and create a dataframe

In [62]:
import pandas as pd
import xml.etree.ElementTree as ET

# Extract relevant data
def _find(parent, path):
    """Return the first element matching `path` under `parent`, or None if `parent` or the match is missing."""
    return parent.find(path) if parent is not None else None


def _attr(element, name):
    """Return attribute `name` of `element`, or None if `element` is missing."""
    return element.get(name) if element is not None else None


def _text(parent, path):
    """Return the text of the first element matching `path` under `parent`, or None if either is missing."""
    node = _find(parent, path)
    return node.text if node is not None else None


def _joined_text(parent, path, sep=', '):
    """Join the texts of all elements matching `path` with `sep`; None when there is nothing to join."""
    if parent is None:
        return None
    # Elements without text would make str.join raise, so they are skipped.
    texts = [node.text for node in parent.findall(path) if node.text is not None]
    return sep.join(texts) if texts else None


data = []
for rec in root.findall('rec'):
    header = rec.find('header')

    # Any of these containers may be absent in a record; the helpers above
    # propagate None instead of raising AttributeError.
    control_info = _find(header, 'controlInfo')
    pubinfo = _find(control_info, 'pubinfo')
    artinfo = _find(control_info, 'artinfo')
    display_info = _find(header, 'displayInfo')

    # One flat dict per record; the key order determines the DataFrame column order.
    article_data = {
        'resultID': rec.get('resultID'),
        'shortDbName': _attr(header, 'shortDbName'),
        'longDbName': _attr(header, 'longDbName'),
        'uiTerm': _attr(header, 'uiTerm'),
        'jtl': _text(control_info, './jinfo/jtl'),
        'issn': _text(control_info, './jinfo/issn'),
        'publication_date': _text(pubinfo, 'dt'),
        'volume': _text(pubinfo, 'vid'),
        'issue': _text(pubinfo, 'iid'),
        'doi': _text(artinfo, 'ui'),
        'ppct': _text(artinfo, 'ppct'),
        'pages': _text(artinfo, 'pages'),
        'title': _text(artinfo, './tig/atl'),
        'authors': _joined_text(artinfo, './aug/au'),
        'affiliations': _joined_text(artinfo, './aug/affil'),
        'subjects': _joined_text(artinfo, './su'),
        'abstract': _text(artinfo, 'ab'),
        'publication_type': _text(artinfo, 'pubtype'),
        'language': _text(control_info, './language'),
        'url': _text(display_info, './pLink/url'),
    }

    data.append(article_data)

# Create DataFrame
df = pd.DataFrame(data)

##### View data frame

In [63]:
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,resultID,shortDbName,longDbName,uiTerm,jtl,issn,publication_date,volume,issue,doi,ppct,pages,title,authors,affiliations,subjects,abstract,publication_type,language,url
0,1,ecn,EconLit,2098011,,,20240101,,,,,,A Supply Curve for Forest-Based CO2 Removal,"Franklin, Sergio L., Jr., Pindyck, Robert S.","Unlisted, Unlisted",Mathematical Methods; Programming Models; Math...,Forestation is viewed as an important means of...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
1,2,ecn,EconLit,2098010,,,20240101,,,,,,New Area- and Population-based Geographic Cros...,"Ferrara, Andreas, Testa, Patrick A., Zhou, Liyang","Unlisted, Unlisted, Unlisted",Development of the Discipline: Historiographic...,"In applied historical research, geographic uni...",Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
2,3,ecn,EconLit,2098009,,,20240101,,,,,,Decarbonizing Aviation: Cash-for-Clunkers in t...,"Brueckner, Jan K., Kahn, Matthew E., Nickelsbu...","Unlisted, Unlisted, Unlisted",Climate; Natural Disasters and Their Managemen...,The durability of the transportation capital s...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
3,4,ecn,EconLit,2098008,,,20240101,,,,,,The Multigenerational Impact of Children and C...,"Karademir, Sencer, Laliberte, Jean-William P.,...","Unlisted, Unlisted, Unlisted",Fiscal Policies and Behavior of Economic Agent...,This paper examines the multigenerational impa...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
4,5,ecn,EconLit,2098007,,,20240101,,,,,,Inequality Within Countries is Falling: Underr...,"Pinkovskiy, Maxim, Sala-i-Martin, Xavier, Chat...","Unlisted, Unlisted, Unlisted, Unlisted","Survey Methods; Sampling Methods, Personal Inc...",Household surveys suffer from persistent and g...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576,2577,ecn,EconLit,1965085,,,20220101,,,,,,Global Innovation and Knowledge Diffusion,"Lind, Nelson, Ramondo, Natalia","Unlisted, Unlisted","Trade: General, Innovation; Research and Devel...",We develop a Ricardian model of trade in which...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
2577,2578,ecn,EconLit,1965084,,,20220101,,,,,,"Macroeconomic Research, Present and Past","Glandon, Philip J., Kuttner, Kenneth, Mazumder...","Unlisted, Unlisted, Unlisted, Unlisted",Role of Economics; Role of Economists; Market ...,How is macroeconomic research conducted and wh...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
2578,2579,ecn,EconLit,1965083,,,20220101,,,,,,Revealing Corruption: Firm and Worker Level Ev...,"Colonnelli, Emanuele, Lagaras, Spyridon, Ponti...","Unlisted, Unlisted, Unlisted, Unlisted, Unlisted",Bureaucracy; Administrative Processes in Publi...,We study how the disclosure of corrupt practic...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...
2579,2580,ecn,EconLit,1965082,,,20220101,,,,,,Debt as Safe Asset,"Brunnermeier, Markus K., Merkel, Sebastian A.,...","Unlisted, Unlisted, Unlisted","Financial Markets and the Macroeconomy, Portfo...",The price of a safe asset reflects not only th...,Working Paper,English,https://search.ebscohost.com/login.aspx?direct...


##### Now we create a .jsonl file with our requests

**TODO: on the last line of the code chunk below change the path to where the .jsonl should be created**

**TODO: change prompt if needed**

In [64]:
import json

# Writes one batch API request per paper to a .jsonl file (the output path is set in the call at the end of this cell)

def create_batch_jsonl(df, file_path, model="gpt-3.5-turbo-0125", temperature=0.2,
                       max_tokens=10, system_prompt="You are a helpful research assistant."):
    """
    Create a .jsonl file for batching API requests.

    One JSON object is written per DataFrame row, each on its own line. The
    request's ``custom_id`` is ``request-<index label + 1>``, so the index
    labels are expected to be integers.

    :param df: Pandas DataFrame containing 'title' and 'abstract' columns
    :param file_path: Path to save the output .jsonl file
    :param model: Model name placed in each request body
    :param temperature: Sampling temperature placed in each request body
    :param max_tokens: Completion token limit placed in each request body
    :param system_prompt: Content of the system message sent with every request
    """
    # Explicit encoding so the written file does not depend on the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        for idx, row in df.iterrows():
            title = row['title']
            abstract = row['abstract']
            # Missing values are interpolated as their string form (e.g. "None").
            prompt = f"""Based on the title and abstract of the following research paper, evaluate its relevance to the topic "linking environmental changes to health" with the following three criteria. First, the paper should explicitly establish a connection between specific environmental changes (such as pollution, climate change, deforestation, etc.) and specific health outcomes (such as respiratory diseases, mental health, etc.). Secondly, you should prefer studies that demonstrate or hypothesize a direct impact or correlation between the environmental change and health. Thirdly, avoid papers that only discuss environmental changes or health outcomes in isolation. Both elements should be central to the study. 
            Your task is to rate the paper on a scale from 0 (no relevance) to 10 (highly relevant), focusing on papers with a clear, direct link between environmental changes and health outcomes as highly relevant. Keep the three criteria in mind while providing the rating. Provide only a single number (from 0 to 10) without any additional explanation. 
            Title: {title} 
            Abstract: {abstract}"""

            request_data = {
                "custom_id": f"request-{idx+1}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": temperature,
                    "max_tokens": max_tokens
                }
            }
            # json.dumps converts the dictionary into a JSON-formatted string (one request per line)
            f.write(json.dumps(request_data) + '\n')

# change path here
create_batch_jsonl(
    df=df,
    file_path='../data/Task2_3_combined/NBER_workingpapers_api_batch_requests.jsonl',
)

##### API credentials

In [None]:
from openai import OpenAI
import json
from openai import OpenAI
from dotenv import load_dotenv
import os

# Read the credentials from the dotenv file into the process environment,
# then hand them to the OpenAI client.
env_path = "../keys.env"
load_dotenv(dotenv_path=env_path)

api_key, project, organization = (
    os.getenv(variable) for variable in ("API_KEY", "PROJECT", "ORGANIZATION")
)

client = OpenAI(api_key=api_key, organization=organization, project=project)


In [None]:
client.batches.list() 
# When returning after the batch jobs have finished, look through this list to
# find the appropriate batch id if it was not saved.
# Alternatively, just save the batch id when the batch is created.

##### Upload batch file

**TODO: set the path below to the .jsonl batch request file created above**

In [66]:
# Upload the request file for batch processing. The `with` block guarantees the
# file handle is closed once the upload call returns (or raises).
with open("../data/Task2_3_combined/NBER_workingpapers_api_batch_requests.jsonl", "rb") as batch_requests_file:
    batch_input_file = client.files.create(
      file=batch_requests_file,
      purpose="batch"
    )

# Keep the uploaded file's id; it is what the batch creation call takes as input.
batch_input_file_id = batch_input_file.id
print(batch_input_file_id)

file-9voFfTrX4ERIXzACsmMVCPgy


##### Create batch and create log file

In [67]:
import os
from datetime import datetime

# Submit the uploaded request file as a batch job against the chat-completions
# endpoint, then show the id of the newly created batch.
batch = client.batches.create(
    metadata={"description": "evaluating research papers from econlit"},
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
)
print(batch.id)


batch_67111913b8a4819097cd4ee40e41eb87


In [68]:
# save batch.id in a log
log_dir = '../data/logs'
# Create the log directory on first use; otherwise open() below raises FileNotFoundError.
os.makedirs(log_dir, exist_ok=True)

# One log file per run, named after the current local time.
current_time = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
log_filename = f'{current_time}.txt'
log_path = os.path.join(log_dir, log_filename)

with open(log_path, 'w', encoding='utf-8') as log_file:
    log_file.write(batch.id + '\n')

print(f"Log file created: {log_path}")


# Keep the id in a plain variable for the status/result cells that follow.
batch_id = batch.id

Log file created: ../data/logs\17-10-2024_16-03-03.txt


##### Get status

In [75]:
# Important: .retrieve() takes the batch id (batch.id), not the id of the uploaded input file (batch_input_file.id).
client.batches.retrieve(batch_id).status


'completed'

##### Get output file id

In [76]:
# Look up the id of the file that holds the batch results and print it.
# NOTE(review): presumably this is only populated once the batch has completed — confirm against the API docs.
output_file_id = client.batches.retrieve(batch_id).output_file_id
print(output_file_id)

file-PqfFVPkMsi8j2cnmcY4A6Grj


**TODO: in the chunk below change output_path to path of directory where the output should be created in .jsonl format**

In [77]:

# Download the batch results as text (one JSON document per line).
output_file = client.files.content(output_file_id).text

output_path = '../data/Task2_3_combined/NBER_workingpapers_batch-output.jsonl'
with open(output_path, 'w', encoding='utf-8') as output_handle:
    for line in output_file.splitlines():
        # Skip blank lines; json.loads('') would raise and abort the export.
        if not line.strip():
            continue
        # Round-trip through json so every written line is validated JSON.
        json_line = json.loads(line)
        output_handle.write(json.dumps(json_line) + '\n')

print(f"Output has been written to {output_path}")

Output has been written to ../data/Task2_3_combined/NBER_workingpapers_batch-output.jsonl
