<a href="https://colab.research.google.com/github/chk-AI/abstract-screening/blob/main/Six_LLM_assisted_systematic_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#STEP 1: Install the necessary libraries and save secrets (API keys and e-mail)

In [None]:
!pip install biopython
!pip install pandas
!pip install -q openai
!pip install anthropic
!pip install openpyxl

from google.colab import userdata
import csv
import time
import pandas as pd
import json
from Bio import Entrez
from openai import OpenAI
import anthropic


In [None]:
#Save the API keys for OpenAI and Anthropic and the e-mail address for the PubMed API in the Colab secrets (key icon in the left column).
#API keys should be kept secret.

# Name of the Colab secret that holds the OpenAI API key.
openai_api_secret_name = 'OPENAI_API_KEY'
# Read the key from the Colab secret store and build the OpenAI client.
# On any failure the error is printed and re-raised, so the cell stops here.
try:
    OPENAI_API_KEY = userdata.get(openai_api_secret_name)
    client = OpenAI(api_key=OPENAI_API_KEY)
except Exception as e:
    print(f"There was an error initializing OpenAI client with the API key: {e}")
    raise e

# Anthropic API key, read from the Colab secret named 'CLAUDE_API_KEY'.
# NOTE(review): unlike the OpenAI setup above this is not wrapped in try/except,
# so a failure here surfaces as the raw exception without an explanatory message.
anthropic_api_secret_name = 'CLAUDE_API_KEY'
CLAUDE_API_KEY = userdata.get(anthropic_api_secret_name)
anthropic_client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)

# Contact e-mail for Biopython's Entrez module (used for the PubMed queries below),
# read from the Colab secret named 'USER_EMAIL'.
Entrez.email = userdata.get('USER_EMAIL')

#STEP 2: PubMed search

In [None]:
# PubMed search terms and combined search string
# Modify this search string for your systematic review

# Concept 1: central nervous system / brain terms.
search_string_1 = '("Central Nervous System"[Mesh]) OR (brain*[All Fields]) OR (cerebr*[All Fields])'
# Concept 2: imaging terms (diagnostic imaging, CT, MRI).
search_string_2 = '("Diagnostic Imaging"[Mesh] OR CT[All Fields]) OR (MRI[All Fields])'
# Concept 3: deep learning / neural network / artificial intelligence terms.
search_string_3 = '("Deep Learning"[Mesh]) OR ("Neural Networks, Computer"[Mesh]) OR (Neural network*[All Fields]) OR (Convolutional network*[All Fields]) OR (Deep learn*[All Fields]) OR (Artificial Intelligence*[All Fields])'

# Including the date range from January 1, 2017, to April 19, 2024
date_range = '("2017/01/01"[PDAT] : "2024/04/19"[PDAT])'
# Publication-type term for reviews; it is negated (NOT) in the combined string below.
reviews = '(review[Publication Type])'

# Combined search string with filters:
# the three concepts and the date range are ANDed together, and reviews are excluded.
combined_search_string = f"({search_string_1}) AND ({search_string_2}) AND ({search_string_3}) AND ({date_range}) NOT ({reviews})"



In [None]:
#Functions for extracting search results and variables from studies from PubMed
def search_pubmed(query, retmax=10000):
    """Run a PubMed search through Entrez ESearch and return the matching IDs.

    Args:
        query: PubMed search string, passed to ESearch as ``term``.
        retmax: Maximum number of IDs to request from ESearch.

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    A warning is printed when the response reports more hits than IDs were
    returned, because the result set is then silently incomplete.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    try:
        record = Entrez.read(handle, validate=False)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    time.sleep(0.4)  # To prevent overwhelming the server

    id_list = record["IdList"]

    # "Count" is the total number of hits for the query; compare it with what we
    # actually received so a truncated search does not go unnoticed.
    try:
        total_hits = int(record.get("Count", len(id_list)))
    except (TypeError, ValueError):
        total_hits = len(id_list)
    if total_hits > len(id_list):
        print(
            f"Warning: PubMed reports {total_hits} hits but only {len(id_list)} IDs "
            f"were returned (retmax={retmax}). The search results are truncated."
        )

    return id_list

def fetch_details(id_list):
    """Download the full PubMed records for a list of IDs via Entrez EFetch.

    Args:
        id_list: Iterable of PubMed IDs (strings or integers).

    Returns:
        The parsed EFetch XML response. For an empty ``id_list`` no request is
        made and a result with empty ``'PubmedArticle'`` and
        ``'PubmedBookArticle'`` lists is returned instead.
    """
    if not id_list:
        # Nothing to fetch: skip the request (an empty ``id`` is not a valid
        # EFetch query) and hand back an empty result the caller can iterate.
        return {'PubmedArticle': [], 'PubmedBookArticle': []}

    # str() so that integer IDs are accepted as well.
    ids = ','.join(str(pubmed_id) for pubmed_id in id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    try:
        papers = Entrez.read(handle, validate=False)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    time.sleep(0.4)  # To prevent overwhelming the server
    return papers


def _publication_year(pub_date):
    """Return the publication year from a PubDate element, or 'Not available'."""
    year = pub_date.get('Year')
    if year:
        return year
    # Some records have no 'Year' but a free-text 'MedlineDate'
    # (e.g. "2019 Jan-Feb"); take the first four-digit token from it.
    medline_date = str(pub_date.get('MedlineDate', ''))
    for token in medline_date.replace('-', ' ').split():
        if len(token) == 4 and token.isdigit():
            return token
    return 'Not available'


def _author_name(author):
    """Format one AuthorList entry as 'LastName, ForeName' or its collective name."""
    collective_name = author.get('CollectiveName')
    if collective_name and not author.get('LastName'):
        # Group authors have no personal name fields.
        return str(collective_name)
    return f"{author.get('LastName', '')}, {author.get('ForeName', '')}"


def save_initial_search_results(papers, filename='pubmed_initial_results.csv'):
    """Write the fetched PubMed articles to a semicolon-delimited CSV file.

    One row is written per entry in ``papers['PubmedArticle']`` with the columns
    Title, Authors, Journal, Year, PMID, DOI and Abstract. Missing values are
    written as 'Not available'.

    Args:
        papers: Parsed EFetch result containing a ``'PubmedArticle'`` list.
        filename: Path of the CSV file to create (overwritten if it exists).
    """
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(['Title', 'Authors', 'Journal', 'Year', 'PMID', 'DOI', 'Abstract'])
        for paper in papers['PubmedArticle']:
            citation = paper['MedlineCitation']
            article = citation['Article']
            journal_info = article['Journal']

            title = article.get('ArticleTitle', 'Not available')
            pubmed_id = citation['PMID']
            journal = journal_info.get('Title', 'Not available')
            year = _publication_year(journal_info['JournalIssue']['PubDate'])
            authors = ', '.join(_author_name(author) for author in article.get('AuthorList', []))

            if 'Abstract' in article:
                # Abstract sections are joined with spaces; semicolons are
                # replaced so the abstract never contains the CSV delimiter.
                abstract_text = " ".join(article['Abstract'].get('AbstractText', [])).replace(";", ":")
            else:
                abstract_text = 'Not available'

            # The DOI is the article ID whose 'IdType' attribute is 'doi'.
            article_ids = paper.get('PubmedData', {}).get('ArticleIdList', [])
            doi = next(
                (article_id for article_id in article_ids
                 if article_id.attributes.get('IdType') == 'doi'),
                'Not available',
            )

            writer.writerow([title, authors, journal, year, pubmed_id, doi, abstract_text])


In [None]:
# Complete extraction of full search:
# first get the matching PubMed IDs (at most retmax=10000), then download the full records for them.
query_result = search_pubmed(combined_search_string, retmax=10000)
paper_details = fetch_details(query_result)

# Save the initial search results
# (no filename is passed, so the function's default 'pubmed_initial_results.csv' is used)
save_initial_search_results(paper_details)



#STEP 3: Download the randomized sample and annotate it manually

In [None]:
# Take out randomized sample and save as csv-file:
df = pd.read_csv('pubmed_initial_results.csv', sep=';')
# DataFrame.sample raises a ValueError when n is larger than the number of rows
# (sampling without replacement), so cap the sample size for searches that
# returned fewer than 100 records. The fixed random_state keeps the sample reproducible.
df = df.sample(n=min(100, len(df)), random_state=48)

#Save as csv file for later analysis with LLMs
df.to_csv('100_sample.csv', index=False)
#Save as excel file for manual annotation of sample
df.to_excel('100_sample.xlsx', index=False)


Download the sample and annotate it manually before moving on to the next step. The annotations will be used to evaluate which models achieve the highest diagnostic test accuracy for title and abstract screening.

#STEP 4: LLM analysis of the random sample

In [None]:
# Load the saved random sample (the comma-separated '100_sample.csv' written in the sampling step)
df = pd.read_csv('100_sample.csv', sep=',')

In [None]:
#Define prompt for the LLMs.
#The prompt asks one yes/no/NA question per criterion (population, intervention, control, outcome)
#plus an overall include/exclude/uncertain decision; the title and abstract are appended after it by the analysis functions.
llm_prompt = 'Please assess the title and abstract based on the following criteria for inclusion in a systematic review: population: Does the study examine adults with suspected neurological disease(s)? [yes/no/NA]. intervention: Does the study use neural networks or deep learning for brain scan analysis? [yes/no/NA]. control: Does the study compare these methods against a standard reference (radiological report or expert readers)? [yes/no/NA]. outcome:  Does it measure the diagnostic accuracy of the intervention using an external cohort for validation? [yes/no/NA]. decision: Include if all the above criteria are met, exclude if any criteria are not met, uncertain if information is insufficient [include/exclude/uncertain].  Title and abstract:'


def _analyze_abstract_with_openai(model, llm_prompt, title, abstract_text):
    """Send one title/abstract to an OpenAI chat model and return its raw reply.

    Args:
        model: OpenAI model identifier.
        llm_prompt: Screening instructions placed before the title and abstract.
        title: Article title.
        abstract_text: Article abstract.

    Returns:
        The text content of the first completion choice, or the string
        "API request failed" when the request raises. The failure string
        matches what the Claude functions in this file return, so a single
        failed call no longer aborts the analysis of a whole batch.
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            temperature=0.2,
            max_tokens=60,
            # JSON mode: the system message below also asks for JSON explicitly.
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You are a medical researcher analyzing abstracts for a systematic review. Answer in JSON format with the following keys: population, intervention, control, outcome, decision."},
                {"role": "user", "content": f"{llm_prompt} {title} {abstract_text} "}
            ]
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Best effort: report the problem and continue with the next abstract.
        print(f"Error with OpenAI API ({model}): {str(e)}")
        return "API request failed"


def analyze_abstract_with_gpt35T(llm_prompt, title, abstract_text):
    """Screen one title/abstract with GPT-3.5 Turbo (gpt-3.5-turbo-0125)."""
    return _analyze_abstract_with_openai("gpt-3.5-turbo-0125", llm_prompt, title, abstract_text)


def analyze_abstract_with_gpt4T(llm_prompt, title, abstract_text):
    """Screen one title/abstract with GPT-4 Turbo (gpt-4-turbo-2024-04-09)."""
    return _analyze_abstract_with_openai("gpt-4-turbo-2024-04-09", llm_prompt, title, abstract_text)


def analyze_abstract_with_GPT4o(llm_prompt, title, abstract_text):
    """Screen one title/abstract with GPT-4o (gpt-4o-2024-05-13)."""
    return _analyze_abstract_with_openai("gpt-4o-2024-05-13", llm_prompt, title, abstract_text)

def _analyze_abstract_with_claude(model, model_label, llm_prompt, title, abstract_text):
    """Send one title/abstract to an Anthropic Claude model and return its raw reply.

    The role/format instructions are passed through the ``system`` parameter and
    the screening prompt as the user message. Previously both were concatenated
    into one user message without a separator ("...decision.Please assess...").

    Args:
        model: Anthropic model identifier.
        model_label: Short model name used in the error message.
        llm_prompt: Screening instructions placed before the title and abstract.
        title: Article title.
        abstract_text: Article abstract.

    Returns:
        The text of the first content block of the reply, or the string
        "API request failed" when the request raises.
    """
    try:
        message = anthropic_client.messages.create(
            model=model,
            temperature=0.2,
            max_tokens=60,
            system="You are a medical researcher analyzing abstracts for a systematic review. Answer in JSON format with the following keys: population, intervention, control, outcome, decision.",
            messages=[
                {"role": "user", "content": f"{llm_prompt} {title} {abstract_text} "}
            ]
        )
        return message.content[0].text
    except Exception as e:
        # Best effort: report the problem and continue with the next abstract.
        print(f"Error with Claude-3 {model_label} API: {str(e)}")
        return "API request failed"


def analyze_abstract_with_claude3_opus(llm_prompt, title, abstract_text):
    """Screen one title/abstract with Claude 3 Opus (claude-3-opus-20240229)."""
    return _analyze_abstract_with_claude("claude-3-opus-20240229", "opus", llm_prompt, title, abstract_text)


def analyze_abstract_with_claude3_sonnet(llm_prompt, title, abstract_text):
    """Screen one title/abstract with Claude 3 Sonnet (claude-3-sonnet-20240229)."""
    return _analyze_abstract_with_claude("claude-3-sonnet-20240229", "sonnet", llm_prompt, title, abstract_text)


def analyze_abstract_with_claude3_haiku(llm_prompt, title, abstract_text):
    """Screen one title/abstract with Claude 3 Haiku (claude-3-haiku-20240307)."""
    return _analyze_abstract_with_claude("claude-3-haiku-20240307", "haiku", llm_prompt, title, abstract_text)


def parse_json_to_columns(json_string):
    """Turn a model reply into a dict with the five screening keys.

    Args:
        json_string: Raw reply text, expected to contain a JSON object.

    Returns:
        A dict with the keys population, intervention, control, outcome and
        decision. Keys missing from the JSON object get 'NA'. If the input is
        not a string, contains no parseable JSON, or the JSON is not an object,
        every key is set to 'parsing_error'.
    """
    keys = ['population', 'intervention', 'control', 'outcome', 'decision']
    parsing_error = {key: 'parsing_error' for key in keys}

    # None/NaN (e.g. an empty cell read back from CSV) would make json.loads
    # raise TypeError, which the JSONDecodeError handler does not catch.
    if not isinstance(json_string, str):
        print(f"Error parsing JSON: expected a string, got {type(json_string).__name__}")
        return parsing_error

    try:
        data = json.loads(json_string)
    except json.JSONDecodeError as e:
        # Replies are sometimes wrapped in code fences or prose; try to decode
        # the object that starts at the first '{' and ignore whatever follows.
        start = json_string.find('{')
        if start == -1:
            print(f"Error parsing JSON: {e}")
            return parsing_error
        try:
            data, _ = json.JSONDecoder().raw_decode(json_string[start:])
        except json.JSONDecodeError:
            print(f"Error parsing JSON: {e}")
            return parsing_error

    # Valid JSON that is not an object (list, string, number) has no keys to read.
    if not isinstance(data, dict):
        print(f"Error parsing JSON: expected a JSON object, got {type(data).__name__}")
        return parsing_error

    return {key: data.get(key, 'NA') for key in keys}

# Function to split data into batches
def split_data_into_batches(df, batch_size=100):
    """Write consecutive slices of ``df`` to batch_1.csv, batch_2.csv, ...

    Each file holds up to ``batch_size`` rows and is written without the index.
    Returns the number of batch files written.
    """
    # Ceiling division: a final partial batch still counts as a batch.
    batch_count = -(-len(df) // batch_size)
    for batch_number in range(1, batch_count + 1):
        stop = batch_number * batch_size
        rows = df.iloc[stop - batch_size:stop]
        rows.to_csv(f'batch_{batch_number}.csv', index=False)
    return batch_count


def analyze_selected_batches(batches, llm_prompt):
    """Run every model over the selected batch files and save the results.

    For each batch id, batch_<id>.csv is read, one raw-response column is added
    per model, each raw response is parsed into '<column>_<key>' columns, and
    the combined table is written to analyzed_batch_<id>.csv.
    """
    # (raw-response column name, row-level analysis function), in run order.
    model_runs = [
        ('GPT35T', analyze_row_with_gpt35T),
        ('gpt4T', analyze_row_with_gpt4T),
        ('GPT4o', analyze_row_with_GPT4o),
        ('claude3opus', analyze_row_with_claude3_opus),
        ('claude3sonnet', analyze_row_with_claude3_sonnet),
        ('claude3haiku', analyze_row_with_claude3_haiku),
    ]

    for batch_id in batches:
        df = pd.read_csv(f'batch_{batch_id}.csv')

        # One full pass over the batch per model, storing the raw replies.
        for column, analyze_row in model_runs:
            df[column] = df.apply(lambda row: analyze_row(llm_prompt, row), axis=1)

        # Expand each raw reply into one prefixed column per screening key.
        frames = [df]
        for column, _ in model_runs:
            parsed = df[column].apply(parse_json_to_columns).apply(pd.Series)
            frames.append(parsed.rename(columns=lambda key: f'{column}_{key}'))

        # Original columns first, then the parsed columns in model order.
        combined = pd.concat(frames, axis=1)
        combined.to_csv(f'analyzed_batch_{batch_id}.csv', index=False)

def _abstract_is_missing(x):
    """Return True when row ``x`` has no usable abstract.

    Besides the 'Not available' placeholder this also covers NaN/None and blank
    text: an empty abstract is read back from CSV as NaN and would otherwise be
    sent to the model as the literal text "nan".
    """
    abstract = x['Abstract']
    if pd.isna(abstract):
        return True
    return abstract == 'Not available' or not str(abstract).strip()


def analyze_row_with_gpt35T(llm_prompt, x):
    """Analyze one row with GPT-3.5 Turbo; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_gpt35T(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


def analyze_row_with_gpt4T(llm_prompt, x):
    """Analyze one row with GPT-4 Turbo; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_gpt4T(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


def analyze_row_with_GPT4o(llm_prompt, x):
    """Analyze one row with GPT-4o; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_GPT4o(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


def analyze_row_with_claude3_opus(llm_prompt, x):
    """Analyze one row with Claude 3 Opus; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_claude3_opus(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


def analyze_row_with_claude3_sonnet(llm_prompt, x):
    """Analyze one row with Claude 3 Sonnet; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_claude3_sonnet(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


def analyze_row_with_claude3_haiku(llm_prompt, x):
    """Analyze one row with Claude 3 Haiku; return 'NA' if the abstract is missing."""
    if _abstract_is_missing(x):
        return 'NA'
    analysis = analyze_abstract_with_claude3_haiku(llm_prompt, x['Title'], x['Abstract'])
    time.sleep(1)  # Sleep to avoid rate limits
    return analysis


# Split the loaded sample into batch files (batch_1.csv, batch_2.csv, ...);
# the analysis itself is started in the next cell.
num_batches = split_data_into_batches(df, batch_size=100) #You can choose a smaller batch size here, for example 5, to first test that the pipeline and the APIs work
print(f"Data split into {num_batches} batches.")



In [None]:
# Batch numbers to analyze: each number N reads batch_N.csv and writes analyzed_batch_N.csv.
selected_batches = [1]  # You can specify multiple batches here (e.g. [1, 2, 3])
analyze_selected_batches(selected_batches, llm_prompt)

Save the analyzed batches and conduct the data analysis in R. You can find the analyzed CSV files in the file browser in the left column.