In [13]:
import pandas as pd
workdir = '/Users/fernando/Documents/Research/LLM_SR_medicine/analyses/AI_healthcare/'
df = pd.read_excel(f'{workdir}original_data/240503_PubMed_preprocessed.xlsx', sheet_name='preprocessed')

# We replace nan by zero for the boolean columns
boolean_columns = ['screening1', 'screening2']
df[boolean_columns] = df[boolean_columns].fillna(0)
df[boolean_columns] = df[boolean_columns].astype(bool)
print(df[['screening1','screening2']].value_counts())
df


screening1  screening2
True        False         39
False       False         21
True        True           6
Name: count, dtype: int64


Unnamed: 0,Authors,Title,Details,PUBMEDID,A comprehensive description of an AI functionality,An evaluation of the economic efficiency and outcomes,Quantitative outcomes in at least one healthcare system.,The title did not cover a topic related to AI in healthcare,The abstract did not contain a description of an AI in healthcare application,The abstract or full text did not elaborate on the quantitative economic outcome in one healthcare system.,screening1,screening2
0,"1: Chen PJ, Lin MC, Lai MJ, Lin JC, Lu HH, Tse...",Accurate Classification of Diminutive Colorect...,2018 Feb;154(3):568-575. doi: 10.1053/j.gastro...,29042219,,,,,,X,True,False
1,"2: Willem L, Stijven S, Vladislavleva E, Broec...",Active learning to understand infectious disea...,PLoS Comput Biol. 2014 Apr 17;10(4):e1003563. ...,24743387,,,,,,X,True,False
2,"3: Lillehaug SI, Lajoie SP.",AI in medical education--another grand challen...,Artif Intell Med. 1998 Mar;12(3):197-225. Rev...,9626957,,,,,,X,True,False
3,"4: Lee HK, Jin RC, Yuan F, Bain PA, Goffinet J...",An Analytical Framework for TJR Readmission Pr...,Biomed Health Inform. 2018 Jul 25. doi: 10.110...,30047916,X,X,X,,,,True,True
4,"5: Li X, Jia W, Yang Z, Li Y, Yuan D, Zhang H,...",Application of Intelligent Recommendation Tec...,Front Psychiatry. 2018 Sep 4;9:415. doi: 10.33...,30233432,,,,X,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
61,"8: Jelovsek JE, Chagin K, Gyhagen M, Hagen S, ...",Predicting risk of pelvic floor disorders 12 a...,Am J Obstet Gynecol. 2018 Feb;218(2):222.e1-22...,29056536,,,,X,,,False,False
62,"9: Shahinfar S, Page D, Guenther J, Cabrera V,...",Prediction of insemination outcomes in Holstei...,J Dairy Sci. 2014 Feb;97(2):731-42. doi: 10.3...,24290820,,,,X,,,False,False
63,"1: Lee I, Monahan S, Serban N, Griffin PM, Tom...",Estimating the Cost Savings of Preventive Dent...,Health Serv Res. 2018 Oct;53(5):3592-3616. doi...,29194610,X,x,X,,,,True,True
64,"2: Brady ES, Leider JP, Resnick BA, Alfonso YN...",Machine-Learning Algorithms to Code Public Hea...,Public Health Rep. 2017 May/Jun;132(3):350-356...,28363034,,,,,,X,True,False


In [14]:
from Bio import Entrez

# Always provide your email
Entrez.email = "fernando.miguel.delgado-chaves@uni-hamburg.de"

def fetch_details(pubmed_ids):
    ids = ','.join(pubmed_ids)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    records = Entrez.read(handle)
    titles = []
    abstracts = []
    
    for record in records['PubmedArticle']:
        # Extract the title
        title = record['MedlineCitation']['Article']['ArticleTitle']
        # Extract the full abstract, concatenating all parts
        abstract_parts = record['MedlineCitation']['Article'].get('Abstract', {}).get('AbstractText', [])
        full_abstract = ' '.join([part for part in abstract_parts])
        titles.append(title)
        abstracts.append(full_abstract)
    return titles, abstracts

# Fetch details for all PubMed IDs
if not df.empty:
    df['PUBMEDID'] = df['PUBMEDID'].astype(str)  # Ensure IDs are in string format
    titles, abstracts = fetch_details(df['PUBMEDID'].tolist())
    df['Full_title'] = titles
    df['Abstract'] = abstracts

In [15]:
df = df[['Authors', 'Title',  'Full_title', 'Abstract', 'Details', 'PUBMEDID',
       'A comprehensive description of an AI functionality ',
       'An evaluation of the economic efficiency and outcomes',
       'Quantitative outcomes in at least one healthcare system. ',
       'The title did not cover a topic related to AI in healthcare',
       'The abstract did not contain a description of an AI in healthcare application',
       'The abstract or full text did not elaborate on the quantitative economic outcome in one healthcare system.',
       'screening1', 'screening2']]

df['Record'] = df['Full_title'].fillna('') + " " + df['Abstract'].fillna('')
df.to_pickle(f"{workdir}preprocessed_articles_filtered.pkl")
df.to_excel(f"{workdir}preprocessed_articles_filtered.xlsx")
print(df['screening1'].value_counts())
print(df['screening2'].value_counts())


screening1
True     45
False    21
Name: count, dtype: int64
screening2
False    60
True      6
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Record'] = df['Full_title'].fillna('') + " " + df['Abstract'].fillna('')


In [16]:
# Counting NaNs per column
nan_count = df.isna().sum()
print(nan_count)

Authors                                                                                                        0
Title                                                                                                          0
Full_title                                                                                                     0
Abstract                                                                                                       0
Details                                                                                                        0
PUBMEDID                                                                                                       0
A comprehensive description of an AI functionality                                                            50
An evaluation of the economic efficiency and outcomes                                                         62
Quantitative outcomes in at least one healthcare system.                                        

In [17]:
checkpoints_dict = {
    "AI Functionality Description": "Return True if the study provides a comprehensive description of an AI functionality used in healthcare; otherwise, return False.",
    "Economic Evaluation": "Return True if the study evaluates the economic efficiency and outcomes of an AI application in healthcare, specifically assessing cost-effectiveness or return on investment; otherwise, return False.",
    "Quantitative Healthcare Outcomes": "Return True if the study reports quantitative outcomes in at least one healthcare system, showing measurable impacts such as patient recovery times, treatment efficacy, or cost savings; otherwise, return False.",
    "Relevance to AI in Healthcare": "Return False if the title of the study does not explicitly cover a topic related to AI in healthcare, indicating the study is not primarily focused on AI applications within healthcare; otherwise, return True.",
    "AI Application Description": "Return False if the abstract does not contain a description of an AI application in healthcare, indicating a lack of focus on how AI technologies are implemented or their functional roles within healthcare; otherwise, return True.",
    "Economic Outcome Details": "Return False if the abstract or full text does not elaborate on the quantitative economic outcomes in one healthcare system, failing to provide specific economic data or analysis related to the AI application; otherwise, return True."
}