In [None]:
import os
import openai
import pandas as pd
from tqdm import tqdm
import pickle
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import json
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Credentials: prefer an OPENAI_API_KEY that is already set in the environment so a real
# key never has to be saved inside the notebook; the placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-XXXXXXX")
openai_api_key = openai.api_key
os.environ["OPENAI_API_KEY"] = openai.api_key

# "gpt-3.5-turbo" is a chat-completion model and cannot serve embedding requests, so the
# embedding wrapper is pointed at an embedding model instead.
my_embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
my_llm_model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)



def split_documents(documents):
    """Split documents into overlapping text chunks that keep their parent's metadata.

    :param documents: iterable of langchain ``Document`` objects. Each one must carry the
        metadata keys ``title``, ``abstract``, ``year``, ``contributors``, ``screening1``,
        ``screening2`` and ``refid``.
    :return: flat list of chunk ``Document`` objects. Every chunk copies the metadata keys
        above from its parent and adds ``chunk`` (0-based position within the parent) and
        ``source`` (``"<page>-<chunk>"``).
    :raises KeyError: if a document lacks one of the required metadata keys.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=400,
    )

    doc_chunks = []
    for parent in documents:
        # The parent keeps its own name: reusing one variable for both the parent and the
        # freshly built chunk made later chunks read their metadata from the previous chunk.
        for i, chunk in enumerate(text_splitter.split_text(parent.page_content)):
            metadata = {
                "title": parent.metadata["title"],
                "chunk": i,
                "abstract": parent.metadata["abstract"],
                "year": parent.metadata["year"],
                "contributors": parent.metadata["contributors"],
                "screening1": parent.metadata["screening1"],
                "screening2": parent.metadata["screening2"],
                "refid": parent.metadata["refid"],
            }
            # Add the source as metadata. "page" is not one of the copied keys, so it is
            # looked up on the parent and falls back to the reference id when absent.
            page = parent.metadata.get("page", metadata["refid"])
            metadata["source"] = f"{page}-{i}"
            doc_chunks.append(Document(page_content=chunk, metadata=metadata))

    # The chunks were previously collected but never handed back to the caller.
    return doc_chunks

def write_file(filename, content):
    """Write the bytes in ``content`` to ``filename``, replacing any existing file."""
    with open(filename, mode='wb') as sink:
        sink.write(content)


def read_file(filename):
    """Return the complete contents of ``filename`` as bytes."""
    with open(filename, mode='rb') as source:
        payload = source.read()
    return payload


def docs_to_index(docs, openai_api_key):
    """Embed ``docs`` with OpenAI embeddings and return a searchable FAISS vector store.

    :param docs: document chunks to index.
    :param openai_api_key: key handed to the ``OpenAIEmbeddings`` client.
    :return: the vector store built by ``FAISS.from_documents``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_documents(docs, embedder)

def store_index_in_db(index, name):
    """Persist a vector store to disk as ``{name}.index`` plus ``{name}.pkl``.

    The raw FAISS index is written with ``faiss.write_index``; the surrounding vector
    store object is pickled separately with its ``index`` attribute detached.

    :param index: vector store whose ``index`` attribute holds the raw FAISS index.
    :param name: path prefix (without extension) for the two output files.
    """
    target_dir = os.path.dirname(name)
    if target_dir:
        # Create the destination folder up front instead of failing on the first write.
        os.makedirs(target_dir, exist_ok=True)

    # Write straight to the destination; going through a shared "docs.index" scratch file
    # in the working directory added a copy and could be clobbered by another run.
    faiss.write_index(index.index, f"{name}.index")

    raw_index = index.index
    index.index = None  # pickle the wrapper without the raw FAISS index
    try:
        write_file(f"{name}.pkl", pickle.dumps(index))
    finally:
        # Hand the caller's object back intact; it used to be left with index=None.
        index.index = raw_index


def load_index_from_db(index_name):
    """Rebuild a vector store from ``{index_name}.index`` and ``{index_name}.pkl``.

    :param index_name: path prefix (without extension) of the two stored files.
    :return: the unpickled vector store with its raw FAISS index re-attached.
    """
    # Read the stored index in place rather than copying it to a shared "docs.index"
    # scratch file first.
    raw_index = faiss.read_index(f"{index_name}.index")

    # NOTE(security): unpickling runs arbitrary code from the file; only load .pkl files
    # produced by this notebook.
    vector_db = pickle.loads(read_file(f"{index_name}.pkl"))
    vector_db.index = raw_index
    return vector_db

## Loading the data

In [None]:
# Load the preprocessed article table from its pickle file.
with open("data/preprocessed_articles.pkl", "rb") as input_file:
    reviewdf = pickle.load(input_file)

# Extract the "enlEndNote<digits>" token from every raw record and keep it as the article id,
# then report how many distinct ids were found.
pattern = r'(enlEndNote\d+)'
reviewdf['uniqueid'] = reviewdf['record'].str.extract(pattern)
print(len(reviewdf['uniqueid'].unique()))


# Distribution of each screening label, followed by screening2 counts within each screening1 value.
for screening_col in ('screening1', 'screening2'):
    print(reviewdf[screening_col].value_counts())
print(reviewdf.groupby('screening1')['screening2'].value_counts())

In [None]:
# Print the shape of the dataframe 'reviewdf' to see its dimensions (rows x columns)
print(reviewdf.shape)

# Goal: identify columns that hold a distinct value in every row, i.e. columns that could
# serve as an article ID. The row count is taken from the dataframe itself rather than
# hard-coded, so the check stays correct if the dataset changes size.
n_rows = len(reviewdf)

for column in reviewdf.columns:
    n_unique = len(reviewdf[column].unique())
    if n_rows and n_unique == n_rows:
        # Positional access: the label 0 is not guaranteed to exist in the index.
        first_value = reviewdf[column].iloc[0]
        # The length of the first value hints at the kind of identifier (hash vs. numeric id);
        # values without a length (e.g. plain numbers) are reported as "n/a".
        first_len = len(first_value) if hasattr(first_value, "__len__") else "n/a"
        print(f"{column}\t{n_unique}:\t{first_len}")


# Find the length of the largest string in 'record' column
record_lengths = reviewdf['record'].str.len()
max_length = record_lengths.max()

print(max_length)
# Get the records with the longest string length
longest_records = reviewdf[record_lengths == max_length]
print(longest_records['record'].to_string())

### Embedding publications data

In [None]:
# Derive the article id from the "enlEndNote<digits>" token embedded in each raw record.
pattern = r'(enlEndNote\d+)'
reviewdf['uniqueid'] = reviewdf['record'].str.extract(pattern)

# Columns carried into the embedding step. Every name appears once: 'date' used to be
# listed twice, which duplicated that column in the selection.
selected_cols = [
    'uniqueid', 'record', 'database', 'source-app', 'rec-number', 'foreign-keys', 'key',
    'ref-type', 'contributors', 'titles', 'title', 'secondary-title', 'short-title',
    'pages', 'volume', 'number', 'edition', 'date', 'dates', 'year', 'pub-dates',
    'isbn', 'accession-num', 'urls', 'electronic-resource-num', 'remote-database-name',
    'authors', 'author', 'related-urls', 'url',
]
article_collection = reviewdf[selected_cols].copy()

article_collection

We divide the database into batches of 20 articles so that they can be analyzed batch by batch. Each batch is embedded individually and saved to disk as pickle files.

In [None]:
chaintype = 'stuff'
my_llm_model = ChatOpenAI(model_name='gpt-3.5-turbo-0613', temperature=0)

# The splitter has to exist at notebook level: the only other one in this notebook is a
# local variable inside split_documents(), so this cell raised NameError on a fresh kernel.
# The settings mirror that function's splitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_overlap=400,
)

# Make sure the output folder exists before the first batch is written.
os.makedirs("vectorsdbs2", exist_ok=True)

for start in tqdm(range(0, len(article_collection), 20)):
    # `end` is not clamped to the table length; it is only used for slicing (where an
    # overshoot is harmless) and for naming the batch files.
    end = start + 20
    articles_chunk = article_collection[start:end].copy()

    # One document per article: 'record' is the text, every other column becomes metadata.
    loader = DataFrameLoader(articles_chunk, page_content_column="record")
    documents = loader.load()
    documents = text_splitter.split_documents(documents)

    vectordb = docs_to_index(documents, openai_api_key)
    store_index_in_db(vectordb, f"vectorsdbs2/collection_{start}_{end}")
    # Save the matching dataframe slice next to the index so record ids can be recovered later.
    articles_chunk.to_pickle(f"vectorsdbs2/df_collection_{start}_{end}.pkl")
    

### Defining checkpoints schema

#### Analyzing chunks with the original criteria


In [None]:
def is_json(data):
    """Tell whether ``data`` can be parsed as JSON.

    :param data: candidate JSON text (``str``, ``bytes`` or ``bytearray``).
    :return: True if ``json.loads`` accepts ``data``; False otherwise, including for
        inputs of an unsupported type such as ``None``.
    """
    try:
        json.loads(data)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input, which
        # previously escaped as an exception instead of yielding False.
        return False
    return True

def restructure_dict(output, key_renaming):
    """Regroup a flat parser result into one ``{'label', 'reason'}`` entry per checkpoint.

    Keys of ``output`` are first renamed through ``key_renaming`` (keys without a mapping
    are kept as they are). Every renamed key of the form ``checkpoint_<name>`` is then
    paired with ``reason_<name>``.

    :param output: flat dict, e.g. ``{'checkpoint1': ..., 'reason1': ...}``.
    :param key_renaming: dict mapping raw keys to ``checkpoint_<name>`` / ``reason_<name>``.
    :return: ``{<name>: {'label': ..., 'reason': ...}}``.
    :raises KeyError: if a ``checkpoint_<name>`` key has no matching ``reason_<name>``.
    """
    renamed = {key_renaming.get(old_key, old_key): value for old_key, value in output.items()}

    prefix = 'checkpoint_'
    new_dict = {}
    for key, label in renamed.items():
        # Match on the exact prefix: a substring test also let through keys that were
        # never renamed (no underscore to split on).
        if not key.startswith(prefix):
            continue
        # Strip only the prefix so names that themselves contain "_" stay whole.
        checkpoint_name = key[len(prefix):]
        new_dict[checkpoint_name] = {
            'label': label,
            'reason': renamed['reason_' + checkpoint_name],
        }
    return new_dict

def check_inclusion_criteria(article_dict):
    """
    Check if an article meets the inclusion criteria.

    Only entries shaped like ``{'label': ..., 'reason': ...}`` are evaluated; other
    entries (such as a ``'sources'`` list stored next to the checkpoints) are ignored.
    A checkpoint counts as failed only when its label is an explicit false: the boolean
    ``False`` or the text "false" in any letter case. Any other label counts as met.

    :param article_dict: dictionary containing analysis of an article.
    :return: tuple ``(included, response_boolean_vector)`` where ``included`` is True if
        no checkpoint failed and ``response_boolean_vector`` lists the per-checkpoint
        outcomes in dictionary order.
    """
    def _label_is_met(label):
        # bool('False') is True, so text labels have to be compared, not cast.
        if isinstance(label, bool):
            return label
        return str(label).strip().lower() != 'false'

    # Read from the argument; the vector used to be built from an unrelated global.
    response_boolean_vector = [
        _label_is_met(values['label'])
        for values in article_dict.values()
        if isinstance(values, dict) and 'label' in values
    ]

    # The article is included only if every evaluated checkpoint was met.
    return all(response_boolean_vector), response_boolean_vector

# Original screening criteria: maps each checkpoint name to the True/False instruction
# given to the LLM. Names and instructions are interpolated into the prompt and the response
# schema built below, and the names become the keys of each per-article result.
checkpoints_dict = {
    # Fixed the truncated opening word ("f the ..." -> "If the ...") of this instruction.
    "Population": "If the study population comprises patients with musculoskeletal conditions, with no majority having another primary disease or intellectual disabilities, then return True. Otherwise, return False.",

    "Intervention":  "If physiotherapists provided one of the intervention/control group treatments alone, then return True. If the treatment of interest was offered by an interdisciplinary team, non-health care professionals, or mostly by a different profession, then return False. If the intervention combines physiotherapy with another treatment and the other treatment is provided in a comparator group, then return True. If the study evaluates the economic aspects of E-interventions, digital interventions or eHealth interventions, then return False",

    "Control Group": "If there is a control group of any type - for example, wait and see, usual care, placebo, or alternative treatments, then return True. Otherwise, return False.",

    "Outcome": "If the outcome of the study involves or allows a full economic evaluation, potentially including cost-effectiveness ratios and cost-utility ratios or if the study provides information on the costs and clinical effects of a treatment, then return True. Otherwise, return False.",

    "study type": "If the article is not a conference abstract, review, study without results (like a protocol), or model-based study, then return True. Otherwise, return False.",
}



# Two schema entries per checkpoint, in order: "checkpoint<i>" carries the verdict and
# "reason<i>" its justification (numbering starts at 1).
response_schemas = []
for i, (checkpoint, description) in enumerate(checkpoints_dict.items(), 1):
    response_schemas += [
        ResponseSchema(
            name=f"checkpoint{i}",
            description=f"True/False, depending on whether {checkpoint} applies to the text.",
        ),
        ResponseSchema(
            name=f"reason{i}",
            description=f"The reason for the decision made on {checkpoint}. {description}",
        ),
    ]

# Parser that turns the LLM reply back into a dict following the schema above, plus the
# formatting instructions that are injected into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Single human message: fixed task description, then the format instructions, then the
# checkpoint list supplied through `user_prompt`.
checkpoint_message = HumanMessagePromptTemplate.from_template("""Given the following checkpoints with their description from the user, \
                                                   assess whether they apply or not to the text you receive as input below \
                                                   and provide a brief explanation on the why. \n
                                                    {format_instructions}\n{user_prompt}""")
prompt = ChatPromptTemplate(
    messages=[checkpoint_message],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions},
)

# Map the numbered schema keys back to keys that carry the checkpoint name.
keys_list = list(checkpoints_dict.keys())
key_renaming = {}
for i, checkpoint_key in enumerate(keys_list, 1):
    key_renaming[f"checkpoint{i}"] = f"checkpoint_{checkpoint_key}"
    key_renaming[f"reason{i}"] = f"reason_{checkpoint_key}"

# Render the checkpoint list and the final query sent to the QA chain.
checkpoint_lines = [f"{key} : {value}" for key, value in checkpoints_dict.items()]
labels_from_user = "\n%CHECKPOINTS:\n\n" + '\n\n'.join(checkpoint_lines)
label_query = prompt.format_prompt(user_prompt=labels_from_user)

In [None]:
# Batch tags of the form "<start>_<end>", one per block of 20 articles; `end` is start + 20
# and is not clamped to the table length.
filelist = [f"{start}_{start + 20}" for start in range(0, len(article_collection), 20)]

In [None]:
# Accumulators filled by the analysis loop: parsed answers keyed by record id, and the
# ids of records whose analysis raised an error.
record2answer = dict()
missing_records = list()

In [None]:
chaintype = 'stuff'
my_llm_model = ChatOpenAI(model_name='gpt-3.5-turbo-0613', temperature=0)
selected_k = 5
selected_fetch_k = 20

# The checkpoint pickles written after every batch live here.
os.makedirs('results', exist_ok=True)

for file in tqdm(filelist):
    db_chunk = pd.read_pickle(f"vectorsdbs2/df_collection_{file}.pkl")
    rec_numbers = db_chunk['uniqueid'].to_list()
    myindex = load_index_from_db(f"vectorsdbs2/collection_{file}")

    for recnumber in rec_numbers:
        # Skip records answered in an earlier run of this cell before building a chain.
        if recnumber in record2answer:
            continue

        # Restrict retrieval to the chunks of this one article via the metadata filter.
        qa_chain = RetrievalQA.from_chain_type(llm=my_llm_model,
                                               chain_type=chaintype,
                                               retriever=myindex.as_retriever(search_type="mmr",
                                                                              search_kwargs={'fetch_k': selected_fetch_k,
                                                                                             'k': selected_k,
                                                                                             'filter': {'uniqueid': recnumber}}),
                                               return_source_documents=True,
                                               verbose=False)
        try:
            llm_response = qa_chain({"query": label_query.to_string()})
            output = output_parser.parse(llm_response['result'])
            output = restructure_dict(output, key_renaming)
            output['sources'] = llm_response['source_documents']
            record2answer[recnumber] = output
            # A record that failed earlier but succeeds now is no longer missing.
            if recnumber in missing_records:
                missing_records.remove(recnumber)
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C still stops the run, and
            # report the cause instead of dropping it silently.
            tqdm.write(f"ERROR in {recnumber}: {type(err).__name__}: {err}")
            if recnumber not in missing_records:
                missing_records.append(recnumber)

    # Checkpoint after every batch; the handles get their own names so they do not
    # overwrite the loop variable `file`.
    with open('results/record2answer_originalcrits.pkl', 'wb') as answers_file:
        pickle.dump(record2answer, answers_file)

    with open('results/missing_records_originalcrits.pkl', 'wb') as missing_file:
        pickle.dump(missing_records, missing_file)

print(f"CORRECTLY analyzed {len(record2answer)}")
print(f"INCORRECTLY analyzed {len(missing_records)}")

 

#### Analyzing chunks with the refined criteria

In [None]:
def is_json(data):
    """Tell whether ``data`` can be parsed as JSON.

    :param data: candidate JSON text (``str``, ``bytes`` or ``bytearray``).
    :return: True if ``json.loads`` accepts ``data``; False otherwise, including for
        inputs of an unsupported type such as ``None``.
    """
    try:
        json.loads(data)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input, which
        # previously escaped as an exception instead of yielding False.
        return False
    return True

def restructure_dict(output, key_renaming):
    """Regroup a flat parser result into one ``{'label', 'reason'}`` entry per checkpoint.

    Keys of ``output`` are first renamed through ``key_renaming`` (keys without a mapping
    are kept as they are). Every renamed key of the form ``checkpoint_<name>`` is then
    paired with ``reason_<name>``.

    :param output: flat dict, e.g. ``{'checkpoint1': ..., 'reason1': ...}``.
    :param key_renaming: dict mapping raw keys to ``checkpoint_<name>`` / ``reason_<name>``.
    :return: ``{<name>: {'label': ..., 'reason': ...}}``.
    :raises KeyError: if a ``checkpoint_<name>`` key has no matching ``reason_<name>``.
    """
    renamed = {key_renaming.get(old_key, old_key): value for old_key, value in output.items()}

    prefix = 'checkpoint_'
    new_dict = {}
    for key, label in renamed.items():
        # Match on the exact prefix: a substring test also let through keys that were
        # never renamed (no underscore to split on).
        if not key.startswith(prefix):
            continue
        # Strip only the prefix so names that themselves contain "_" stay whole.
        checkpoint_name = key[len(prefix):]
        new_dict[checkpoint_name] = {
            'label': label,
            'reason': renamed['reason_' + checkpoint_name],
        }
    return new_dict

def check_inclusion_criteria(article_dict):
    """
    Check if an article meets the inclusion criteria.

    Only entries shaped like ``{'label': ..., 'reason': ...}`` are evaluated; other
    entries (such as a ``'sources'`` list stored next to the checkpoints) are ignored.
    A checkpoint counts as failed only when its label is an explicit false: the boolean
    ``False`` or the text "false" in any letter case. Any other label counts as met.

    :param article_dict: dictionary containing analysis of an article.
    :return: tuple ``(included, response_boolean_vector)`` where ``included`` is True if
        no checkpoint failed and ``response_boolean_vector`` lists the per-checkpoint
        outcomes in dictionary order.
    """
    def _label_is_met(label):
        # bool('False') is True, so text labels have to be compared, not cast.
        if isinstance(label, bool):
            return label
        return str(label).strip().lower() != 'false'

    # Read from the argument; the vector used to be built from an unrelated global.
    response_boolean_vector = [
        _label_is_met(values['label'])
        for values in article_dict.values()
        if isinstance(values, dict) and 'label' in values
    ]

    # The article is included only if every evaluated checkpoint was met.
    return all(response_boolean_vector), response_boolean_vector

# Refined screening criteria: maps each checkpoint name to the True/False instruction given
# to the LLM. Names and instructions are interpolated into the prompt and the response
# schema built below, and the names become the keys of each per-article result, so both
# are kept verbatim here.
# NOTE(review): the key "Phisiotherapy and another treatment" looks like a misspelling of
# "Physiotherapy"; renaming it would change the keys of stored results - confirm before fixing.
# NOTE(review): that same instruction has no "Otherwise" branch, so the expected answer when
# its condition does not hold is left to the model - confirm this is intended.
checkpoints_dict = {
    "Population": "If the study population comprises patients with musculoskeletal conditions, with no majority having another primary disease or intellectual disabilities, then return True. Otherwise, return False.",
    
    "Intervention": "If the treatment involves physiotherapy (techniques like exercises, manual therapy, education, and modalities such as heat, cold, ultrasound, and electrical stimulation to aid in patient recovery, pain reduction, mobility enhancement, and injury prevention), or at least one of the intervention/control group treatments was provided exclusively by physiotherapists, then return True. However, if the treatment of interest was offered by an interdisciplinary team, non-health care professionals, or mostly by a different profession to physiotherapists, then return False. ",
        
    "Phisiotherapy and another treatment": "In case at least one of the intervention/control group treatments was provided exclusively by physiotherapists, if the intervention includes physiotherapy and another treatment and the other treatment is provided in a comparator group, then return True. ",
        
    "E-interventions": "If the study evaluates the economic aspects of E-interventions, digital interventions or eHealth interventions, then  return False. Otherwise, return True",
    
    "Control Group": "If there is a control group of any type - for example, wait and see, usual care, placebo, or alternative treatments, then return True. Otherwise, return False.",
    
    "Outcome": "If the outcome of the study involves or allows a full economic evaluation, potentially including cost-effectiveness ratios and cost-utility ratios or if the study provides information on the costs and clinical effects of a treatment  then return True. Otherwise, return False.",
    
    "study type": "If the article is not a conference abstract, review, study without results (like a protocol), or model-based study, then return True. Otherwise, return False.",
}



# Two schema entries per checkpoint, in order: "checkpoint<i>" carries the verdict and
# "reason<i>" its justification (numbering starts at 1).
response_schemas = []
for i, (checkpoint, description) in enumerate(checkpoints_dict.items(), 1):
    response_schemas += [
        ResponseSchema(
            name=f"checkpoint{i}",
            description=f"True/False, depending on whether {checkpoint} applies to the text.",
        ),
        ResponseSchema(
            name=f"reason{i}",
            description=f"The reason for the decision made on {checkpoint}. {description}",
        ),
    ]

# Parser that turns the LLM reply back into a dict following the schema above, plus the
# formatting instructions that are injected into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Single human message: fixed task description, then the format instructions, then the
# checkpoint list supplied through `user_prompt`.
checkpoint_message = HumanMessagePromptTemplate.from_template("""Given the following checkpoints with their description from the user, \
                                                   assess whether they apply or not to the text you receive as input below \
                                                   and provide a brief explanation on the why. \n
                                                    {format_instructions}\n{user_prompt}""")
prompt = ChatPromptTemplate(
    messages=[checkpoint_message],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions},
)

# Map the numbered schema keys back to keys that carry the checkpoint name.
keys_list = list(checkpoints_dict.keys())
key_renaming = {}
for i, checkpoint_key in enumerate(keys_list, 1):
    key_renaming[f"checkpoint{i}"] = f"checkpoint_{checkpoint_key}"
    key_renaming[f"reason{i}"] = f"reason_{checkpoint_key}"

# Render the checkpoint list and the final query sent to the QA chain.
checkpoint_lines = [f"{key} : {value}" for key, value in checkpoints_dict.items()]
labels_from_user = "\n%CHECKPOINTS:\n\n" + '\n\n'.join(checkpoint_lines)
label_query = prompt.format_prompt(user_prompt=labels_from_user)

In [None]:
# Batch tags of the form "<start>_<end>", one per block of 20 articles; `end` is start + 20
# and is not clamped to the table length.
filelist = [f"{start}_{start + 20}" for start in range(0, len(article_collection), 20)]

In [None]:
# Fresh accumulators for this run: parsed answers keyed by record id, and the ids of
# records whose analysis raised an error.
record2answer = dict()
missing_records = list()

In [None]:
chaintype = 'stuff'
my_llm_model = ChatOpenAI(model_name='gpt-3.5-turbo-0613', temperature=0)
selected_k = 5
selected_fetch_k = 20

# The checkpoint pickles written after every batch live here.
os.makedirs('results', exist_ok=True)

for file in tqdm(filelist):
    db_chunk = pd.read_pickle(f"vectorsdbs2/df_collection_{file}.pkl")
    rec_numbers = db_chunk['uniqueid'].to_list()
    myindex = load_index_from_db(f"vectorsdbs2/collection_{file}")

    for recnumber in rec_numbers:
        # Skip records answered in an earlier run of this cell before building a chain.
        if recnumber in record2answer:
            continue

        # Restrict retrieval to the chunks of this one article via the metadata filter.
        qa_chain = RetrievalQA.from_chain_type(llm=my_llm_model,
                                               chain_type=chaintype,
                                               retriever=myindex.as_retriever(search_type="mmr",
                                                                              search_kwargs={'fetch_k': selected_fetch_k,
                                                                                             'k': selected_k,
                                                                                             'filter': {'uniqueid': recnumber}}),
                                               return_source_documents=True,
                                               verbose=False)
        try:
            llm_response = qa_chain({"query": label_query.to_string()})
            output = output_parser.parse(llm_response['result'])
            output = restructure_dict(output, key_renaming)
            output['sources'] = llm_response['source_documents']
            record2answer[recnumber] = output
            # A record that failed earlier but succeeds now is no longer missing.
            if recnumber in missing_records:
                missing_records.remove(recnumber)
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C still stops the run, and
            # report the cause instead of dropping it silently.
            tqdm.write(f"ERROR in {recnumber}: {type(err).__name__}: {err}")
            if recnumber not in missing_records:
                missing_records.append(recnumber)

    # Checkpoint after every batch; the handles get their own names so they do not
    # overwrite the loop variable `file`.
    with open('results/record2answer_refinedcrits.pkl', 'wb') as answers_file:
        pickle.dump(record2answer, answers_file)

    with open('results/missing_records_refinedcrits.pkl', 'wb') as missing_file:
        pickle.dump(missing_records, missing_file)

print(f"CORRECTLY analyzed {len(record2answer)}")
print(f"INCORRECTLY analyzed {len(missing_records)}")
 