In [1]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.docstore.document import Document
from openai import OpenAI  # Import the synchronous OpenAI client
import openai

# Load environment variables from the .env file
load_dotenv()

# Initialize the local file store for caching
store = LocalFileStore("./cache/")

# Initialize the OpenAI embedding model
openai_api_key = os.getenv("OPENAI_API_KEY")
embedder = OpenAIEmbeddings(api_key=openai_api_key)

# Step 1: Load the CSV file
csv_path = 'abstracts_output.csv'  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)

# Step 2: Prepare documents and metadata
texts = df['text'].tolist()
metadata = [{"authors": row['authors'], "title": row['title'], "source": row['source'], "year": row['year']} for _, row in df.iterrows()]

# Convert each text and its metadata into a Document object
documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadata)]

# Step 3: Create the FAISS vector store from the documents and cache-backed embedder
vector_store = FAISS.from_documents(documents, embedder)

# Initialize the OpenAI client
client = OpenAI(api_key=openai_api_key)

# Step 4: Use the vector store to retrieve the top 5 relevant documents
query = "Summarize the most relevant abstracts"  # Replace this with your actual query
top_k_documents = vector_store.similarity_search(query, k=2)

# Step 5: Generate the report using RAG

def generate_report(client, top_k_documents):
    final_report = "### AI Summary of Surveillance Data\n"
    references = []

    try:
        # Iterate through the retrieved documents
        for doc in top_k_documents:
            text = doc.page_content
            meta = doc.metadata

            # Prepare the prompt for summarization
            summary_prompt = "You are helping doctors use research to determine if the antibiotic is appropriate for the infection, summarize the following text in a concise paragraph form, ensuring that all sentences are complete and properly structured:"
            summary_response = client.chat_completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": f"{summary_prompt}\n{text}"}],
                max_tokens=100,
                temperature=0.1
            )
            summary_result = summary_response.choices[0].message.content.strip()

            # Ensure the summary ends with a complete sentence by avoiding any dangling punctuation
            summary_result = re.sub(r'\.\s*$', '', summary_result)

            # Extract metadata for APA citation
            authors = meta['authors'].split(", ")
            year = meta['year']
            title = meta.get('title', 'Untitled')
            source = meta.get('source', '')

            if len(authors) == 1:
                citation_in_text = f"({authors[0]}, {year})"
            elif len(authors) == 2:
                citation_in_text = f"({authors[0]} & {authors[1]}, {year})"
            else:  # Three or more authors
                citation_in_text = f"({authors[0]} et al., {year})"

            # Format the full reference in APA style with the title included
            if len(authors) == 1:
                citation_full = f"{authors[0]}. ({year}). {title}. {source}."
            elif len(authors) == 2:
                citation_full = f"{authors[0]} & {authors[1]}. ({year}). {title}. {source}."
            else:
                citation_full = f"{authors[0]}, {authors[1]}, & {authors[2]}. ({year}). {title}. {source}."

            # Add the full reference to the references list
            references.append(citation_full)

            # Combine the summary and the APA citation, and put a full stop after the citation
            paragraph = f"{summary_result} {citation_in_text}."

            # Append this paragraph to the final report
            final_report += paragraph + "\n\n"

        # Add the references section to the final report
        final_report += "\n\n### References\n"
        for ref in references:
            final_report += f"{ref}\n\n"

        return final_report
    except Exception as e:
        return f"Error generating report: {e}"

# Generate the report
final_report = generate_report(client, top_k_documents)

# Display the generated report
print(final_report)


  warn_deprecated(


Error generating report: 'OpenAI' object has no attribute 'chat_completions'


In [8]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.docstore.document import Document
from openai import OpenAI

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI embedding model
openai_api_key = os.getenv("OPENAI_API_KEY")
embedder = OpenAIEmbeddings(api_key=openai_api_key)

# Step 1: Load the CSV file
csv_path = 'abstracts_output.csv'  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)

# Step 2: Prepare documents and metadata
texts = df['text'].tolist()
metadata = [{"authors": row['authors'], "title": row['title'], "source": row['source'], "year": row['year']} for _, row in df.iterrows()]

# Convert each text and its metadata into a Document object
documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadata)]

# Step 3: Create the FAISS vector store from the documents and cache-backed embedder
vector_store = FAISS.from_documents(documents, embedder)

# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)

# Step 4: Use the vector store to retrieve the top 5 relevant documents
query = "Summarize the most relevant abstracts"  # Replace this with your actual query
top_k_documents = vector_store.similarity_search(query, k=2)

# Step 5: Generate the report using RAG

def generate_report(client, top_k_documents):
    final_report = "### AI Summary of Surveillance Data\n"
    references = []

    try:
        # Iterate through the retrieved documents
        for doc in top_k_documents:
            text = doc.page_content
            meta = doc.metadata

            # Prepare the prompt for summarization
            summary_prompt = "You are helping doctors use research to determine if the antibiotic is appropriate for the infection. Summarize the following text in a concise paragraph form, ensuring that all sentences are complete and properly structured:"
            
            # Use the new OpenAI client API call
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": f"{summary_prompt}\n{text}"}],
                model="gpt-4",  # You can use "gpt-4" or "gpt-3.5-turbo"
                max_tokens=100,
                temperature=0.1
            )
            
            # Access the result via the attribute
            summary_result = chat_completion.choices[0].message.content.strip()

            # Ensure the summary ends with a complete sentence by avoiding any dangling punctuation
            summary_result = re.sub(r'\.\s*$', '', summary_result)

            # Extract metadata for APA citation
            authors = meta['authors'].split(", ")
            year = meta['year']
            title = meta.get('title', 'Untitled')
            source = meta.get('source', '')

            if len(authors) == 1:
                citation_in_text = f"({authors[0]}, {year})"
            elif len(authors) == 2:
                citation_in_text = f"({authors[0]} & {authors[1]}, {year})"
            else:  # Three or more authors
                citation_in_text = f"({authors[0]} et al., {year})"

            # Format the full reference in APA style with the title included
            if len(authors) == 1:
                citation_full = f"{authors[0]}. ({year}). {title}. {source}."
            elif len(authors) == 2:
                citation_full = f"{authors[0]} & {authors[1]}. ({year}). {title}. {source}."
            else:
                citation_full = f"{authors[0]}, {authors[1]}, & {authors[2]}. ({year}). {title}. {source}."

            # Add the full reference to the references list
            references.append(citation_full)

            # Combine the summary and the APA citation, and put a full stop after the citation
            paragraph = f"{summary_result} {citation_in_text}."

            # Append this paragraph to the final report
            final_report += paragraph + "\n\n"

        # Add the references section to the final report
        final_report += "\n\n### References\n"
        for ref in references:
            final_report += f"{ref}\n\n"

        return final_report
    except Exception as e:
        return f"Error generating report: {e}"

# Generate the report
final_report = generate_report(client, top_k_documents)

# Display the generated report
print(final_report)


### AI Summary of Surveillance Data
The study aimed to identify predictive factors for antimicrobial resistance and develop a treatment algorithm for afebrile outpatients with complicated cystitis. A retrospective study was conducted on 2,891 outpatients diagnosed with afebrile complicated cystitis from 2012 to 2018. The study used univariate analyses and multivariable regression models to predict resistance to various antibiotics. The results showed that the overall prevalence of resistance for the antibiotics ranged from 6.9% to 25 (Cooley LF et al., 2020).

The study suggests that repurposing FDA-approved compounds could be the quickest way to reduce the impact of diseases caused by flaviviruses. Three fluoroquinolones, enoxacin, difloxacin, and ciprofloxacin, were found to limit the replication of flaviviruses such as Zika, dengue, Langat, and Modoc in HEK-293 cells. Enoxacin was found to suppress Zika virus replication at an intermediate stage, while cipro (Scroggs SLP et al., 202

In [9]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.docstore.document import Document
from openai import OpenAI

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI embedding model
openai_api_key = os.getenv("OPENAI_API_KEY")
embedder = OpenAIEmbeddings(api_key=openai_api_key)

# Step 1: Load the CSV file
csv_path = 'abstracts_output.csv'  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)

# Step 2: Prepare documents and metadata
texts = df['text'].tolist()
metadata = [{"authors": row['authors'], "title": row['title'], "source": row['source'], "year": row['year']} for _, row in df.iterrows()]

# Convert each text and its metadata into a Document object
documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadata)]

# Step 3: Create the FAISS vector store from the documents and cache-backed embedder
vector_store = FAISS.from_documents(documents, embedder)

# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)

# Function to answer open-ended questions using retrieved documents
def answer_question(client, query):
    # Retrieve the top 5 relevant documents
    top_k_documents = vector_store.similarity_search(query, k=5)

    # Combine the content of the retrieved documents for context
    context = "\n\n".join([doc.page_content for doc in top_k_documents])

    # Prepare the prompt for the open-ended question
    open_ended_prompt = f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    # Use the OpenAI client to generate the answer based on the query and the context
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": open_ended_prompt}],
        model="gpt-4",  # You can use "gpt-4" or "gpt-3.5-turbo"
        max_tokens=300,
        temperature=0.1
    )

    # Extract the answer from the response
    answer = response.choices[0].message.content.strip()

    return answer

# Example of using the function to answer a question
question = "What are the key findings related to antibiotic resistance?"
answer = answer_question(client, question)

# Display the generated answer
print(f"Question: {question}\nAnswer: {answer}")


Question: What are the key findings related to antibiotic resistance?
Answer: 1. Antimicrobial resistance (AMR) in Neisseria gonorrhoeae is a serious global public health problem. Resistance to ceftriaxone, the last option for first-line empirical monotherapy of gonorrhoea, has been reported from many countries. In 2018, the first gonococcal isolates with ceftriaxone resistance plus high-level azithromycin resistance were identified in England and Australia.

2. In most countries, resistance to ciprofloxacin is exceedingly high, azithromycin resistance is present and decreased susceptibility or resistance to ceftriaxone has emerged.

3. For afebrile outpatients presenting with complicated cystitis, overall prevalence of resistance for trimethoprim-sulfamethoxazole, ciprofloxacin, nitrofurantoin, first-generation cephalosporin, and third-generation cephalosporin was 25.6%, 19.5%, 19.1%, 15.0%, and 6.9%, respectively.

4. Resistance to ciprofloxacin in the rectal vaults of older northwes

In [10]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.docstore.document import Document
import random

# Load the new CSV files
fda_df = pd.read_csv('FDA.csv')
aware_df = pd.read_csv('WHO_AWARE.csv', encoding='ISO-8859-1')

# List of antibiotics with formatting for FDA.csv where spaces are replaced with dots
antibiotics = [
    'Amikacin', 'Amoxycillin.clavulanate', 'Ampicillin', 'Azithromycin', 'Cefepime', 
    'Cefoxitin', 'Ceftazidime', 'Ceftriaxone', 'Clarithromycin', 'Clindamycin', 
    'Erythromycin', 'Imipenem', 'Levofloxacin', 'Linezolid', 'Meropenem', 
    'Metronidazole', 'Minocycline', 'Penicillin', 'Piperacillin.tazobactam', 
    'Tigecycline', 'Vancomycin', 'Ampicillin.sulbactam', 'Aztreonam', 
    'Aztreonam.avibactam', 'Cefixime', 'Ceftaroline', 'Ceftaroline.avibactam', 
    'Ceftazidime.avibactam', 'Ciprofloxacin', 'Colistin', 'Daptomycin', 
    'Doripenem', 'Ertapenem', 'Gatifloxacin', 'Gentamicin', 'Moxifloxacin', 
    'Oxacillin', 'Quinupristin.dalfopristin', 'Sulbactam', 'Teicoplanin', 
    'Tetracycline', 'Trimethoprim.sulfa', 'Ceftolozane.tazobactam', 
    'Cefoperazone.sulbactam', 'Meropenem.vaborbactam', 'Cefpodoxime', 
    'Ceftibuten', 'Ceftibuten.avibactam', 'Tebipenem'
]

# Randomly select an antibiotic from the list
selected_antibiotic = random.choice(antibiotics)

# Filter rows for the selected antibiotic from both FDA.csv and WHO_AWARE.csv
selected_fda_df = fda_df[fda_df['Antibiotic'] == selected_antibiotic.replace(' ', '.')]
selected_aware_df = aware_df[aware_df['Antibiotic'] == selected_antibiotic]

# Combine rows from both datasets
background_info = ""
if not selected_fda_df.empty:
    background_info += "FDA Data:\n" + selected_fda_df.to_string(index=False) + "\n\n"
if not selected_aware_df.empty:
    background_info += "WHO AWARE Data:\n" + selected_aware_df.to_string(index=False) + "\n\n"

# Print out the background information for testing
background_info


"FDA Data:\n  Antibiotic                 Brand Name               Generic Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from openai import OpenAI
import spacy

# Load spaCy's English tokenizer
nlp = spacy.load('en_core_web_sm')

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI embedding model
openai_api_key = os.getenv("OPENAI_API_KEY")
embedder = OpenAIEmbeddings(api_key=openai_api_key)

# Load the CSV files (FDA, WHO AWARE, and QA data)
fda_df = pd.read_csv('FDA.csv')
aware_df = pd.read_csv('WHO_AWARE.csv', encoding='ISO-8859-1')
qa_df = pd.read_csv('antibiotics_qa.csv')

# List of antibiotics
antibiotics = [
    'Amikacin', 'Amoxycillin.clavulanate', 'Ampicillin', 'Azithromycin', 'Cefepime', 
    'Cefoxitin', 'Ceftazidime', 'Ceftriaxone', 'Clarithromycin', 'Clindamycin', 
    'Erythromycin', 'Imipenem', 'Levofloxacin', 'Linezolid', 'Meropenem', 
    'Metronidazole', 'Minocycline', 'Penicillin', 'Piperacillin.tazobactam', 
    'Tigecycline', 'Vancomycin', 'Ampicillin.sulbactam', 'Aztreonam', 
    'Aztreonam.avibactam', 'Cefixime', 'Ceftaroline', 'Ceftaroline.avibactam', 
    'Ceftazidime.avibactam', 'Ciprofloxacin', 'Colistin', 'Daptomycin', 
    'Doripenem', 'Ertapenem', 'Gatifloxacin', 'Gentamicin', 'Moxifloxacin', 
    'Oxacillin', 'Quinupristin.dalfopristin', 'Sulbactam', 'Teicoplanin', 
    'Tetracycline', 'Trimethoprim.sulfa', 'Ceftolozane.tazobactam', 
    'Cefoperazone.sulbactam', 'Meropenem.vaborbactam', 'Cefpodoxime', 
    'Ceftibuten', 'Ceftibuten.avibactam', 'Tebipenem'
]

# Randomly select an antibiotic
selected_antibiotic = antibiotics[0]  # For this example, we're just picking the first one

# Filter rows for the selected antibiotic from both FDA.csv and WHO_AWARE.csv
selected_fda_df = fda_df[fda_df['Antibiotic'] == selected_antibiotic.replace(' ', '.')]
selected_aware_df = aware_df[aware_df['Antibiotic'] == selected_antibiotic]

# Combine rows from both datasets to create background information
background_info = ""
if not selected_fda_df.empty:
    background_info += "FDA Data:\n" + selected_fda_df.to_string(index=False) + "\n\n"
if not selected_aware_df.empty:
    background_info += "WHO AWARE Data:\n" + selected_aware_df.to_string(index=False) + "\n\n"

# Load abstracts for FAISS vector store
csv_path = 'abstracts_output.csv'  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)
texts = df['text'].tolist()
metadata = [{"authors": row['authors'], "title": row['title'], "source": row['source'], "year": row['year']} for _, row in df.iterrows()]

# Convert each text and its metadata into a Document object
documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadata)]

# Create the FAISS vector store from the documents
vector_store = FAISS.from_documents(documents, embedder)

# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)


  warn_deprecated(


In [2]:
# Tokenize text using spaCy
def spacy_tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Function to generate an answer using OpenAI
def answer_question(client, question, background_info):
    # Retrieve the top 3 relevant documents to reduce memory consumption
    top_k_documents = vector_store.similarity_search(question, k=3)

    # Combine the content of the retrieved documents for context
    context = "\n\n".join([doc.page_content for doc in top_k_documents])

    # Prepare the prompt for the question with background information
    open_ended_prompt = f"Use the following context and background information to answer the question:\n\n{context}\n\nBackground Information:\n{background_info}\n\nQuestion: {question}\nAnswer:"

    # Use the OpenAI client to generate the answer based on the question and context
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": open_ended_prompt}],
        model="gpt-4",  # You can use "gpt-4" or "gpt-3.5-turbo"
        max_tokens=300,
        temperature=0.1
    )

    # Extract the generated answer
    generated_answer = response.choices[0].message.content.strip()

    return generated_answer


In [None]:
# Pick one question from the QA dataset
question_row = qa_df.iloc[0]  # For this example, we are selecting the first question

question = question_row['Question']
reference_answer = question_row['Answer']

# Generate the answer using RAG
generated_answer = answer_question(client, question, background_info)

# Create a DataFrame to store the result
result_df = pd.DataFrame([{
    'Question': question,
    'Reference Answer': reference_answer,
    'Generated Answer': generated_answer
}])

# Save the result to a CSV file
output_csv = 'output_qa_with_one_answer.csv'
result_df.to_csv(output_csv, index=False)

print(f"Saved the answer to {output_csv}")
