# Embedding a whole set of pdfs in an automatic way

In [3]:
#import necessary packages
import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import subprocess
import openai
openai.api_key = ""
os.environ["OPENAI_API_KEY"] = openai.api_key

## Retrieving the original list of articles

In [4]:
# Read the XLSX file
dataframes = pd.read_excel('WhitepaperD23_classification.xlsx', sheet_name=['reviews', 'dbs', 'dbs_urls', 'tools'])

# Do further processing with the DataFrame
# For example, print the first few rows

In [5]:
file_list = os.listdir('all_papers/')
# Create a dictionary to map file names to paths
file_path_dict = {}
for file_path in file_list:
    file_name = file_path.split('/')[-1]
    file_path_dict[file_name] = f"all_papers/{file_path}"
    

# Function to find the closest matching file name
def get_closest_match(title):
    closest_match = process.extractOne(title, file_path_dict.keys(), scorer=fuzz.ratio)
    if closest_match[1] >= 50:  # Adjust the threshold as per your requirements
        return file_path_dict[closest_match[0]]
    else:
        return None

for page, df in dataframes.items():
    print(page)
    df['pdf_path'] = df['Title'].apply(get_closest_match)

reviews
dbs
dbs_urls
tools


In [6]:
### Retrieving missing pdfs
for page, df in dataframes.items():
    print(page, df['pdf_path'].isna().value_counts())
    

reviews False    86
Name: pdf_path, dtype: int64
dbs False    66
Name: pdf_path, dtype: int64
dbs_urls False    66
Name: pdf_path, dtype: int64
tools False    229
Name: pdf_path, dtype: int64


## Loading pdfs and adding metadata

In [24]:
df.loc[df.Title == 'Cancer driver drug interaction explorer', 'pdf_path'] = "/Users/fernando/Documents/Research/ChatGPT_REPO4EU/data/d23_repo4eu/data/all_papers/Hartung et al. 2022 - Cancer driver drug interaction explorer.pdf"
df.loc[df.Title == 'Drug Repositioning and Target Finding Based on Clinical Evidence', 'pdf_path']  = "/Users/fernando/Documents/Research/ChatGPT_REPO4EU/data/d23_repo4eu/data/all_papers/Kaneko and Nagashima 2020 - Drug Repositioning and Target Finding Based on Clinical Evidence.pdf"

In [25]:
df.head()

In [15]:
from langchain.document_loaders import PyPDFLoader
from tqdm import tqdm

In [16]:
list_of_documents = []

for index, row in tqdm(df.iterrows()):
    if "data" not in row['pdf_path']:
        filepath = "/Users/fernando/Documents/Research/ChatGPT_REPO4EU/data/d23_repo4eu/data/" + row['pdf_path']
    else:
        filepath = row['pdf_path']
    
    try:
        loader = PyPDFLoader(filepath)
        document = loader.load_and_split()
        
        for i in range(len(document)):
            document[i].page_content = document[i].page_content.split("\nReferences\n")[0]
            document[i].metadata.update(row.to_dict())
        list_of_documents.extend(document)
    except:
        print(f"article not found!\n{row['Title']}")

220it [02:19,  1.09it/s]Multiple definitions in dictionary at byte 0x26cf2 for key /MediaBox
Multiple definitions in dictionary at byte 0x2713f for key /MediaBox
Multiple definitions in dictionary at byte 0x272fd for key /MediaBox
Multiple definitions in dictionary at byte 0x2753e for key /MediaBox
Multiple definitions in dictionary at byte 0x276b7 for key /MediaBox
Multiple definitions in dictionary at byte 0x278e8 for key /MediaBox
Multiple definitions in dictionary at byte 0x27a6e for key /MediaBox
Multiple definitions in dictionary at byte 0x27c84 for key /MediaBox
Multiple definitions in dictionary at byte 0x27e6d for key /MediaBox
Multiple definitions in dictionary at byte 0x280a6 for key /MediaBox
Multiple definitions in dictionary at byte 0x2832f for key /MediaBox
Multiple definitions in dictionary at byte 0x285c0 for key /MediaBox
229it [02:23,  1.60it/s]


### Embedding and vector-storing

In [9]:
sublist_docs = list_of_documents[:5]

#### FAISS implementation

In [10]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

my_embedding_model = OpenAIEmbeddings()

vectordb = FAISS.from_documents(
    documents=list_of_documents,
    embedding=my_embedding_model)

In [9]:
import faiss
import pickle

def write_file(filename, content):
    with open(filename, 'wb') as file:
        file.write(content)


def read_file(filename):
    with open(filename, 'rb') as file:
        return file.read()

def store_index_in_db(index, name):
    faiss.write_index(index.index, "docs.index")
    # Open the file and dump to local storage
    write_file(f"{name}.index", read_file("docs.index"))
    index.index = None
    write_file(f"{name}.pkl", pickle.dumps(index))


def load_index_from_db(index_name):
    findex = read_file(f"{index_name}.index")

    write_file("docs.index", findex)
    index = faiss.read_index("docs.index")
    VectorDB = pickle.loads(read_file(f"{index_name}.pkl"))
    VectorDB.index = index

    return VectorDB


store_index_in_db(vectordb, "repo4euD21")

In [18]:
index = load_index_from_db("repo4euD21")


In [19]:
print(len(list_of_documents))
print(len(index.docstore._dict))

4372
4372


## Similarity searches

In [41]:
question = "what is drug repurposing?"
docs = index.similarity_search(question, k=3)
docs

[Document(page_content='3 \n 1. Introduction  1 \nDrug repurposing (DR) is the process of identifying new therapeutic applications for existing drugs 2 \n[1]. Over the past few years, pharmaceutical industries have hugely invested in the repositioning  of 3 \napproved  and withdrawn  drugs  as traditional  drug development  is an extremely  expensive, 4 \nlaborious, time -consuming, and highly failure -prone avenue  [2-6]. DR especially finds its 5 \napplication in rare and  neglected diseases where there are very few or no drugs available for 6 \ntreatment  [7]. The US -FDA has provided a list of approved pharmaceuticals that can be promising 7 \ndrug candidates for  repurposing in rare disea ses [8]. Nonetheless, it has always been an endeavour  to 8 \nidentify drugs  that are equipotent  to such orphan  drugs.  DR also finds  its application  in 9 \ninfectious  diseases such  as tuberculosis [ 9-15], HIV and other communicable diseases where multi - 10 \ndrug resistance is a  major 

## Question answering

In [53]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
import os 
import openai

openai.api_key = "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"
openai.api_key = "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"

os.environ["OPENAI_API_KEY"] = openai.api_key


my_embedding_model = OpenAIEmbeddings()
vectordb2 = FAISS.load_local("repo4euD21.index", my_embedding_model)



RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open repo4euD21.index/index.faiss for reading: Not a directory

In [49]:
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

# Build prompt

prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("""You are great at answering questions about drug repurposing in a concise\
                                                    and easy to understand manner. \
                                                    When you don't know the answer to a question you admit that you don't know\
                                                    Here is a question:\
                                                    {user_prompt}""")  
    ],
    input_variables=["user_prompt"])


label_query = prompt.format_prompt(user_prompt="Who is Yajie Meng?")



In [50]:

# Create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature = 0.0, model='gpt-3.5-turbo'),
                                  chain_type="stuff",
                                  retriever=vectordb2.as_retriever(),
                                  return_source_documents=True,
                                  verbose=True)

# Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['DOI'])
        
# Question
query = "Who is Yajie Meng?"
llm_response = qa_chain(prompt.format_prompt(user_prompt=query).to_string())
process_llm_response(llm_response)

NameError: name 'vectordb2' is not defined

## Implementing a chatbot with memory

In [45]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

llm = ChatOpenAI(temperature=0.0)
memory = ConversationBufferMemory()

conversation = ConversationChain(
    llm=llm, 
    memory = memory,
    verbose=True
)

In [46]:
memory.save_context({"input": "Not much, just hanging"}, 
                    {"output": "Cool"})


conversation.predict(input="Hi, my name is Fernando")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: Hi, my name is Fernando
AI:[0m

[1m> Finished chain.[0m


"Hello Fernando! It's nice to meet you. How can I assist you today?"

In [47]:
conversation.predict(input="What is my name?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: Hi, my name is Fernando
AI: Hello Fernando! It's nice to meet you. How can I assist you today?
Human: What is my name?
AI:[0m

[1m> Finished chain.[0m


'Your name is Fernando.'