In [1]:
import os
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain

app_dir = "/Users/chloe/Documents/linkr"

# Readme.Md
loader = UnstructuredMarkdownLoader(app_dir + "/Readme.Md", encoding = "utf-8")
docs = loader.load()

# DESCRIPTION
loader = TextLoader(app_dir + "/DESCRIPTION", encoding = "utf-8")
docs.extend(loader.load())

# R help files
for dirpath, dirnames, filenames in os.walk(app_dir + "/R"):
    for file in filenames:
        if file.startswith("help_") and file.endswith(".R"):
            loader = TextLoader(os.path.join(dirpath, file), encoding = "utf-8")
            docs.extend(loader.load())

# Rd doc files
for dirpath, dirnames, filenames in os.walk(app_dir + "/man"):
    for file in filenames:
        if file.endswith(".Rd"):
            loader = TextLoader(os.path.join(dirpath, file), encoding = "utf-8")
            docs.extend(loader.load())

# R files
for dirpath, dirnames, filenames in os.walk(app_dir + "/R"):
    for file in filenames:
        if not file.startswith("help_") and file.endswith(".R"):
            loader = TextLoader(os.path.join(dirpath, file), encoding = "utf-8")
            docs.extend(loader.load())

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap  = 0, length_function = len)

documents = text_splitter.split_documents(docs)

embedding_function = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(documents, embedding_function)

llm = ChatOpenAI(model_name = "gpt-3.5-turbo", api_key = "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", temperature = 0)

qa = RetrievalQA.from_chain_type(llm, retriever = vectorstore.as_retriever())

prompt = "What is LinkR ?"
qa({"query": prompt})


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
  warn_deprecated(
  warn_deprecated(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'query': 'What is LinkR ?',
 'result': "I don't have enough context to provide a specific answer to your question about LinkR."}

## Document loading

In [2]:
docs = loader.load

In [3]:
len(documents)

4344

In [8]:
page = documents[0]

In [9]:
print(page.page_content[0:500])

LinkR

Introduction

LinkR is a web application that allows for visualization and
analysis of healthcare data.

The application is coded in R using the Shiny library. It uses the
common data model
OMOP.

Who is the application for?

The application is for:

healthcare professionals, an intuitive interface allows healthcare
  professionals to analyze data and conduct studies without advanced
  programming knowledge


In [5]:
docs.metadata

{'source': '/Users/chloe/Documents/linkr/Readme.Md'}

## Document splitting

In [28]:
app_dir = "/Users/chloe/Documents/linkr"
loader = UnstructuredMarkdownLoader(app_dir + "/Readme.Md", encoding = "utf-8")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

In [30]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [31]:
md_header_splits = markdown_splitter.split_text(txt)

In [33]:
md_header_splits[0]

Document(page_content='LinkR  \nIntroduction  \nLinkR is a web application that allows for visualization and\nanalysis of healthcare data.  \nThe application is coded in R using the Shiny library. It uses the\ncommon data model\nOMOP.  \nWho is the application for?  \nThe application is for:  \nhealthcare professionals, an intuitive interface allows healthcare\nprofessionals to analyze data and conduct studies without advanced\nprogramming knowledge  \ndata scientists and statisticians, LinkR provides access to a full\nR and Python environment, allowing data scientists and statisticians\nto exploit all the features of advanced data analysis  \nhealthcare students, integrated tutorials in the application\nprovide healthcare students with an opportunity to learn and practice\ndata analysis in the healthcare field  \nThus, LinkR facilitates collaborative work.  \nQuick overview  \nUse an intuitive graphical interface to visualize aggregated\npatient data. Generate and, if required, modify

## Vectorstores and embeddings

In [None]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key = os.environ.get('OPENAI_API_KEY')

In [46]:
app_dir = "/Users/chloe/Documents/linkr"
loaders = [
    # Duplicate documents on purpose - messy data
    UnstructuredMarkdownLoader(app_dir + "/Readme.Md", encoding = "utf-8"),
    TextLoader(app_dir + "/DESCRIPTION", encoding = "utf-8"),
    #for dirpath, dirnames, filenames in os.walk(app_dir + "/R"):
    #for file in filenames:
        #if file.startswith("help_") and file.endswith(".R"):
    TextLoader(os.path.join(dirpath, file), encoding = "utf-8")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [48]:
splits = text_splitter.split_documents(docs)

In [49]:
len(splits)

21

In [50]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(openai_api_key="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")

sentence1 = "LinkR is a medical data analysis tool"
sentence2 = "To use LinkR, use the docuementation"
sentence3 = "LinkR can be used by health professionals and patients"
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)
import numpy as np
np.dot(embedding1, embedding2)
0.9631853877103518
np.dot(embedding1, embedding3)
0.7709997651294671
np.dot(embedding2, embedding3)

  warn_deprecated(


0.8616088450916822

In [51]:
import os

# Désactiver le parallélisme pour éviter les deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [52]:
from langchain.vectorstores import Chroma
persist_directory = '/Users/chloe/Documents/linkr/docs/chroma'
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)
print(vectordb._collection.count())

1550


In [53]:
question = "is there a document where I can find help?"
docs = vectordb.similarity_search(question,k=3) # k is number of documents 
len(docs)

3

In [None]:
docs[0].page_content

'load_help_page(r)\n    \n    r$help_scripts_modal_title <- i18n$t("choose_dataset_scripts")\n    \n    if (language == "fr"){\n      r$help_scripts_modal_text <- div(\n        tags$h3(tags$i(class = "fa fa-gear", style = "color: steelblue;"), " ", strong("Configurer le set de données")),\n        p("La colonne de droite référencie les scripts ", strong("disponibles"), " pour cet set de données, non utilisés."),\n        p("La colonne de gauche référencie les scripts ", strong("utilisés"), " pour ce set de données."),\n        p("Lorsque vous chargez un set de données, tous les scripts de la colonne ", tags$em("Scripts choisis"), " seront ", strong("éxécutés au lancement du set de données"), "."),\n        p("Cliquez sur un script et glissez-le dans la colonne correspondante."),\n        tags$h3(tags$i(class = "fa fa-sd-card", style = "color: steelblue;"), " ", strong("Mémoire cache")),\n        p("L\'éxécution de certains scripts peut prendre du temps au chargement d\'un set de donnée

In [None]:
vectordb.persist()

  warn_deprecated(


In [54]:
question = "what did they say about documentation?"
docs = vectordb.similarity_search(question,k=5)

In [56]:
docs[1]

Document(metadata={'source': '/Users/chloe/Documents/linkr/R/help_settings_data_management.R'}, page_content='if (language == "fr"){\n      r[[paste0("help_settings_data_management_", prefix, "_modal_text")]] <- div(\n        tags$h3(tags$i(class = "fa fa-check", style = "color: steelblue;"), " ", strong("Version du set de données")),\n        p("Dès lors que vous ", strong("modifiez"), " les options ou le code d\'un set de données, ", strong("mettez à jour"), " la version,",\n          " ceci permettra aux utilisateurs de ", strong("mettre à jour"), " leur copie de votre set de données, s\'ils l\'ont téléchargée depuis votre dépôt git."),\n        tags$h3(tags$i(class = "fa fa-check", style = "color: steelblue;"), " ", strong("Auteur(s)")),\n        p("Indiquez ", strong("qui a contribué"), " à la création de ce set (un ou plusieurs auteurs, séparés par des virgules)."),\n        tags$h3(tags$i(class = "fa fa-check", style = "color: steelblue;"), " ", strong("Nom, catégorie et descrip

In [57]:
question = "what did they say about documentation?"
docs = vectordb.similarity_search(question,k=5)
for doc in docs:
    print(doc.metadata)

{'source': '/Users/chloe/Documents/linkr/R/help_vocabularies.R'}
{'source': '/Users/chloe/Documents/linkr/R/help_settings_data_management.R'}
{'source': '/Users/chloe/Documents/linkr/R/help_vocabularies.R'}
{'source': '/Users/chloe/Documents/linkr/R/help_settings_data_management.R'}
{'source': '/Users/chloe/Documents/linkr/R/help_vocabularies.R'}


In [58]:
print(docs[4].page_content)

tags$p(tags$i(class = "fa fa-check", style = "color: steelblue;"), " ", strong("Terminologie de droite")),
          p("Vous pouvez filtrer la terminologie de droite en cochant ", tags$em("N'afficher que les concepts utilisés"), ", afin de retirer les concepts qui n'ont jamais été utilisés dans ", strong("aucun set de données"), " chargé sur l'application."),
          p("Le nombre de lignes pour la terminologie de droite concerne ", strong("tous les sets de données"), " chargés sur l'application."),
          p("Est également renseigné le ", strong("nombre de sets de données"), " contenant chaque concept."),
          p("L'intérêt est de pouvoir identifier rapidement les ", strong("concepts fréquemment utilisés"), "."),
          p("En effet, il est parfois difficile d'identifier le ", strong("concept standard"), " correspondant au concept que l'on veut aligner."),
          p("Par exemple, la requête ", tags$a(href = "https://athena.ohdsi.org/search-terms/terms?vocabulary=LOINC&page=

## Retrieval

In [60]:
#%pip install lark

In [None]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

# Ajouter le chemin au sys.path si nécessaire
sys.path.append('../..')

# Charger le fichier .env pour les variables d'environnement
_ = load_dotenv(find_dotenv())

# Configurer la clé API OpenAI en la récupérant depuis les variables d'environnement
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [63]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = '/Users/chloe/Documents/linkr/docs/chroma'
embedding = OpenAIEmbeddings(openai_api_key="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")
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)
print(vectordb._collection.count())

1550


  warn_deprecated(


In [66]:
question = "what did they say about drugs?"
docs_ss = vectordb.similarity_search(question,k=3)

In [67]:
docs_ss[0].page_content[:100]

'tags$li(tags$em("drug_strength"), " : table gathering information about drugs")\n        ),\n        p'

In [68]:
docs_ss[1].page_content[:100]

"}\n  \n  # What's a vocabulary ?\n  \n  observeEvent(r$help_vocabularies_page_1, {\n    \n    load_help_pa"

In [69]:
#With MMR

docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)
docs_mmr[0].page_content[:100]

'tags$li(tags$em("drug_strength"), " : table gathering information about drugs")\n        ),\n        p'

In [70]:
docs_mmr[1].page_content[:100]

"}\n  \n  # What's a vocabulary ?\n  \n  observeEvent(r$help_vocabularies_page_1, {\n    \n    load_help_pa"

In [75]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [76]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [78]:
# Wrap our vectorstore
llm = OpenAI(temperature=0, openai_api_key="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")
compressor = LLMChainExtractor.from_llm(llm)

In [79]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [80]:
question = "what did they say about drugs?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

  warn_deprecated(


Document 1:

- "drug_strength"
- "table gathering information about drugs"
- "vocabulary tables"
- "OMOP documentation"
- "Request database"
- "Requêtez ici la base de données"
- "Type de connexion"
- "Base de données"
- "principale ou publique"
- "Exemple de requête"
- "nom des tables"
- "Tables de la BDD"
- "raccourcis"
----------------------------------------------------------------------------------------------------
Document 2:

- Une terminologie est un dictionnaire de concepts, associant des codes à des noms.
- Quelques exemples de concepts issus d'une terminologie :
- 42503 - Fréquence cardiaque : ici le concept ayant pour code 42503 est associé au nom Fréquence cardiaque
- 800902 - Noradrénaline : ici le concept ayant pour code 800902 est associé au nom Noradrénaline
- Un entrepôt de données de santé stocke des données en associant des valeurs à des concepts.
- Quelques exemples :
- patient 409 - datetime 13-01-2022 14:44:32 - concept 42503 - valeur 54
------------------------

## Question answering

In [81]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

# Ajouter le chemin au sys.path si nécessaire
sys.path.append('../..')

# Charger le fichier .env pour les variables d'environnement
_ = load_dotenv(find_dotenv())

# Configurer la clé API OpenAI en la récupérant depuis les variables d'environnement
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [82]:
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [83]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Initialiser les embeddings OpenAI avec la clé API
embedding = OpenAIEmbeddings(openai_api_key='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')


In [84]:
from langchain.vectorstores import Chroma

# Définir le répertoire de persistance
persist_directory = '/Users/chloe/Documents/linkr/docs/chroma'

# Créer l'instance de Chroma
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)


In [85]:
print(vectordb._collection.count())

1550


In [87]:
question = "What is LinkR?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [89]:
question = "Quelles sont les informations sur la fréquence cardiaque?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [90]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key='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', temperature=0)

In [91]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [92]:
result = qa_chain({"query": question})

In [93]:
result["result"]

'Les informations sur la fréquence cardiaque sont associées au concept ayant le code 42503.'

In [94]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [95]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [110]:
question = "Où sont les données sur la fréquence cardiaque?"

In [111]:
result = qa_chain({"query": question})

In [112]:
result["result"]

'Les données sur la fréquence cardiaque sont généralement affichées dans l\'onglet "Hémodynamique" d\'une étude structurée par onglets. Merci de poser la question!'

In [113]:
result["source_documents"][0]

Document(metadata={'source': '/Users/chloe/Documents/linkr/R/help_settings_app_database.R'}, page_content='tags$li(tags$em("drug_strength"), " : table gathering information about drugs")\n        ),\n        p("For more information on vocabulary tables, visit the ", \n          tags$a(href = "https://ohdsi.github.io/CommonDataModel/cdm60.html#CONCEPT", "OMOP documentation", target = "_blank"), "."),\n        br()\n      )\n    }\n  })\n  \n  # Request database\n  \n  observeEvent(r$help_settings_app_database_page_4, {\n    \n    load_help_page(r)\n    \n    r$help_settings_app_database_modal_title <- i18n$t("db_request_card")\n    \n    if (language == "fr"){\n      r$help_settings_app_database_modal_text <- div(\n        p(tags$i(class = "fa fa-check", style = "color: steelblue;"), " ", strong("Requêtez ici la base de données"), \n          ", en choisissant le ", tags$em("Type de connexion"), " (locale ou distante), et la ", \n          tags$em("Base de données"), " (principale ou pu

In [114]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)
result = qa_chain_mr({"query": question})
result["result"]

'Les données sur la fréquence cardiaque peuvent être trouvées dans l\'onglet "Hémodynamique" d\'un dossier clinique, ainsi que dans la rubrique "Données de fréquence cardiaque" de l\'application.'

In [116]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)
question = "Where can I find information about drugs?"
result = qa_chain({"query": question})
result["result"]

'You can find information about drugs in the `d$dataset_drug_strength` variable, which contains information about medications from the OMOP drug_strength table.'

In [120]:
import os
import openai
import sys
sys.path.append('../..')

import panel as pn  # GUI
pn.extension()
embedding = OpenAIEmbeddings(openai_api_key='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')
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key = os.environ.get('OPENAI_API_KEY')

In [121]:
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [126]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = '/Users/chloe/Documents/linkr/docs/chroma'
embedding = OpenAIEmbeddings(openai_api_key='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')
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [127]:
question = "What are major topics of this app?"
docs = vectordb.similarity_search(question,k=3)
len(docs)

3

In [129]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0, openai_api_key='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')
llm.predict("Hello world!")

  warn_deprecated(


'Hello! How can I assist you today?'

In [130]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
question = "How to analyze health data?"
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})


result = qa_chain({"query": question})
result["result"]

'To analyze health data using LinkR, users can choose to load patient-level or aggregated data and then use the intuitive graphical interface to visualize the data. They can generate and modify corresponding R code directly from the figures created. Thanks for asking!'

In [131]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [133]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)
question = "Can I analyze diseases?"
result = qa({"question": question})
result['answer']

'Yes, diseases can be analyzed using LinkR. The application allows healthcare professionals, data scientists, statisticians, and healthcare students to analyze healthcare data, which includes the ability to study diseases and their impact.'

In [135]:
question = "are there any prerequisites needed?"
result = qa({"question": question})
result['answer']

'To analyze diseases using LinkR, users would benefit from having a basic understanding of healthcare data and data analysis concepts. However, the application is designed to be user-friendly and intuitive, so advanced programming knowledge is not required. The application is suitable for healthcare professionals, data scientists, statisticians, and healthcare students, providing tutorials for learning and practicing data analysis in the healthcare field.'