In [3]:
# add open ai secret key to the environment
import os, re
from secret import MY_API_KEY
import spacy
from spacypdfreader.spacypdfreader import pdf_reader
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from pathlib import Path

document_store = InMemoryDocumentStore()
pdf_path = './data/MASTER_TremfyaPM_08Nov2022_annotated.pdf'


 
def load_pdf_spacy(pdf_path):
    nlp = spacy.load('en_core_web_sm')
    doc = pdf_reader(pdf_path, nlp)
    return doc

spacy_doc = load_pdf_spacy(pdf_path)

docs = []
RE_EXCESS_NEWLINE = re.compile(r"\n(?=[a-z])") # A \n followed by a lowercase character

for p in range(1, spacy_doc._.page_range[1] + 1):
    docs.append(Document(
        content=RE_EXCESS_NEWLINE.sub("", spacy_doc._.page(p).text), 
        meta={
            'source': spacy_doc._.pdf_file_name,
            'page': p, 
            'drug-name': 'Tremfya',
            'material-type': 'PM', # can use marketing material, etc
            }))


In [4]:
splitter = DocumentSplitter(split_by='passage', split_length= 4, split_overlap=1) # initial 2 and 0
final_docs = splitter.run(docs)
print("Initial number of pages: ", len(docs))
print("Final number of chunks: ", len(final_docs['documents']))

Initial number of pages:  47
Final number of chunks:  290


In [5]:
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.types import DuplicatePolicy

doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()

docs_with_embeddings = doc_embedder.run(final_docs['documents'])
document_store.write_documents(docs_with_embeddings["documents"], policy=DuplicatePolicy.SKIP)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

290

# Initialzing the Retrieval System




In [6]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
import os
from getpass import getpass
from haystack.components.generators import OpenAIGenerator

os.environ["OPENAI_API_KEY"] = MY_API_KEY

text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template)
generator = OpenAIGenerator(model="gpt-3.5-turbo", generation_kwargs={'temperature': 0.2})
# from openai import OpenAI
# client = OpenAI()

# completion = client.chat.completions.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ]
# )

# print(completion.choices[0].message)

basic_rag_pipeline = Pipeline()
# Add components to your pipeline
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
basic_rag_pipeline.add_component("llm", generator)

# Now, connect the components to each other
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")
basic_rag_pipeline.warm_up()

In [7]:
question = "What are the relevant warnings, side effects, and precautions for Tremfya?"

response = basic_rag_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


1. Do not use Tremfya if you have any symptoms of infection unless instructed by your healthcare provider.
2. Look out for infections and allergic reactions.
3. Common side effects may include injection site reactions, respiratory tract infections, and increased transaminases.
4. Less common clinical trial adverse reactions may include candida infections, migraine, and urticaria.
5. Geriatric patients may have limited data on the use of Tremfya.
6. Tremfya is contraindicated in patients with known serious hypersensitivity to guselkumab or any of the components.


In [8]:
response

  'meta': [{'model': 'gpt-3.5-turbo-0125',
    'index': 0,
    'finish_reason': 'stop',
    'usage': {'completion_tokens': 136,
     'prompt_tokens': 1397,
     'total_tokens': 1533}}]}}

In [9]:
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
prompt_builder = PromptBuilder(template=template)

# test the output of retriver
basic_rag_pipeline = Pipeline()
# Add components to your pipeline
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
# basic_rag_pipeline.add_component("llm", generator)

# Now, connect the components to each other
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
# basic_rag_pipeline.connect("prompt_builder", "llm")

question = "The indications of Tremfya. Tremfya is used to cure"
response = basic_rag_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
print(response['prompt_builder']['prompt'])


Given the following information, answer the question.

Context:

    Look out for infections and allergic reactions

 Do not use TREMFYA®/TREMFYA One-Press® if you have any symptoms of infection 
unless you are instructed by your healthcare provider.

TRE11082022CPM_SNDS 259046.docx
EDMS-ERI-121169578 v15.0

Page 28 of 47



    Infections

TREMFYA®/TREMFYA One-Press® is a selective immunomodulatory agent which has the potential to increase the risk of infection. Infections have been observed in clinical trials in plaque psoriasis (23% vs 21% for placebo; ≤ 0.2% serious infections in both groups) and psoriatic arthritis (21% in both TREMFYA® and placebo groups; ≤ 0.8% serious infections in both groups). The most common type of infection reported was respiratory tract infection. (See 
8 ADVERSE REACTIONS, Infections)

Treatment with TREMFYA®/TREMFYA One-Press® should not be initiated in patients with any clinically important active infection until the infection resolves or is adequate

# Create Template for email

* Head or Hero Section: The introductory portion that grabs attention.
* Body: Includes the core message and call-to-actions (CTAs).
* References/Footnotes: Any references or additional information links.
* Important Safety Information: Mandatory for compliance with health regulations.
* Footer: Contains generic brand or company information, which remains constant across most emails.


In [11]:
approved_claims = [
    'When your patient presents with moderate-to-severe plaque psoriasis, SAY TREMFYA®',
    'TREMFYA®/TREMFYA ONE-PRESS® (guselkumab injection) is indicated for the treatment of adult patients with moderate-to-severe plaque psoriasis who are candidates for systemic therapy or phototherapy.',
    'TREMFYA® demonstrated a superior PASI 90 response vs. COSENTYX at Week 48 (ITT population)',
    'Indication not previously mentioned and clinical use:TREMFYA®/TREMFYA ONE-PRESS® is also indicated for the treatment of adult patients with active psoriatic arthritis. TREMFYA®/TREMFYA ONE-PRESS® can be used alone or in combination with a conventional disease-modifying antirheumatic drug (cDMARD) (e.g., methotrexate).'
]

user_inputs = {
    'brand': 'Tremfya',
    'email_title': 'PLACEHOLDER_TITLE',
    'region': 'NORTH AMERICA',
    'user_type': 'HCP', # or PATIENT
    'brand_voice': 'FORMAL', # or INFORMAL
    'email_goal': 'EDUCATIONAL', # or PROMOTIONAL, AWARNESS, Call to Action
}

NULL = 'NULL'
output_template = {
    'title': user_inputs['email_title'],
    'body': NULL,
    'reference': NULL, # source names of the files used in RAG or filtered based on user_inputs
    'safety': NULL,
    'footer': NULL
}

llm_system_prompt = 'You are a marketer working for a pharmaceutical company.\
    Your job is to generate core marketing claims based of a Product monograph. \
    The facts stated in the product monograph are dry material and must be turned into marketing \
    content while maintaining regulatory compliance and key factual information.'

Outcome of meeting

Three stage process

dry material in product monograph - 1st stage
marketing jargonned marketing claims dervied from the monograph - 2nd stage
final email = contains a combination of 2nd (claims) and safety or reference(from 1st stage) - 3rd stage


I can experiment :
1- how to get from 1st to 2nd using LLM when prompting the whole or part of the PM to the model--- can we get new claims out of it?
2- use the already used claims in email and get the safety information from the PM (basically populate the email for now)


experiment 1:
Tested prompt engineering with this context

System: You are a marketer working for a pharmaceutical company. Your job is to generate core marketing claims based of a Product monograph. The facts stated in the product monograph are dry material and must be turned into marketing content while maintaining regulatory compliance and key factual information. 

Based of the Product Monograph (PM) stated below. Extract at least one marketing claim for that drug.

PM:
''' '''
Claims:
1- When your patient presents with moderate-to-severe plaque psoriasis, SAY TREMFYA®
2- TREMFYA®/TREMFYA ONE-PRESS® (guselkumab injection) is indicated for the treatment of adult patients with moderate-to-severe plaque psoriasis who are candidates for systemic therapy or phototherapy.

-------------------------
Possible use of BM25 for retrieval of the relevant information from the PM
https://haystack.deepset.ai/tutorials/34_extractive_qa_pipeline
And then use the 

In [10]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
import os
from getpass import getpass
from haystack.components.generators import OpenAIGenerator

os.environ["OPENAI_API_KEY"] = MY_API_KEY

text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

template = """

Product Monograph:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Based of the Product Monograph (PM) stated Above. Extract at least three marketing claim for that drug.

Example:
{% for claim in approved_claims %}
    {{ loop.index }}. {{ claim }}
{% endfor %}

Marketing Claims:
"""

prompt_builder = PromptBuilder(template=template)
generator = OpenAIGenerator(model="gpt-3.5-turbo",system_prompt=llm_system_prompt, generation_kwargs={'temperature': 0.2})


basic_rag_pipeline = Pipeline()
# Add components to your pipeline
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
basic_rag_pipeline.add_component("llm", generator)

# Now, connect the components to each other
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")

question = "The indications of Tremfya"
response = basic_rag_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {'approved_claims': approved_claims}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
response['llm']['replies'][0].split('\n')

['1. Trust TREMFYA®/TREMFYA One-Press® to effectively treat moderate-to-severe plaque psoriasis in adult patients who are suitable for systemic therapy or phototherapy.',
 '',
 '2. Experience the power of TREMFYA® in achieving superior PASI 90 response compared to placebo, showcasing its efficacy in managing plaque psoriasis.',
 '',
 '3. Choose TREMFYA®/TREMFYA One-Press® for the treatment of active psoriatic arthritis in adult patients, either as a standalone therapy or in combination with a conventional disease-modifying antirheumatic drug for comprehensive care.']

In [12]:
response['llm']['replies'][0].split('\n')

['1. Trust TREMFYA®/TREMFYA One-Press® to effectively treat moderate-to-severe plaque psoriasis in adult patients who are suitable for systemic therapy or phototherapy.',
 '',
 '2. Experience the power of TREMFYA® in achieving superior PASI 90 response compared to placebo, showcasing its efficacy in managing plaque psoriasis.',
 '',
 '3. Choose TREMFYA®/TREMFYA One-Press® for the treatment of active psoriatic arthritis in adult patients, either as a standalone therapy or in combination with a conventional disease-modifying antirheumatic drug for comprehensive care.']

In [13]:
# build a retrieval pipeline to extract page number of a claim
find_ref_pipeline = Pipeline()
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5)
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

# Add components to your pipeline
find_ref_pipeline.add_component("text_embedder", text_embedder)
find_ref_pipeline.add_component("retriever", retriever)

# Now, connect the components to each other
find_ref_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
find_ref_pipeline.warm_up()

In [14]:
claim_to_ref = "Don't let infections hold you back - Choose TREMFYA®/TREMFYA One-Press® for effective treatment of moderate-to-severe plaque psoriasis."
response = find_ref_pipeline.run({"text_embedder": {"text": claim_to_ref}})
# response['retriever']['documents']
for doc in response['retriever']['documents']:
    print(doc.content)
    print("PAGE NUMBER#################",doc.meta['page'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Infections

TREMFYA®/TREMFYA One-Press® is a selective immunomodulatory agent which has the potential to increase the risk of infection. Infections have been observed in clinical trials in plaque psoriasis (23% vs 21% for placebo; ≤ 0.2% serious infections in both groups) and psoriatic arthritis (21% in both TREMFYA® and placebo groups; ≤ 0.8% serious infections in both groups). The most common type of infection reported was respiratory tract infection. (See 
8 ADVERSE REACTIONS, Infections)

Treatment with TREMFYA®/TREMFYA One-Press® should not be initiated in patients with any clinically important active infection until the infection resolves or is adequately treated. 

Instruct patients treated with TREMFYA®/TREMFYA One-Press® to seek medical advice if signs or symptoms of clinically important chronic or acute infection occur. If a patient develops a clinically important or serious infection or is not responding to standard therapy, monitor the patient closely and discontinue TREMFY

In [15]:
response['retriever']['documents']


[Document(id=bab7f7b5b440b549990301cd019bab27869925a502cdf08a760c4959bd324577, content: 'Infections
 
 TREMFYA®/TREMFYA One-Press® is a selective immunomodulatory agent which has the potentia...', meta: {'source': 'data\\MASTER_TremfyaPM_08Nov2022_annotated.pdf', 'page': 7, 'drug-name': 'Tremfya', 'material-type': 'PM', 'source_id': '45cb171b161c23ef54900256b6d98e39b56197f40a61c98a959883bf7895bd98'}, score: 0.8596294450483619),
 Document(id=b7b59f961463db37ef36084a7f6a107c90574673e9b43ba50d0d13cfdfb47966, content: 'TREMFYA®/TREMFYA One-Press® (guselkumab injection) is indicated for:
 the treatment of adult patients...', meta: {'source': 'data\\MASTER_TremfyaPM_08Nov2022_annotated.pdf', 'page': 4, 'drug-name': 'Tremfya', 'material-type': 'PM', 'source_id': 'd12990e5789bb24e8ca21bd0650ef098ba098108e743b7614f0fdefe5a53b3c1'}, score: 0.8395703991704484),
 Document(id=8aea02a424d2618d401448d938167770499b536a5ad0bb5383ad508a14774be9, content: 'Read this carefully before you start taking TREM

In [1]:
from haystack.components.readers import ExtractiveReader
import accelerate
# Extractive reader

reader = ExtractiveReader()
reader.warm_up()

reader.run(query="What are the indications of the Tremfya drug?", documents=final_docs['documents'], top_k=2)

NameError: name 'final_docs' is not defined

In [13]:
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
retriever = BM25Retriever(document_store=document_store)


ModuleNotFoundError: No module named 'haystack.nodes'