# Extracting claims from the email data

This would create a list of claims extracted from the TREMFYA marketing email. Every claim's corresponding reference is also provided.

In [168]:
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
import re
from pdfplumber import pdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import torch
import sys,time,random
import spacy
from spacypdfreader.spacypdfreader import pdf_reader

claims = pd.read_excel(r'data\Tremfya_email_claims.xlsx')
claims.head()

# using regex extract all text not between []
ext_claims = []
ext_refs = []
for i in range(len(claims)):
    my_str = claims.iloc[i].squeeze()
    ext_claims.append(re.sub(r'\[.*?\]', '', my_str))
    ext_refs.append(re.findall(r'\[.*?\]', my_str)[-1])

claims['text'] = ext_claims
claims['text'] = claims['text'].str.replace('\n', '').str.strip()
claims['reference'] = ext_refs
claims[['text', 'reference']].head(10)


Unnamed: 0,text,reference
0,When your patient presents with moderate-to-se...,[PM p. 4A]
1,TREMFYA®/TREMFYA ONE-PRESS® (guselkumab inject...,[PM p. 4A]
2,HEAD-TO-HEAD RESULTS: TREMFYA® 48-WEEK EFFICAC...,"[ECLIPSE p 5A, 5B, 6A]"
3,"In a phase 3, multicentre, randomized, double-...",[COPY]
4,<“PASI 90” icon> TREMFYA® demonstrated a super...,"[ECLIPSE p 5A, 5B, 6A]"
5,TREMFYA®: THE FIRST IL-23 INHIBITOR WITH INDIC...,"[DOF Letter, March 2, 2021, p1A]"
6,Indication not previously mentioned and clinic...,[PM p. 4B]
7,Do not initiate treatment in patients with any...,[PM p. 7A]
8,Discontinue treatment if patient develops a se...,[PM p. 7B]
9,Evaluate patients for tuberculosis infection p...,[PM p. 7C]


In [169]:
def progressBar(count_value, total, suffix=''):
    bar_length = 100
    filled_up_Length = int(round(bar_length* count_value / float(total)))
    percentage = round(100.0 * count_value/float(total),1)
    bar = '=' * filled_up_Length + '-' * (bar_length - filled_up_Length)
    sys.stdout.write('[%s] %s%s ...%s\r' %(bar, percentage, '%', suffix))
    sys.stdout.flush()

# Loading the llm model here
# llm = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")

def get_query():
    query = input("Enter your question\n")
    progressBar(1, 7)
    return query


def load_split_pdf(pdf_path):
    pdf_loader = PdfReader(open(pdf_path, "rb"))
    pdf_text = ""
    for page_num in range(len(pdf_loader.pages)):
        pdf_page = pdf_loader.pages[page_num]
        pdf_text += pdf_page.extract_text()
    progressBar(2, 7)
    return pdf_text

def load_pdf_spacy(pdf_path):
    nlp = spacy.load('en_core_web_sm')
    doc = pdf_reader(pdf_path, nlp)
    return doc



def split_text_using_RCTS(pdf_text):
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2096,
    chunk_overlap=1024, #not sure about the effect of these
    keep_separator=True #not sure about the effect of these
    )
    split_texts = text_splitter.split_text(pdf_text)
    paragraphs = []
    for text in split_texts:
        # text.replace(r'(?<!\n)(\n)(?=\s*[A-Z][a-z])', '\n')
        splitted_text = text.split('\n\n')
        splitted_text = [text.replace('\n', '').strip() for text in splitted_text]
        paragraphs.extend(splitted_text) 
    progressBar(3, 7)
    return paragraphs, split_texts



def Initialize_sentence_transformer():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedder = SentenceTransformer(model_name)
    progressBar(3, 7)
    return embedder


def encode_each_paragraph(paragraphs, embedder):
    corpus_embeddings = embedder.encode(paragraphs, convert_to_tensor=True)
    progressBar(5, 7)
    return corpus_embeddings

def choose_most_relevant_sentence(embedder, corpus_embeddings, query, k:int=5):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    top_k = min(k, len(corpus_embeddings))

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    progressBar(6, 7)
    return top_results



def find_paragraph_in_pdf(paragraph, pdf_doc):
    for page_number in range(1, pdf_doc._.page_range[1] + 1):
        page = pdf_doc._.page(page_number)
        # the replace would cause the index to be off a bit but it's the only way to deal with the new lines in middle of the paragraph
        res = page.text.replace('\n','').find(paragraph)
        if res != -1:
            return page_number, res, res + len(paragraph)
    return None

def print_results(query, top_results, paragraphs, pdf_doc):
    progressBar(7, 7)
    print("Query (claim):", query)
    print("\nTop 10 most similar sentences in pdf:")
    for score, idx in zip(top_results[0], top_results[1]):
        print(f"(Score: {score:.4f})", paragraphs[idx],)
        res = find_paragraph_in_pdf(paragraphs[idx], pdf_doc)
        if res:
            print(f"Found in page {res[0]} (start: {res[1]}, end: {res[2]})")
        else:
            print("Not found in pdf")


In [145]:
# # Using spacy, made the paragraphs much accurate
# pdf_path = 'data/MASTER_TremfyaPM_08Nov2022_annotated.pdf'
# # doc = load_pdf_spacy(pdf_path)
# text = doc._.page(4).text
# paragraphs, split_texts = split_text_using_RCTS(text)
# paragraphs

# Read pdf and find similar sentences to claim

By specifying the row number in that initial claims data set and giving the input location of the pdf, the code will extract the reference for that claim using semantic similarity and would find the most similar sentence in the pdf with it's page number and location in the page.

The code would also provide the similarity score between the claim and the sentence and print the top 10 results based on that.

A few claims were tested and the results were found to be accurate.


In [164]:
pdf_path = 'data/MASTER_TremfyaPM_08Nov2022_annotated.pdf'
query = claims['text'][1]
pdf_doc = load_pdf_spacy(pdf_path)
paragraphs, _ = split_text_using_RCTS(pdf_doc.text)
embedder = Initialize_sentence_transformer()
corpus_embedding = encode_each_paragraph(paragraphs=paragraphs, embedder=embedder)
top_results = choose_most_relevant_sentence(embedder, corpus_embedding, query, k = 10)
print_results(query, top_results, paragraphs, pdf_doc)



Query: TREMFYA®/TREMFYA ONE-PRESS® (guselkumab injection) is indicated for the treatment of adult patients with moderate-to-severe plaque psoriasis who are candidates for systemic therapy or phototherapy.2

Top 10 most similar sentences in corpus:
(Score: 0.9558) TREMFYA®/TREMFYA One-Press® (guselkumab injection) should be prescribed by physicians who have sufficient knowledge of plaque psoriasis or psoriatic arthritis and who have fully familiarized themselves with the efficacy/safety profile of the drug.
Found in page 1 (start: 275, end: 522)
(Score: 0.8822) TREMFYA®/TREMFYA One-Press® (guselkumab injection) is indicated for:
Found in page 4 (start: 67, end: 135)
(Score: 0.8822) TREMFYA®/TREMFYA One-Press® (guselkumab injection) is indicated for:
Found in page 4 (start: 67, end: 135)
(Score: 0.8822) TREMFYA®/TREMFYA One-Press® (guselkumab injection) is indicated for:
Found in page 4 (start: 67, end: 135)
(Score: 0.8822) TREMFYA®/TREMFYA One-Press® (guselkumab injection) is indicated 

In [166]:
# now we can use the same code to find the relevant paragraph for the other queries
i = 3
query = claims['text'][i]
print('Query reference:', claims['reference'][i])
top_results = choose_most_relevant_sentence(embedder, corpus_embedding, query, k = 10)
print_results(query, top_results, paragraphs, pdf_doc)

Query reference: [COPY]

Query: In a phase 3, multicentre, randomized, double-blind, comparator-controlled study (ECLIPSE) with fixed-sequence non-inferiority/superiority testing:

Top 10 most similar sentences in corpus:
(Score: 0.5537) VOYAGE 1 A phase 3, multicenter, randomized, double-blind, placebo and active comparator controlled study
Found in page 17 (start: 1131, end: 1236)
(Score: 0.5537) VOYAGE 1 A phase 3, multicenter, randomized, double-blind, placebo and active comparator controlled study
Found in page 17 (start: 1131, end: 1236)
(Score: 0.5448) A phase 3, multicenter, randomized, double-blind, placebo-controlled study
Found in page 21 (start: 2276, end: 2350)
(Score: 0.5448) A phase 3, multicenter, randomized, double-blind, placebo-controlled study
Found in page 21 (start: 2276, end: 2350)
(Score: 0.5448) A phase 3, multicenter, randomized, double-blind, placebo-controlled study
Found in page 21 (start: 2276, end: 2350)
(Score: 0.5427) VOYAGE 2 A phase 3, multicenter, randomized, double-blind, p

In [167]:
i = 6
query = claims['text'][i]
print('Query reference:', claims['reference'][i])
top_results = choose_most_relevant_sentence(embedder, corpus_embedding, query, k = 10)
print_results(query, top_results, paragraphs, pdf_doc)

Query reference: [PM p. 4B]
Query: Indication not previously mentioned and clinical use:TREMFYA®/TREMFYA ONE-PRESS® is also indicated for the treatment of adult patients with active psoriatic arthritis. TREMFYA®/TREMFYA ONE-PRESS® can be used alone or in combination with a conventional disease-modifying antirheumatic drug (cDMARD) (e.g., methotrexate).

Top 10 most similar sentences in corpus:
(Score: 0.9521) the treatment of adult patients with active psoriatic arthritis. TREMFYA®/TREMFYA One-Press® can be used alone or in combination with a conventional disease-modifying antirheumatic drug (cDMARD) (e.g., methotrexate).
Found in page 4 (start: 355, end: 571)
(Score: 0.9521) the treatment of adult patients with active psoriatic arthritis. TREMFYA®/TREMFYA One-Press® can be used alone or in combination with a conventional disease-modifying antirheumatic drug (cDMARD) (e.g., methotrexate).
Found in page 4 (start: 355, end: 571)
(Score: 0.9489) TREMFYA®/TREMFYA One-Press® is used to trea