In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA


##### Read PDFs from directory

In [5]:
# Read every PDF found under ./abstracts, then cut the loaded documents
# into overlapping chunks (size 1000, overlap 200) for embedding.
loader = PyPDFDirectoryLoader("./abstracts")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(documents)

# Last expression of the cell: displays the complete list of chunks.
docs

[Document(page_content='Chronic Cigarette Smoke-Induced Epigenomic Changes Precede \nSensitization of Bronchial Epithelial Cells to Single Step \nTransformation by KRAS  Mutations\nMichelle Vaz1, Stephen Y Hwang1, Ioannis Kagiampakis1, Jillian Phallen1, Ashwini Patil2, \nHeather M O’Hagan3, Lauren Murphy1, Cynthia A Zahnow1, Edward Gabrielson4, Victor E \nVelculescu1, Hariharan P Easwaran1,5, and Stephen B Baylin1,5,6\n1Department of Oncology, The Sidney Kimmel Comprehensive Cancer Center, The Johns \nHopkins University School of Medicine, Baltimore, MD 21287, USA.\n2Krieger School of Arts and Sciences, Baltimore, MD 21218, USA\n3Medical Sciences, Indiana University School of Medicine, Bloomington, IN 47405, USA Melvin \nand Bren Simon Cancer Center, Indianapolis, IN 46202, USA\n4Department of Pathology, The Johns Hopkins University School of Medicine, Baltimore, MD \n21287\nSUMMARY\nWe define how chronic cigarette smoke-induced time-dependent epigenetic alterations can', metadata={'so

##### Create embeddings

In [43]:
# BGE embedding model, run on the CPU, asked to normalise its output vectors.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

  from tqdm.autonotebook import tqdm, trange


In [44]:
import numpy as np

# Sanity check: embed the text of the first chunk and display the vector
# as a NumPy array.
np.asarray(embeddings.embed_query(docs[0].page_content))

array([ 3.50231095e-03,  1.21326803e-03, -4.24672700e-02, -3.18344794e-02,
        6.85387403e-02, -1.35410978e-02,  6.97116181e-02, -1.08452002e-02,
        3.17623541e-02, -1.43871587e-02, -3.50103080e-02, -4.77693751e-02,
       -3.97049785e-02,  2.01160237e-02,  1.23318825e-02,  1.57423280e-02,
        4.80135866e-02,  2.19877213e-02, -2.09611412e-02,  9.60289128e-03,
       -1.62273925e-02,  1.64985587e-03,  3.49574350e-02,  1.79122966e-02,
        4.16326486e-02, -3.84121835e-02,  7.84762669e-03, -1.98261924e-02,
       -3.70973051e-02,  4.15858924e-02,  7.58353667e-03, -7.26739690e-03,
        2.47193128e-02, -7.39467563e-03,  3.05940658e-02,  6.66504428e-02,
       -3.62948403e-02, -7.62054371e-03,  1.71883758e-02, -3.32807004e-03,
       -3.80035602e-02, -1.67258251e-02,  1.93804689e-02, -4.97406758e-02,
       -2.97006462e-02, -3.08749196e-03, -1.67792458e-02,  7.16347992e-02,
        2.58623436e-02, -3.08769159e-02,  7.99884647e-03,  3.98144079e-03,
        4.66150865e-02,  

##### Create vector store

In [47]:
# Embed every chunk with `embeddings` and index the vectors in FAISS.
vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=embeddings,
)

##### Query against documents

In [49]:
# Ask the vector store for the chunks most similar to the question and
# print the text of the top-ranked one.
query = "What tumors frequently occur in smokers?"

relevant_docs = vectorstore.similarity_search(query)

print(relevant_docs[0].page_content)


Vaz et al. show that long-term exposure of untransformed human bronchial epithelial cells to 
cigarette smoke condensate induces epigenetic changes, which are consistent with those 
commonly seen in smoking related non-small cell lung cancer, that sensitize the cells to 
transformation with a single KRAS mutation.
INTRODUCTION
It is well established that chronic exposure to various forms of stress can cause epigenetic as 
well as genetic alterations ultimately leading to the development of cancer. Cigarette smoke 
plays a key role in the development of lung cancer, which remains the leading cause of 
cancer-related deaths worldwide ( Torre et al., 2015 ). The effect of cigarette smoke and its 
components in contributing to epigenetic changes in lung cancer is well documented 
(Belinsky et al., 2002 ; Damiani et al., 2008 ; Liu et al., 2010 ; Tellez et al., 2011 ; Tessema et 
al., 2014 ). In addition, a number of mutations seen in lung cancer patients are attributed to


##### Create retriever

In [50]:
# Wrap the vector store as a retriever that returns the 3 most similar chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x28f6d6d10> search_kwargs={'k': 3}


##### HuggingFace LLM

In [55]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a readable message when the token is missing.
# The previous `os.environ[...] = os.getenv(...)` was a no-op when the variable
# was set and raised "TypeError: str expected, not NoneType" when it was not.
if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; define it in the environment "
        "or in a .env file before running the LLM cells."
    )

In [None]:
# NOTE(review): this cell's execution count is "None", so no run of it was
# saved with the notebook — its behaviour is unverified here.
from langchain_community.llms import HuggingFaceHub

# LLM handle for the hosted "mistralai/Mistral-7B-v0.1" repository.
# Presumably authenticates through the HUGGINGFACEHUB_API_TOKEN environment
# variable handled in the previous cell — TODO confirm.
# NOTE(review): newer LangChain releases reportedly deprecate HuggingFaceHub
# in favour of HuggingFaceEndpoint — confirm against the installed version.
hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    # NOTE(review): text-generation endpoints commonly expect "max_new_tokens";
    # verify that "max_length" is actually honoured for this model.
    model_kwargs={"temperature":0.1,"max_length":500}

)
# Same question as in the similarity-search cell, sent to the LLM on its own:
# only the query string is passed, no retrieved chunks.
query="What tumors frequently occur in smokers?"
hf.invoke(query)