In [1]:
!pip install "pydantic>=2.0,<2.12.3" "gradio>=5.0"

Collecting pydantic<2.12.3,>=2.0
  Downloading pydantic-2.12.2-py3-none-any.whl.metadata (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.8/85.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic-core==2.41.4 (from pydantic<2.12.3,>=2.0)
  Downloading pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Downloading pydantic-2.12.2-py3-none-any.whl (460 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.6/460.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_core-2.41.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydantic-core, pydantic
  Attempting uninstall: pydantic-co

In [1]:
!pip install -U --quiet \
    langchain \
    langchain-community \
    langchain-openai \
    langchain-huggingface \
    langchain-text-splitters \
    faiss-cpu \
    pypdf \
    sentence-transformers \
    pydantic


In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# RetrievalQA is imported from langchain_classic (not from langchain)
from langchain_classic.chains import RetrievalQA

# PromptTemplate is likewise imported from langchain_classic
from langchain_classic.prompts import PromptTemplate

In [3]:
# Colab-only: open the browser file picker and save the selected files into the
# runtime. Uploading a name that already exists is stored as a suffixed copy
# (e.g. "name (1).txt"), as this cell's output shows.
# NOTE(review): the loading cell further down reads every .pdf/.txt it finds via
# os.listdir(), so such copies are presumably ingested as duplicates — confirm
# and remove stale copies before re-running.
from google.colab import files
uploaded = files.upload()

Saving pcos-diagnosisAndTreatment.txt to pcos-diagnosisAndTreatment (1).txt
Saving pcos-symptomsAndCauses.txt to pcos-symptomsAndCauses (1).txt


In [7]:
###################################---Loading Documents---#######################################

In [4]:
# Load every PDF and plain-text file found in the current working directory
# into `documents`, a flat list of whatever each loader's .load() returns.
documents = []

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the document order (and everything derived from it) vary between runs.
for file in sorted(os.listdir()):
    # Compare the extension case-insensitively so "REPORT.PDF" is picked up too.
    extension = os.path.splitext(file)[1].lower()
    if extension == ".pdf":
        loader = PyPDFLoader(file)
    elif extension == ".txt":
        # Explicit encoding so the result does not depend on the platform's
        # default text encoding.
        loader = TextLoader(file, encoding="utf-8")
    else:
        continue  # not a supported document type
    documents.extend(loader.load())

print(f"Loaded {len(documents)} documents")

Loaded 2 documents


In [6]:
###################################---Turning into Chunks---#######################################

In [5]:
# Text splitter configuration: chunk_size=600 with chunk_overlap=100 between
# neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)

In [6]:
# Split the loaded documents into chunks using the splitter configured above.
chunks = splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

# Preview the first chunk as a quick sanity check.
first_chunk = chunks[0]
print(f"The first chunk:\n{first_chunk}")

Created 108 chunks
The first chunk:
page_content='There's no single test to specifically diagnose polycystic ovary syndrome (PCOS). Your health care provider is likely to start with a discussion of your symptoms, medications and any other medical conditions. Your provider also may ask about your menstrual periods and any weight changes. A physical exam includes checking for signs of excess hair growth, insulin resistance and acne.

Your health care provider might then recommend:' metadata={'source': 'pcos-diagnosisAndTreatment.txt'}


In [15]:
###################################---Creating Embedding Model---#######################################

In [7]:
# Embedding model used to vectorize the chunks (and, later, the queries).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
###################################---Storing embeddings in vector database---#######################################

In [8]:
# Embed every chunk and build the FAISS index from the results.
vector_db = FAISS.from_documents(documents=chunks, embedding=embedding_model)

In [9]:
# Retriever over the FAISS index; k=3 is passed through as the search setting.
retriever = vector_db.as_retriever(search_kwargs=dict(k=3))

In [None]:
# Never hardcode the token in the notebook: reuse a value already present in the
# environment, otherwise ask for it interactively (input is not echoed and is
# not stored in the saved notebook).
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    entered_token = getpass("Hugging Face API token (leave blank to skip): ").strip()
    if entered_token:
        # Only export a real value; an empty or placeholder token is worse than none.
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = entered_token

In [12]:
!pip install -q transformers

In [None]:
###################################---Adding LLM---#######################################

In [14]:
# HuggingFacePipeline is taken from langchain_huggingface (already used for the
# embeddings); the langchain_classic.llms import emits a deprecation warning.
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

# Local text2text-generation pipeline backed by Flan-T5 (base); the output
# below shows it running on CPU.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512,
)

# Wrap the transformers pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=pipe)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
###################################---Adding Custom Prompts---#######################################

In [15]:
# Retrieval-augmented QA chain: the retriever supplies context to the LLM.
# return_source_documents=True is set so the result also carries the retrieved
# documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    return_source_documents=True,
)

In [16]:
# Ask a question through the QA chain. Calling the chain object directly
# (`qa_chain(query)`) is deprecated and emits a warning; .invoke() is the
# supported entry point. RetrievalQA reads the question from the "query" key.
query = "What are the main symptoms of PCOS?"
result = qa_chain.invoke({"query": query})

# The generated answer is stored under the "result" key of the chain output.
print("Answer:\n", result["result"])

  result = qa_chain(query)


Answer:
 irregular menstrual periods, high testosterone or related symptoms (like excess facial hair), or polycystic ovaries found on an ultrasound
