In [25]:
!pip install langchain
!pip install langchain_community
!pip install langchain-google-genai



In [43]:
import os
from getpass import getpass

from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

In [None]:
# 1. Load the PDF
# The path is relative to the notebook's working directory; note that the file
# name really does contain a space after "Introduction_".
pdf_path = "./Ch.01_Introduction_ to_computers.pdf"
pdf_loader = PyPDFLoader(pdf_path)
pdf_documents = pdf_loader.load()

# Display the first loaded Document (metadata + page_content) as the cell output.
pdf_documents[0]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2015-02-25T19:41:04+02:00', 'author': 'just', 'moddate': '2015-02-25T19:41:04+02:00', 'source': '/content/sample_data/Ch.01_Introduction_ to_computers.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='1 \nChapter One \nIntroduction to Computer \n \nComputer \nA computer is an electronic device, operating under the control of instructions stored \nin its own memory that can accept data (input), process the data according to specified \nrules, produce information (output), and store the information for future use1. \n \nFunctionalities of a computer2  \nAny digital computer carries out five functions in gross terms:  \n \n \n \n \n \n \n \n \n \nComputer Components \nAny kind of computers consists of HARDWARE AND SOFTWARE. \n \nHardware: \nComputer hardware is the collection of  physical elements that constitutes a computer \nsystem. Computer hardware refers to t

In [34]:
# 2. Split into chunks
# Splitter settings are gathered in one place so they are easy to tune;
# they are passed unchanged to RecursiveCharacterTextSplitter.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents; the result is what gets embedded and indexed later.
chunks = text_splitter.split_documents(pdf_documents)

# Display the first chunk as the cell output.
chunks[0]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2015-02-25T19:41:04+02:00', 'author': 'just', 'moddate': '2015-02-25T19:41:04+02:00', 'source': '/content/sample_data/Ch.01_Introduction_ to_computers.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='1 \nChapter One \nIntroduction to Computer \n \nComputer \nA computer is an electronic device, operating under the control of instructions stored \nin its own memory that can accept data (input), process the data according to specified \nrules, produce information (output), and store the information for future use1. \n \nFunctionalities of a computer2  \nAny digital computer carries out five functions in gross terms:  \n \n \n \n \n \n \n \n \n \nComputer Components \nAny kind of computers consists of HARDWARE AND SOFTWARE. \n \nHardware: \nComputer hardware is the collection of  physical elements that constitutes a computer \nsystem. Computer hardware refers to t

In [None]:
# 3. Generate embeddings using Gemini
# Never hardcode the key in the notebook: read it from the GOOGLE_API_KEY
# environment variable, and fall back to a non-echoing prompt when it is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
if not GOOGLE_API_KEY:
    # Fail here with a clear message instead of an opaque auth error from the API.
    raise ValueError("A Google API key is required to call the Gemini embedding API.")

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=GOOGLE_API_KEY)

# Embed every chunk once up front; this doubles as an end-to-end check that the
# key and model name are accepted before the vector store is built.
embedded_vectors = embeddings.embed_documents([doc.page_content for doc in chunks])
assert len(embedded_vectors) == len(chunks), "expected one embedding vector per chunk"

In [None]:
# 4. Store embeddings in ChromaDB (vector database)
persist_dir = "./chroma_data"

# Give every chunk a stable id derived from its source and position.
# Without explicit ids the store assigns fresh random ones, so each re-run of
# this cell would append the same chunks again to the persisted collection and
# similarity_search would start returning duplicates. With stable ids a re-run
# overwrites the existing entries instead.
# NOTE: entries written by earlier runs (with random ids) are not removed by
# this; delete the ./chroma_data directory once to start from a clean store.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-chunk-{index}"
    for index, doc in enumerate(chunks)
]

vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_dir  # Optional: to save to disk
)

In [52]:
# 5. Perform a similarity search (example)
# Retrieve the chunks closest to the query and merge their text into a single
# context string, separated by blank lines.
query = "Types of softwares"
top_k = 5
result_docs = vectordb.similarity_search(query, k=top_k)
context_text = "\n\n".join(doc.page_content for doc in result_docs)

# Prompt that restricts the model to the retrieved context.
prompt_for_gemini = """
You are an intelligent assistant tasked with answering questions based on provided context.

**Context:**
{context}

**Question:**
{question}

**Instructions:**
Using only the provided context, answer the question accurately and comprehensively. If the information needed to answer the question is not present in the context, state that the answer cannot be found in the provided information.
"""

prompt_template = PromptTemplate(
    template=prompt_for_gemini,
    input_variables=["context", "question"],
)

# Chat model that produces the final answer.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-8b",
    temperature=0.7,
    google_api_key=GOOGLE_API_KEY,
)

# Fill the template placeholders with the retrieved context and the question.
input_data = dict(context=context_text, question=query)
formatted_prompt = prompt_template.format(**input_data)

# Send the filled-in prompt to Gemini and show the text of its reply.
response = llm.invoke(formatted_prompt)
print(response.content)

Software is categorized into two major types:

A. **System software:**  This provides the basic, non-task-specific functions of the computer.  It manages hardware components, allowing other software and users to interact without needing to know low-level details.  System software includes the operating system, disk formatters, file managers, display managers, text editors, user authentication tools, and networking and device control software.

B. **Application software:**  This is used by users to accomplish specific tasks beyond simply running the computer system.  Application software can be a single program, a small collection of programs (a software package), a larger collection of related programs (a software suite), or a software system.  Examples include image viewers, spreadsheets, word processors, database management systems, presentation software (like PowerPoint), and graphics software (like Photoshop).
