In [14]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file (if one exists) into the process
# environment. NOTE(review): no visible cell reads an environment variable, and
# `os` is imported but unused here — confirm whether this is still needed.
load_dotenv()

# Model name handed to both Ollama(...) and OllamaEmbeddings(...) below.
MODEL = "llama2"

In [15]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# The same MODEL name is used for the LLM and for the embeddings that are
# later passed to the vector store.
model = Ollama(model=MODEL)
embeddings = OllamaEmbeddings(model=MODEL)

In [16]:
from langchain_core.output_parsers import StrOutputParser

# Last stage of the chains built below (`... | model | parser`).
parser = StrOutputParser()

In [17]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}; the model is
# told to reply "I don't know" when the context does not contain the answer.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Sanity check: render the template with dummy values to see the final prompt text.
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [18]:

# Basic pipeline: fill the prompt, send it to the model, parse the model output.
# This `chain` is rebound by the retrieval-augmented version in a later cell.
chain = prompt | model | parser



In [21]:
# Show which inputs the chain expects; the output below lists `context` and `question`.
chain.input_schema.schema()

{'title': 'PromptInput',
 'type': 'object',
 'properties': {'context': {'title': 'Context', 'type': 'string'},
  'question': {'title': 'Question', 'type': 'string'}}}

In [19]:
from langchain_community.document_loaders import PyPDFLoader

# Reads "test_scraped.pdf", the file written by the scraper driver cell at the
# end of this notebook — on a fresh kernel that cell has to be run first.
loader = PyPDFLoader("test_scraped.pdf")
pages = loader.load_and_split()
# NOTE(review): a bare `pages` displays every Document in full; consider
# showing a slice or just the count to keep the output readable.
pages

[Document(page_content='Skip to main content\nBootstrap\nDocs\nExamples\nIcons\nThemes\nBlog\n \nGitHub\n \nTwitter\n \nOpen Collective\nBootstrap \nBootstrap \nv5.3 \n(switch to other versions)\nv5 releases\nLatest (5.3.x)\nv5.2.3\nv5.1.3\nv5.0.2\nPrevious releases\nv4.6.x\nv3.4.1\nv2.3.2\nAll versions\nToggle theme\nLight\nDark\nAuto\nNew! \nNever-Ending Support for Bootstrap \n \nBootstrap\nBuild fast, responsive sites with Bootstrap\nPowerful, extensible, and feature-packed frontend toolkit. Build and customize with Sass, utilize prebuilt grid system and components, and bring projects to life with powerful JavaScript plugins.\nnpm i bootstrap@5.3.3\nRead the docs\nCurrently \nv5.3.3 \n· \nDownload \n· \nAll releases\nGet started any way you want\nJump right into building with Bootstrap—use the CDN, install it via package manager, or download the source code.\nRead installation docs\nInstall via package manager\nInstall Bootstrap’s source Sass and JavaScript files via npm, RubyGems,

In [22]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Build an in-memory vector store from the PDF documents using the Ollama
# embeddings defined earlier.
vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)

In [23]:
# Retriever view of the vector store; used to fill the prompt's `context` in the next chain.
retriever = vectorstore.as_retriever()

In [24]:
from operator import itemgetter

# Retrieval-augmented pipeline. NOTE: this rebinds `chain`, replacing the
# simpler `prompt | model | parser` chain defined earlier.
chain = (
    {
        # The incoming "question" value is sent to the retriever; its result
        # fills the prompt's {context} placeholder.
        "context": itemgetter("question") | retriever,
        # The same "question" value is forwarded unchanged to {question}.
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

# Callers now supply only "question"; "context" is produced by the retriever.
chain.invoke({"question":"How to install Bootstrap ?"})

'To install Bootstrap, you have several options:\n\n1. **CDN (Content Delivery Network)**: You can use a CDN like jsDelivr to include Bootstrap\'s compiled CSS and JavaScript files in your project. This is the easiest way to get started with Bootstrap, as it doesn\'t require any installation or setup. Simply copy the link provided by the CDN and paste it into your HTML file.\n2. **npm (package manager)**: You can install Bootstrap via npm by running the following command in your terminal:\n```\nnpm i bootstrap@5.3.3\n```\nThis will download and install Bootstrap\'s source code, including its CSS and JavaScript files.\n3. **Gem (RubyGems)**: If you\'re using Ruby on Rails or another Gem-based framework, you can install Bootstrap via Gem by running the following command in your terminal:\n```\ngem install bootstrap\n```\n4. **Composer (PHP)**: If you\'re using PHP and Composer, you can install Bootstrap by adding the following line to your `composer.json` file:\n```json\n{\n    "require"

In [None]:
# Batch of questions to run through the retrieval chain.
# NOTE(review): these ask about a course/program, while the PDF loaded above is
# a scrape of getbootstrap.com — confirm they are meant for this document.
questions = [
    "What is the purpose of the course?",
    "How many hours of live sessions?",
    "How many coding assignments are there in the program?",
    "Is there a program certificate upon completion?",
    "What programming language will be used in the program?",
    "How much does the program cost?",
]

# One chain invocation per question; a blank line separates each Q/A pair.
for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

In [2]:
import pdfkit

# Point pdfkit at the wkhtmltopdf executable.
# NOTE(review): the same two statements are repeated in the scraper cell below,
# and the path is specific to one Windows machine — this cell looks redundant.
path_to_wkhtmltopdf = 'C:/Users/LENOVO/wkhtmltopdf/bin/wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)


In [25]:
import requests
from bs4 import BeautifulSoup
import pdfkit
from urllib.parse import urljoin, urlparse
from collections import deque

# Locate the wkhtmltopdf executable without tying the notebook to one machine:
#   1. the WKHTMLTOPDF_PATH environment variable, if set;
#   2. a `wkhtmltopdf` found on PATH;
#   3. the original hard-coded Windows location as a last resort.
import os
import shutil

path_to_wkhtmltopdf = (
    os.environ.get("WKHTMLTOPDF_PATH")
    or shutil.which("wkhtmltopdf")
    or 'C:/Users/LENOVO/wkhtmltopdf/bin/wkhtmltopdf.exe'
)
# `config` is read as a module-level global by save_to_pdf().
config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)

def fetch_page_content(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        The response text when the status code is 200; ``None`` for any other
        status code or when the request itself fails (connection error,
        timeout, malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole crawl: report it and
        # signal failure the same way a non-200 response does.
        print(f"Failed to fetch {url}: {exc}")
        return None

    if response.status_code == 200:
        return response.text
    return None

def is_valid_url(url, base_domain):
    """Tell whether ``url`` is an http(s) link on ``base_domain``.

    Args:
        url: Absolute URL to check.
        base_domain: Network location (``netloc``) the URL must match exactly.

    Returns:
        ``True`` only when the scheme is ``http`` or ``https`` and the URL's
        ``netloc`` equals ``base_domain``; ``False`` otherwise.
    """
    parts = urlparse(url)
    # Reject non-web schemes (mailto:, javascript:, ftp:, ...) up front.
    if parts.scheme not in ('http', 'https'):
        return False
    return parts.netloc == base_domain

def scrape_website(start_url, max_pages):
    """Breadth-first crawl of one site, collecting the HTML of each page.

    Starting from ``start_url``, pages are fetched with ``fetch_page_content``
    and every ``<a href>`` that stays on the start URL's domain (as judged by
    ``is_valid_url``) is queued for a later visit.

    Args:
        start_url: URL the crawl begins at; its ``netloc`` defines the domain
            the crawl is restricted to.
        max_pages: Stop once this many pages have been fetched successfully.

    Returns:
        A list with the prettified HTML of every successfully fetched page, in
        crawl order. Empty when nothing could be fetched.
    """
    # Function-scope import: the cell's import line only brings in
    # urljoin/urlparse.
    from urllib.parse import urldefrag

    base_domain = urlparse(start_url).netloc
    # "#fragment" variants address the same document, so drop fragments before
    # comparing or queueing URLs.
    start_url = urldefrag(start_url).url
    # Every URL ever queued. Recording at enqueue time keeps duplicates out of
    # the queue and ensures a URL whose fetch failed is not retried each time
    # another page links to it.
    seen_urls = {start_url}
    queue = deque([start_url])
    scraped_content = []

    while queue and len(scraped_content) < max_pages:
        current_url = queue.popleft()

        print(f"Scraping: {current_url}")
        content = fetch_page_content(current_url)
        if not content:
            continue

        soup = BeautifulSoup(content, 'html.parser')
        scraped_content.append(soup.prettify())

        # Find and enqueue same-domain links that have not been queued before.
        for link in soup.find_all('a', href=True):
            new_url = urldefrag(urljoin(current_url, link['href'])).url
            if new_url not in seen_urls and is_valid_url(new_url, base_domain):
                seen_urls.add(new_url)
                queue.append(new_url)

    return scraped_content

def save_to_pdf(content_list, output_file):
    """Write the given HTML snippets into a single PDF file.

    Each item of ``content_list`` is followed by an ``<hr>`` separator, the
    result is wrapped in ``<html><body>...</body></html>``, and the combined
    markup is passed to ``pdfkit.from_string`` together with the module-level
    ``config``.

    Args:
        content_list: Iterable of HTML strings, one per page.
        output_file: Path of the PDF to create.
    """
    sections = "".join(page_html + "<hr>" for page_html in content_list)
    html_content = "<html><body>" + sections + "</body></html>"

    pdfkit.from_string(html_content, output_file, configuration=config)


In [None]:
if __name__ == "__main__":
    # Crawl settings. `output_file` is the same PDF that the PyPDFLoader cell
    # earlier in the notebook reads.
    start_url = "https://getbootstrap.com/"  # Change this to the starting URL of the target website
    max_pages = 1  # Change this to the desired number of pages to scrape
    output_file = "test_scraped.pdf"

    scraped_content = scrape_website(start_url, max_pages)
    if not scraped_content:
        # Nothing was fetched, so there is nothing to write.
        print("Failed to scrape any content.")
    else:
        save_to_pdf(scraped_content, output_file)
        print(f"Scraped content has been saved to {output_file}")