In [259]:
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
)

In [260]:
from qvdb import VectorDB

qvdb = VectorDB(is_persistent=False)

In [261]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress warnings about insecure HTTPS requests.
warnings.simplefilter('ignore', InsecureRequestWarning)

def extract_url_content(url: str) -> List[str]:
        """
        Extracts the content from the given URL.

        Args:
            url (str): The URL to extract content from.

        Returns:
            str: The extracted content.
        """
        response = requests.get(url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = ' '.join([p.text for p in soup.find_all('p')])  # Extract text from <p> tags

        # We need to split it up into smaller pieces
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        text_splits = text_splitter.split_text(text)

        return text_splits

In [262]:
from googlesearch import search

def search_and_store(qvdb: VectorDB, query: str, num_results: int = 5) -> None:
        """
        Performs a Google Search for documents related to the given query,
        extracts their content, and stores them in the vector store.

        Args:
            query (str): The query to search for documents.
            num_results (int): The number of URLs to retrieve. Defaults to 5.

        Returns:
            None
        """
        for index, url in enumerate(search(query, num_results)):
            print(f"<info> ({index}) Extracting content from: {url}")
            text_splits = extract_url_content(url=url)
            for jindex, text in enumerate(text_splits):
                qvdb.add(
                    documents=[text],
                    metadatas=[{"source": url}],
                    ids=[f"id{index}.{jindex}"]
                )

In [263]:
#model = 'mistral'
model= 'llama3'
system="""
You are a helpful AI assistant that helps answer questions based on documents and sources.
You may combine your own knowledge with the information in the documents to answer the questions.
If you make use of the provided documents then add the sources to the answer.
Try to keep the column width of the response to 72 characters.

Example of the documents:

SOURCE: https://www.example.com
DOCUMENT: This is an example document.

"""
template = """
Here is the question: {question}

Here are the documents and the corresponding sources:
{documents}

"""

In [264]:
#question = "What makes Virus different from Bacteria?"
#question = "How do I print just a few lines of a file using the sed command?"
#question = "Elaborate on who came up with the Periodical System?"
#question = "With the increasing problem of antibiotic resistance, what are the alternatives to antibiotics?"
#question = "Who invented the Erlang programming language and what was the reason behind it?"
question = "What are the main differences between the GPT and the BERT models?"

In [265]:
search_and_store(qvdb=qvdb, query=question, num_results=5)
results = qvdb.query(question, num_results=5)
print(results)



<info> (0) Extracting content from: https://blog.invgate.com/gpt-3-vs-bert
<info> (1) Extracting content from: https://softteco.com/blog/bert-vs-chatgpt
<info> (2) Extracting content from: https://medium.com/@prudhvithtavva/bert-vs-gpt-a-tale-of-two-transformers-that-revolutionized-nlp-11fff8e61984
<info> (3) Extracting content from: https://www.baeldung.com/cs/bert-vs-gpt-3-architecture
<info> (4) Extracting content from: https://medium.com/@reyhaneh.esmailbeigi/bert-gpt-and-bart-a-short-comparison-5d6a57175fca
<info> (5) Extracting content from: https://symbl.ai/developers/blog/gpt-3-versus-bert-a-high-level-comparison/
<info> (6) Extracting content from: https://www.quora.com/What-are-the-differences-between-Googles-Bert-and-OpenAIs-GPT-2-artificial-intelligence-models
{'ids': [['id0.4', 'id0.5', 'id0.3', 'id1.17', 'id1.18']], 'distances': [[0.17108333110809326, 0.19292718172073364, 0.2135852575302124, 0.21542876958847046, 0.21935486793518066]], 'metadatas': [[{'source': 'https://bl

In [266]:
docs=[]
for d, s in zip(results['documents'][0], results['metadatas'][0]):
    docs.append(f"SOURCE: {s['source']}\nDOCUMENT: {d}\n")
documents = ' '.join(docs)

In [267]:
#print(documents)

In [268]:
import ollama

prompt = template.format(question=question, documents=documents)
output = ollama.generate(model=model, system=system, prompt=prompt, stream=False)
response = output['response'].strip()

In [269]:
print(response)

The main differences between GPT (Generative Pre-trained Transformer) and BERT (Bidirectional Encoder Representations from Transformers) models are:

* Training datasets: GPT-3 was trained on 45TB of data, while BERT was trained on 3TB of data.
* Architecture: GPT-3 is an autoregressive model, while BERT is bidirectional. GPT-3 only considers the left context when making predictions, whereas BERT takes into account both left and right context.
* Size: GPT-3 has 1.5 billion parameters, while BERT has 340 million parameters. GPT-3 is significantly larger than its predecessor due to its much more extensive training dataset size.

Sources:
https://blog.invgate.com/gpt-3-vs-bert
https://softteco.com/blog/bert-vs-chatgpt


In [270]:
qvdb.reset()