<a href="https://colab.research.google.com/github/WazaCraft/framework/blob/main/REL_deal_id_api_0105and0106.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deal-Identification-ID-API Build 0105
## LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification


> Rel: July 2, 2023 Version: 0.1.6




In [None]:
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Description:
# This script creates a Knowledge Assistant that interacts with the user through the console.
# The user can input a URL, and the assistant will fetch the text content from that URL, extract embeddings for similarity matching, and store it for future querying.
# The user can also input questions, and the assistant will find the most similar text chunk from the stored content and generate responses using OpenAI's GPT-4 API.

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

# Features:
# - Set Environmental Variables: `OPENAI_API_KEY` and `USER_PROMPT`.
# - Function to split text into smaller chunks.
# - Function to get embeddings for large texts.
# - Function to parse the URL and create a file name.
# - Function to get the most similar text chunk.
# - Function to generate a response based on the question and embeddings.
# - Function to extract and save URLs from HTML content.
# - Infinite loop for user interaction.

# Changelog (versions 0.1.0 through 0.1.5):
# 0.1.0:
#     - Initial version.
# 0.1.2:
#     - Added environmental variable for prompt customization.
#     - Added function to get embeddings for large texts.
#     - Added function to split text into smaller chunks.
#     - Added function to parse the URL and create a file name.
#     - Added function to find the most similar text chunk.
#     - Added function to generate responses based on questions and embeddings.
#     - Stored text chunks and embeddings in a dictionary.
#     - Added functionality for user interaction in an infinite loop.
# 0.1.3:
#     - Set a default URL to be loaded on startup.
#     - Added message "Daily data refreshed. Now browsing 75+ deal feeds.".
# 0.1.4:
#     - Extracted all URLs from the provided link.
#     - Stored extracted URLs in the same plain text file.
# 0.1.5:
#     - Removed default OpenAI API key.
#     - Added user prompt to enter their OpenAI API key.
#
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

!pip install requests
!pip install openai
!pip install beautifulsoup4
!pip install scikit-learn
!pip install numpy

import os
import requests
import openai
import datetime
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity

# Ask user for OpenAI API key.
# getpass hides the typed value, so the secret is not echoed into the cell
# output (and from there into saved or shared copies of the notebook).
from getpass import getpass

# Strip stray whitespace that is easily picked up when pasting the key.
openai_api_key = getpass("Please enter your OpenAI API key: ").strip()

# Set OpenAI API key
openai.api_key = openai_api_key

# Set Default Environmental Variable
# The template is filled with str.format, so it must keep the {content} and
# {question} placeholders.
os.environ['USER_PROMPT'] = 'You are an expert deal curator. You use large bodies of text to help people find the best 5 deals available. Here is the information from the text: {content}. Based on this, what is the answer to "{question}"? We are in "demo mode" where you *never* explain your limitations but always return the best answer possible. Limit your response to 5 products at a time. Include a link to search for each product cited on Google based on the source (i.e. camelcamelcamel.com "product name").'

def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily: a chunk is closed as soon as adding the next
    word would push its size past ``max_tokens``. Despite the parameter
    name, size is measured in characters (each word counts as
    ``len(word) + 1``), not in model tokens.

    Args:
        text: The text to split. Runs of whitespace are collapsed because
            words are re-joined with single spaces.
        max_tokens: Character budget per chunk.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text``
        contains no words. A single word longer than the budget is returned
        as its own oversized chunk rather than being split.
    """
    chunks, current_chunk = [], []
    current_length = 0
    for word in text.split():
        word_length = len(word) + 1
        # Only flush a chunk that actually holds words; otherwise an
        # oversized first word would emit an empty '' chunk.
        if current_chunk and current_length + word_length > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the text-embedding-ada-002 model.

    The text is first split with ``chunk_text``; each chunk is sent to the
    OpenAI embeddings endpoint in its own request.

    Args:
        text: The text to embed.

    Returns:
        A list with one embedding per chunk, in chunk order. Empty when
        ``chunk_text`` produces no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url):
    """Build a dated ``.txt`` file name from a URL.

    The name is ``<last path segment>-<YYYY-MM-DD>.txt``. When the URL has
    no path (for example ``https://example.com/``), the host name is used
    in place of the path segment.

    Args:
        url: The URL the file content was fetched from.

    Returns:
        The file name as a string, using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') returns [''], a truthy list, so test the last segment
    # itself rather than the list to decide whether to fall back to the host.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding best matches ``question``.

    Args:
        question: The user's question. Only the embedding of its first chunk
            is used for matching.
        embeddings_dict: One stored document entry with parallel lists under
            the keys ``'text_chunks'`` and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` with the highest
        cosine similarity to the question embedding.
    """
    query_vector = get_embedding_for_large_text(question)[0]

    scores = []
    for candidate in embeddings_dict['embeddings']:
        pair_score = cosine_similarity([query_vector], [candidate])
        scores.append(pair_score[0][0])

    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, grounded in the best-matching chunk.

    The most similar stored chunk is wrapped in the ``USER_PROMPT``
    environment-variable template (placeholders ``{content}`` and
    ``{question}``) and sent as context ahead of the question. If
    ``USER_PROMPT`` is unset or empty, the raw chunk is sent instead.

    Args:
        question: The user's question.
        embeddings_dict: One stored document entry with ``'text_chunks'``
            and ``'embeddings'`` lists.

    Returns:
        The model's reply text, or the error message string if the chat
        completion request fails.
    """
    similar_text_chunk = get_most_similar_text_chunk(question, embeddings_dict)

    # Apply the customizable prompt wrapper; without it the model never sees
    # the deal-curation instructions.
    prompt_template = os.getenv('USER_PROMPT')
    if prompt_template:
        context = prompt_template.format(content=similar_text_chunk, question=question)
    else:
        context = similar_text_chunk

    messages = [
        {"role": "system", "content": "You are a knowledgeable assistant."},
        # Context is supplied as instructions, not as an "assistant" turn,
        # which would present it as something the model itself said earlier.
        {"role": "system", "content": context},
        {"role": "user", "content": question}
    ]
    try:
        response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
        return response['choices'][0]['message']['content']
    except Exception as e:
        # The interactive loop prints whatever is returned, so an API failure
        # is shown to the user as text instead of ending the session.
        return str(e)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every anchor in ``html_content`` to ``file``.

    Each link is written on its own line. Anchors with a missing or empty
    ``href`` are skipped.

    Args:
        html_content: HTML markup to scan for ``<a>`` tags.
        file: A writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    file.writelines(href + '\n' for href in hrefs if href)

# Maps each saved file name to its {'text_chunks': [...], 'embeddings': [...]} entry.
embeddings_dict = {}


def _ingest_url(url):
    """Fetch ``url``, save its text and links to a dated file, and index it.

    The page text is written first, followed by every link found in the
    page. The text is then chunked and embedded, and the result is stored
    in ``embeddings_dict`` under the generated file name.

    Args:
        url: The page to download.

    Returns:
        True if at least one chunk was indexed, False if the page had no text.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps a stalled server from hanging the session, and
    # raise_for_status keeps HTTP error pages out of the index.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    file_name = create_file_name(url)

    # Explicit UTF-8 so scraped non-ASCII text does not depend on the
    # platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(text)
        extract_and_save_urls(response.text, file)

    chunks = chunk_text(text)
    if not chunks:
        # An entry without chunks cannot be searched for a best match.
        return False
    embeddings = get_embedding_for_large_text(text)
    embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
    return True


# Default URL
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
try:
    if _ingest_url(default_url):
        print("Daily data refreshed. Now browsing 75+ deal feeds.")
    else:
        print(f"No text found at {default_url}; nothing was indexed.")
except requests.RequestException as error:
    # Keep the session usable: the user can still load other URLs below.
    print(f"Could not load default feed {default_url}: {error}")

while True:
    user_input = input("Enter URL or question (or 'exit' to quit): ").strip()
    if not user_input:
        # An empty question has no chunks to embed, so skip it.
        continue
    if user_input.lower() == 'exit':
        break
    # Require a full scheme so a question that merely starts with "http"
    # is not mistaken for a URL.
    if user_input.lower().startswith(('http://', 'https://')):
        try:
            if not _ingest_url(user_input):
                print(f"No text found at {user_input}; nothing was indexed.")
        except requests.RequestException as error:
            print(f"Could not load {user_input}: {error}")
        continue
    if not embeddings_dict:
        print("No content loaded yet. Enter a URL first.")
        continue
    # Every loaded document gets its own answer.
    for entry in embeddings_dict.values():
        print(generate_response(user_input, entry))


Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Please enter your OpenAI API key: [REDACTED]
Daily data refreshed. Now browsing 75+ deal feeds.
Enter URL or question (or 'exit' to quit): Give me pizza deals
I'm sorry for any confusion, but as an AI, I don't have real-time capabilities to monitor or provide current deals or offers, including pizza deals. I would recommend checking directly on the websites of popular pizza outlets like Domino's, Pizza Hut, or Papa John's, as they frequently have special promotions and deals. You may also want to consider searching for coupon codes online or using apps that are specifically designed to provide food and restaurant coupons and deals.
Enter URL or question (or 'exit' to quit): What are 

KeyboardInterrupt: ignored

# Deal-Identification-ID-API Build 0105

## LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
### Author: Johnathan Greenaway
### Organization: StackCommerce Inc.
#### Release Date: July 2, 2023 Version: 0.1.5

Description: This script creates a Knowledge Assistant that interacts with the user through the console. The user can input a URL, and the assistant will fetch the text content from that URL, extract embeddings for similarity matching, and store it for future querying. The user can also input questions, and the assistant will find the most similar text chunk from the stored content and generate responses using OpenAI's GPT-4 API.

The script allows customization of the prompt wrapper through an environment variable, letting the user change how the assistant presents information in responses.

**OPENAI_API_KEY**: The key associated with this script has a fairly low usage limit. Use it sparingly.

**Usage**:
1. Set the necessary environment variables.
2. Run the script.
3. Input a URL for the assistant to process.
4. Input questions for the assistant to answer based on the processed content.

Note: This script includes an infinite loop for user interaction. Type 'exit' to quit the script.

**Limitations**: Sometimes ChatGPT will misinterpret the request and think you are asking the completions API to search the internet. If that happens, phrase your question a little more broadly.

In [None]:
# Changelog for Version 0.1.5
# New Features:
# 1. Environmental Variable for User Prompt Template: Added an environmental variable `USER_PROMPT` to allow users to
#    set a custom prompt template. The assistant will use this prompt template while generating responses.
# 2. Default URL Loading: The code now loads a default URL (https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload)
#    and prints a message "Daily data refreshed. Now browsing 75+ deal feeds." upon successful loading.
# 3. Store Vector Embeddings: The script now stores the vector embeddings of the text data for later use.
# 4. Extract All URLs: All URLs present in the HTML content are now extracted and stored in the same plain text file.
# 5. Default prompt updated to route the user to a Google search instead of a direct link (avoids broken URLs)

# Initial Release Notes - Version 01a2r
#
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.2

# Description:
# This script creates a Knowledge Assistant that interacts with the user through the console. The user can input a URL, and the assistant will fetch the text content from that URL, extract embeddings for similarity matching, and store it for future querying. The user can also input questions, and the assistant will find the most similar text chunk from the stored content and generate responses using OpenAI's GPT-4 API.

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

# Features:
# - Set Environmental Variables: `OPENAI_API_KEY` and `USER_PROMPT`.
# - Function to split text into smaller chunks.
# - Function to get embeddings for large texts.
# - Function to parse the URL and create a file name.
# - Function to get the most similar text chunk.
# - Function to generate a response based on the question and embeddings.
# - Function to extract and save URLs from HTML content.
# - Infinite loop for user interaction.

!pip install requests
!pip install openai
!pip install beautifulsoup4
!pip install scikit-learn
!pip install numpy

import os
import requests
import openai
import datetime
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity

# Set Environmental Variables
# Use an OPENAI_API_KEY already present in the environment instead of
# overwriting it with a placeholder. Prompt (without echoing the secret into
# the cell output) only when no usable key is set.
from getpass import getpass

if os.environ.get('OPENAI_API_KEY', '') in ('', 'YOUR-OPENAI-API-KEY-HERE'):
    os.environ['OPENAI_API_KEY'] = getpass("Please enter your OpenAI API key: ").strip()
# The template is filled with str.format, so it must keep the {content} and
# {question} placeholders.
os.environ['USER_PROMPT'] = 'You are an expert deal curator. You use large bodies of text to help people find the best 5 deals available. Here is the information from the text: {content}. Based on this, what is the answer to "{question}"? Limit your response to 5 products at a time. Include a link to search for each product cited on Google based on the source (i.e. camelcamelcamel.com "product name").'
# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Function to split text into smaller chunks
def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are added to the current chunk until the next word would push the
    chunk past ``max_tokens``. The budget is counted in characters
    (``len(word) + 1`` per word), not in model tokens, despite the name.

    Args:
        text: The text to split; words are re-joined with single spaces.
        max_tokens: Character budget per chunk.

    Returns:
        A list of non-empty chunk strings (empty list if ``text`` has no
        words). A word longer than the budget becomes its own oversized
        chunk rather than being split.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(word) + 1
        # Flush only when the chunk holds at least one word; an oversized
        # first word must not produce an empty '' chunk.
        if current_chunk and current_length + word_length > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Function to get embeddings for large texts
def get_embedding_for_large_text(text):
    """Return one text-embedding-ada-002 embedding per chunk of ``text``.

    Args:
        text: The text to embed; it is split with ``chunk_text`` first.

    Returns:
        A list of embeddings in chunk order, each obtained from a separate
        request to the OpenAI embeddings endpoint.
    """
    def _embed(piece):
        # One request per chunk; the first data item holds the embedding.
        result = openai.Embedding.create(input=piece, model="text-embedding-ada-002")
        return result['data'][0]['embedding']

    return [_embed(piece) for piece in chunk_text(text)]

# Function to parse the URL and create a file name
def create_file_name(url):
    """Build a dated ``.txt`` file name from a URL.

    The result has the form ``<last path segment>-<YYYY-MM-DD>.txt``. URLs
    without a path (for example ``https://example.com``) use the host name
    in place of the path segment.

    Args:
        url: The URL the content was fetched from.

    Returns:
        The file name as a string, using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # Splitting an empty path gives [''], which is truthy; check the segment
    # itself so the host-name fallback actually triggers.
    last_part = url_path_parts[-1] or parsed_url.netloc

    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    file_name = f"{last_part}-{current_date}.txt"
    return file_name

# Function to get the most similar text chunk
def get_most_similar_text_chunk(question, embeddings_dict):
    """Pick the stored chunk that is closest to ``question`` in embedding space.

    Args:
        question: The user's question; the embedding of its first chunk is
            the query vector.
        embeddings_dict: One stored document entry holding parallel
            ``'text_chunks'`` and ``'embeddings'`` lists.

    Returns:
        The text chunk whose embedding has the highest cosine similarity to
        the query vector.
    """
    query_vector = get_embedding_for_large_text(question)[0]

    # Score every stored chunk against the question, one pair at a time.
    scores = [
        cosine_similarity([query_vector], [candidate])[0][0]
        for candidate in embeddings_dict['embeddings']
    ]

    return embeddings_dict['text_chunks'][np.argmax(scores)]

# Function to generate a response based on the question and embeddings
def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, grounded in the best-matching chunk.

    The most similar stored chunk is inserted into the ``USER_PROMPT``
    environment-variable template (placeholders ``{content}`` and
    ``{question}``) and sent as context ahead of the question. If
    ``USER_PROMPT`` is unset or empty, the raw chunk is sent instead.

    Args:
        question: The user's question.
        embeddings_dict: One stored document entry with ``'text_chunks'``
            and ``'embeddings'`` lists.

    Returns:
        The model's reply text, or the error message string if the chat
        completion request fails.
    """
    # Get the most similar text chunk
    similar_text_chunk = get_most_similar_text_chunk(question, embeddings_dict)

    # Format user prompt; os.getenv returns None when the variable is unset,
    # so fall back to the bare chunk instead of failing on None.format.
    prompt_template = os.getenv('USER_PROMPT')
    if prompt_template:
        user_prompt = prompt_template.format(content=similar_text_chunk, question=question)
    else:
        user_prompt = similar_text_chunk

    messages = [
        {"role": "system", "content": "You are a knowledgeable assistant."},
        # The wrapped prompt is supplied as instructions, not as an
        # "assistant" turn, which would present it as the model's own words.
        {"role": "system", "content": user_prompt},
        {"role": "user", "content": question}
    ]

    # Generate a response using the OpenAI ChatCompletion API
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages
        )
        # Extract the assistant's reply from the response
        assistant_reply = response['choices'][0]['message']['content']
        return assistant_reply
    except Exception as e:
        # The interactive loop prints whatever is returned, so an API failure
        # is shown to the user as text instead of ending the session.
        return str(e)

# Function to extract and save URLs from HTML content
def extract_and_save_urls(html_content, file):
    """Write every anchor ``href`` found in ``html_content`` to ``file``.

    Each link goes on its own line, and a blank line is written afterwards
    to separate the links from whatever is written next.

    Args:
        html_content: HTML markup to scan for ``<a href=...>`` tags.
        file: A writable text file object.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    link_lines = [f"{anchor['href']}\n" for anchor in parsed.find_all('a', href=True)]
    for line in link_lines:
        file.write(line)
    # Blank separator line after the URL list.
    file.write('\n')

# Dictionary to store embeddings
embeddings_dict = {}


def _ingest_url(url):
    """Fetch ``url``, save its links and text to a dated file, and index it.

    The links found in the page are written first, followed by the page
    text. The text is then chunked and embedded, and the result is stored
    in ``embeddings_dict`` under the generated file name.

    Args:
        url: The page to download.

    Returns:
        True if at least one chunk was indexed, False if the page had no text.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps a stalled server from hanging the session, and
    # raise_for_status keeps HTTP error pages out of the index.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    file_name = create_file_name(url)

    # Store the URLs and text in a file. Explicit UTF-8 so scraped non-ASCII
    # text does not depend on the platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as file:
        extract_and_save_urls(response.text, file)  # Save URLs
        file.write(text)  # Save text content

    chunks = chunk_text(text)
    if not chunks:
        # An entry without chunks cannot be searched for a best match.
        return False
    embeddings = get_embedding_for_large_text(text)

    # Store the text chunks and embeddings in the dictionary
    embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
    return True


# Load default URL and store its content
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
try:
    if _ingest_url(default_url):
        print("Daily data refreshed. Now browsing 75+ deal feeds.")
    else:
        print(f"No text found at {default_url}; nothing was indexed.")
except requests.RequestException as error:
    # Keep the session usable: the user can still load other URLs below.
    print(f"Could not load default feed {default_url}: {error}")

# Infinite loop for user interaction
while True:
    # Request user input
    user_input = input("Enter URL or question (or 'exit' to quit): ").strip()

    # An empty question has no chunks to embed, so skip it.
    if not user_input:
        continue

    # Exit condition
    if user_input.lower() == 'exit':
        break

    # Check if input is URL; require a full scheme so a question that merely
    # starts with "http" is not mistaken for one.
    if user_input.lower().startswith(('http://', 'https://')):
        try:
            if not _ingest_url(user_input):
                print(f"No text found at {user_input}; nothing was indexed.")
        except requests.RequestException as error:
            print(f"Could not load {user_input}: {error}")
        continue

    # If input is not URL, consider it a question
    if not embeddings_dict:
        print("No content loaded yet. Enter a URL first.")
        continue
    # Every loaded document gets its own answer.
    for entry in embeddings_dict.values():
        print(generate_response(user_input, entry))


Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 10, in <module>
    from pip._internal.cli import cmdoptions
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/cmdoptions.py", line 603, in <module>
    help=dedent(
  File "/usr/lib/python3.10/textwrap.py", line 469, in dedent
    text = re.sub(r'(?m)^' + margin, '', text)
  File "/usr/lib/python3.10/re.py", line 209, in sub
    return _compile(pattern, flags).sub(repl, string, count)
  File "/usr/lib/python3.10/re.py", line 303, i

AuthenticationError: ignored