<a href="https://colab.research.google.com/github/WazaCraft/framework/blob/main/REL_deal_id_api_0106.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deal-Identification-ID-API 0.1.6
## LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification


> Rel: July 2, 2023 Version: 0.1.6




In [None]:
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Description:
# This script creates a Knowledge Assistant that interacts with the user through the console.
# The user can input a URL; the assistant fetches the text content from that URL, computes embeddings for similarity matching, and stores them for later querying.
# The user can also input questions; the assistant finds the most similar text chunk in the stored content and generates a response using OpenAI's GPT-4 API.

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

# Features:
# - Set Environmental Variables: `OPENAI_API_KEY` and `USER_PROMPT`.
# - Function to split text into smaller chunks.
# - Function to get embeddings for large texts.
# - Function to parse the URL and create a file name.
# - Function to get the most similar text chunk.
# - Function to generate a response based on the question and embeddings.
# - Function to extract and save URLs from HTML content.
# - Infinite loop for user interaction.

# Changelog (through version 0.1.5):
# 0.1.0:
#     - Initial version.
# 0.1.2:
#     - Added environmental variable for prompt customization.
#     - Added function to get embeddings for large texts.
#     - Added function to split text into smaller chunks.
#     - Added function to parse the URL and create a file name.
#     - Added function to find the most similar text chunk.
#     - Added function to generate responses based on questions and embeddings.
#     - Stored text chunks and embeddings in a dictionary.
#     - Added functionality for user interaction in an infinite loop.
# 0.1.3:
#     - Set a default URL to be loaded on startup.
#     - Added message "Daily data refreshed. Now browsing 75+ deal feeds.".
# 0.1.4:
#     - Extracted all URLs from the provided link.
#     - Stored extracted URLs in the same plain text file.
# 0.1.5:
#     - Removed default OpenAI API key.
#     - Added user prompt to enter their OpenAI API key.
#
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

!pip install requests
!pip install openai
!pip install beautifulsoup4
!pip install scikit-learn
!pip install numpy

import datetime
import getpass
import os
from urllib.parse import urlparse

import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

# Use the OPENAI_API_KEY environment variable when it is set; otherwise ask the
# user. getpass is used instead of input() so the secret key is not echoed into
# the console or saved in the notebook output.
openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass("Please enter your OpenAI API key: ")
# Pasted keys often carry stray whitespace or a trailing newline.
openai_api_key = openai_api_key.strip()

# Set OpenAI API key
openai.api_key = openai_api_key

# Set the default prompt template. setdefault keeps any USER_PROMPT the caller
# already exported, so the prompt stays customizable; the template's {content}
# and {question} placeholders mark where the matched text and the user's
# question belong.
os.environ.setdefault('USER_PROMPT', 'You are an expert deal curator. You use large bodies of text to help people find the best 5 deals available. Here is the information from the text: {content}. Based on this, what is the answer to "{question}"? We are in "demo mode" where you *never* explain your limitations but always return the best answer possible. Limit your response to 5 products at a time. Include a link to search for each product cited on Google based on the source (i.e. camelcamelcamel.com "product name").')

def chunk_text(text, max_tokens=8000):
    """Split *text* into space-joined chunks of bounded size.

    Words are taken from ``text.split()`` and packed greedily. Despite the
    parameter name, the budget is counted in characters, not model tokens:
    every word costs ``len(word) + 1`` (the word plus one separator).

    Args:
        text: The text to split.
        max_tokens: Character budget per chunk (name kept for compatibility).

    Returns:
        A list of non-empty chunk strings; empty when *text* has no words.
        A single word larger than the budget is kept whole in its own chunk.
    """
    chunks, current_chunk = [], []
    current_length = 0
    for word in text.split():
        word_cost = len(word) + 1
        # Flush only when something has been collected. Without this guard an
        # oversized word arriving on an empty buffer would emit an empty-string
        # chunk, which is useless for embedding.
        if current_chunk and current_length + word_cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Return one embedding per chunk of *text*.

    The text is split with ``chunk_text`` and each chunk is sent to the
    OpenAI embedding endpoint (model ``text-embedding-ada-002``) in its own
    request. The result list is in chunk order.
    """
    def _embed(piece):
        # One API call per chunk; the embedding is the first data entry.
        reply = openai.Embedding.create(input=piece, model="text-embedding-ada-002")
        return reply['data'][0]['embedding']

    return [_embed(piece) for piece in chunk_text(text)]

def create_file_name(url):
    """Build a dated ``.txt`` file name for the content fetched from *url*.

    The name is ``<last path segment>-<YYYY-MM-DD>.txt`` using today's local
    date. When the URL has no path (``https://example.com`` or a bare ``/``),
    the host name is used in place of the path segment.

    Args:
        url: The URL the content was fetched from.

    Returns:
        The file name as a string.
    """
    parsed_url = urlparse(url)
    last_part = parsed_url.path.strip('/').split('/')[-1]
    # str.split always returns at least one element (''), so emptiness has to
    # be tested on the segment itself for the host-name fallback to trigger.
    if not last_part:
        last_part = parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding best matches *question*.

    *embeddings_dict* is read through its ``'embeddings'`` and
    ``'text_chunks'`` keys; the chunk at the index of the highest cosine
    similarity is returned. Only the first embedding of the question is used.
    """
    query_vector = get_embedding_for_large_text(question)[0]

    scores = []
    for chunk_vector in embeddings_dict['embeddings']:
        # cosine_similarity returns a 1x1 matrix for a single pair of vectors.
        pairwise = cosine_similarity([query_vector], [chunk_vector])
        scores.append(pairwise[0][0])

    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer *question* with GPT-4, grounded in the best-matching text chunk.

    The chunk is chosen with ``get_most_similar_text_chunk``. When the
    ``USER_PROMPT`` environment variable holds a template, its ``{content}``
    and ``{question}`` placeholders are filled with the chunk and the question
    and the result is sent as the user message. When the variable is unset or
    empty, the chunk is sent as a prior assistant message followed by the raw
    question.

    Args:
        question: The user's question.
        embeddings_dict: Mapping with ``'text_chunks'`` and ``'embeddings'``
            entries for one ingested page.

    Returns:
        The model's reply text, or the error message as a string when the
        chat completion call raises.
    """
    similar_text_chunk = get_most_similar_text_chunk(question, embeddings_dict)

    prompt_template = os.environ.get('USER_PROMPT')
    if prompt_template:
        # Plain replacement instead of str.format: a customized template may
        # contain other literal braces, which str.format would reject.
        prompt = (
            prompt_template
            .replace('{content}', similar_text_chunk)
            .replace('{question}', question)
        )
        messages = [
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "user", "content": prompt},
        ]
    else:
        messages = [
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "assistant", "content": similar_text_chunk},
            {"role": "user", "content": question},
        ]

    try:
        response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
        return response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort by design: the caller prints whatever comes back, so an
        # API failure is reported as text instead of ending the session.
        return str(e)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in *html_content* to *file*.

    Each link goes on its own line. Anchors with a missing or empty ``href``
    are skipped. *file* only needs a ``write`` method.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

# Maps each saved file name to the text chunks and chunk embeddings of one
# ingested page: {'text_chunks': [...], 'embeddings': [...]}.
embeddings_dict = {}

# Seconds to wait on a page fetch, so a stalled server cannot hang the session.
REQUEST_TIMEOUT_SECONDS = 30


def _ingest_url(url):
    """Fetch *url*, save its text and links to disk, and index its embeddings.

    The page text and then every link found in the page are written to the
    file named by ``create_file_name(url)``. The text chunks and their
    embeddings are stored in ``embeddings_dict`` under that file name.

    Args:
        url: The page to fetch.

    Returns:
        The name of the file the content was saved to.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: When the page contains no text to index.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Do not index the body of an error page as if it were deal content.
    response.raise_for_status()
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    file_name = create_file_name(url)

    # Explicit UTF-8: scraped text routinely contains characters that a
    # platform-default encoding cannot represent.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(text)
        extract_and_save_urls(response.text, file)

    chunks = chunk_text(text)
    if not chunks:
        # An entry with no embeddings would break similarity search later.
        raise ValueError(f"No text content found at {url}")
    embeddings = get_embedding_for_large_text(text)
    embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
    return file_name


# Default URL
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
try:
    _ingest_url(default_url)
except Exception as error:
    # Top-level boundary: report the failure and still start the session so
    # the user can load another URL.
    print(f"Could not load the default feed {default_url}: {error}")
else:
    print("Daily data refreshed. Now browsing 75+ deal feeds.")

while True:
    user_input = input("Enter URL or question (or 'exit' to quit): ").strip()
    if not user_input:
        # Nothing to embed or fetch; ask again.
        continue
    if user_input.lower() == 'exit':
        break

    # Require a real scheme so a question that merely starts with "http"
    # is not treated as a URL.
    if user_input.lower().startswith(('http://', 'https://')):
        try:
            _ingest_url(user_input)
        except Exception as error:
            # Top-level boundary: one bad URL must not end the session.
            print(f"Could not load {user_input}: {error}")
        continue

    if not embeddings_dict:
        print("No content is loaded yet; enter a URL first.")
        continue

    # Answer the question once per ingested page.
    for indexed_content in embeddings_dict.values():
        try:
            print(generate_response(user_input, indexed_content))
        except Exception as error:
            # Embedding the question happens outside generate_response's own
            # error handling, so API failures there are reported here.
            print(f"Could not answer the question: {error}")
