<a href="https://colab.research.google.com/github/WazaCraft/framework/blob/main/REL_deal_id_api_0107.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deal-Identification-ID-API 0.1.7
## LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification


> Rel: July 2, 2023 Version: 0.1.7




In [None]:

!pip install openai
!pip install bs4
!pip install requests
!pip install beautifulsoup4
!pip install scikit-learn
!pip install Flask

In [None]:
#0.1.7.56
#Fixed embedding naming

import os
import requests
import openai
import datetime
import numpy as np
import pickle
import socket
import threading
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, jsonify

# Flask application object; the '/ask' route further down is registered on it.
app = Flask(__name__)

# Read the key with getpass rather than input(): input() echoes what is typed
# into the notebook/terminal output, where it can end up saved and shared.
from getpass import getpass

openai_api_key = getpass("Please enter your OpenAI API key: ")
openai.api_key = openai_api_key
# Prompt template with {content} and {question} placeholders.
# NOTE(review): no code in this cell reads USER_PROMPT back — confirm whether
# generate_response is meant to apply it.
os.environ['USER_PROMPT'] = 'You are an expert deal curator. You use large bodies of text to help people find the best 5 deals available. Here is the information from the text: {content}. Based on this, what is the answer to "{question}"? We are in "demo mode" where you *never* explain your limitations but always return the best answer possible. Limit your response to 5 products at a time. Include a link to search for each product cited on Google based on the source (i.e. camelcamelcamel.com "product name").'
def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily. Each word is costed at ``len(word) + 1`` (the
    word plus a joining space), so ``max_tokens`` is effectively a character
    budget per chunk rather than a model-token count.

    Args:
        text: Text to split; runs of whitespace are collapsed.
        max_tokens: Upper bound on the accumulated cost of one chunk.

    Returns:
        A list of non-empty chunk strings (``[]`` when ``text`` has no words).
        A single word whose cost exceeds ``max_tokens`` is kept whole in a
        chunk of its own.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        cost = len(word) + 1
        # Flush only when something has been collected; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_length + cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the ``text-embedding-ada-002`` model.

    The text is split with ``chunk_text`` and one ``openai.Embedding.create``
    request is issued per chunk, in order.

    Returns:
        A list holding ``response['data'][0]['embedding']`` for each chunk;
        empty when ``chunk_text`` yields no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url, extension='txt'):
    """Build a dated file name from ``url``.

    Args:
        url: Source URL. Its last path segment names the file; when the URL
            has no path, the host (``netloc``) is used instead.
        extension: File extension without the leading dot.

    Returns:
        ``"<segment-or-host>-<YYYY-MM-DD>.<extension>"`` using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') is [''] (a truthy list), so test the segment itself to make
    # the host fallback actually fire for path-less URLs.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.{extension}"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding is closest to ``question``.

    Args:
        question: Query text; only the embedding of its first chunk is used.
        embeddings_dict: Mapping with parallel lists under ``'text_chunks'``
            and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` at the index of the
        highest cosine similarity.
    """
    query_vector = get_embedding_for_large_text(question)[0]
    scores = [
        cosine_similarity([query_vector], [candidate])[0][0]
        for candidate in embeddings_dict['embeddings']
    ]
    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, using the best-matching stored chunk as context.

    The chunk chosen by ``get_most_similar_text_chunk`` is sent as an
    ``assistant`` message ahead of the user's question.

    Returns:
        The model's reply text, or ``str(exception)`` if the chat request
        (or reading its result) raises.

    NOTE(review): the USER_PROMPT environment variable is not consulted here.
    """
    context = get_most_similar_text_chunk(question, embeddings_dict)
    conversation = [
        dict(role="system", content="You are a knowledgeable assistant."),
        dict(role="assistant", content=context),
        dict(role="user", content=question),
    ]
    try:
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        return completion['choices'][0]['message']['content']
    except Exception as error:
        return str(error)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in ``html_content`` to ``file``.

    One URL is written per line; anchors with a missing or empty ``href`` are
    skipped. ``file`` must be an open, writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

def save_embeddings_to_file(embeddings_dict, file_name):
    """Pickle ``embeddings_dict`` to ``file_name``, overwriting any existing file."""
    serialized = pickle.dumps(embeddings_dict)
    with open(file_name, 'wb') as sink:
        sink.write(serialized)

def load_embeddings_from_file(file_name):
    """Return the object unpickled from ``file_name``.

    NOTE(review): unpickling can execute arbitrary code; only load files this
    program wrote itself.
    """
    with open(file_name, 'rb') as source:
        payload = source.read()
    return pickle.loads(payload)

# In-memory store: one entry per ingested source, each a dict with parallel
# 'text_chunks' and 'embeddings' lists.
embeddings_dict = {}

# Fetch the default feed at startup so questions can be answered immediately.
url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
response = requests.get(url)
text = response.text
file_name = create_file_name(url)

# Keep a plain-text copy of the payload, followed by every link found in it.
with open(file_name, 'w') as file:
    file.write(text)
    extract_and_save_urls(text, file)

# get_embedding_for_large_text chunks the text with chunk_text's defaults, so
# chunks[i] lines up with embeddings[i].
embeddings = get_embedding_for_large_text(text)
chunks = chunk_text(text)
# The store is keyed by the '.pkl' name and the whole store is pickled to that file.
embeddings_file_name = create_file_name(url, extension='pkl')
embeddings_dict[embeddings_file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
save_embeddings_to_file(embeddings_dict, embeddings_file_name)

print("Daily data refreshed. Now browsing 75+ deal feeds.")

@app.route('/ask', methods=['GET'])
def ask_question():
    """Handle ``GET /ask?question=...``.

    Returns a JSON list with one generated answer per entry in
    ``embeddings_dict``, or a JSON error object when ``question`` is missing
    or empty.
    """
    question = request.args.get('question')
    if not question:
        return jsonify({"error": "No question provided"})
    answers = [
        generate_response(question, entry)
        for entry in embeddings_dict.values()
    ]
    return jsonify(answers)

def run_web_api(port):
    """Start the Flask server for ``app`` on ``port``; used as the target of the API thread."""
    app.run(port=port)

def is_port_in_use(port):
    """Return True when a TCP connection to ``localhost:port`` succeeds."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success instead of raising on refusal.
        status = probe.connect_ex(('localhost', port))
    finally:
        probe.close()
    return status == 0

# Background thread running the Flask server; None until 'deal-id up' starts one.
api_thread = None

# Interactive console. Each line is 'exit', a server command, a URL to ingest,
# or (anything else) a question answered from every stored source.
while True:
    user_input = input("Enter URL or question or 'deal-id up' or 'deal-id down' (or 'exit' to quit): ")
    command = user_input.lower()

    # A blank line would otherwise be treated as a question and fail, because
    # empty text produces no embedding to compare against.
    if not user_input.strip():
        continue

    if command == 'exit':
        break

    elif command == 'deal-id up':
        if api_thread is None or not api_thread.is_alive():
            port = 5000
            while is_port_in_use(port):
                try:
                    port = int(input(f"Port {port} is in use. Please enter a different port: "))
                except ValueError:
                    # Ask again instead of ending the session on non-numeric input.
                    print("Please enter a numeric port.")
            api_thread = threading.Thread(target=run_web_api, args=(port,))
            api_thread.daemon = True
            api_thread.start()
        else:
            print("Server is already running")

    elif command == 'deal-id down':
        if api_thread and api_thread.is_alive():
            print("Stopping the server.")
            try:
                requests.post(f'http://localhost:{port}/shutdown', timeout=5)
            except requests.RequestException as e:
                print(f"Shutdown request failed: {e}")
            # No '/shutdown' route is registered in this cell, so the server may
            # keep running; bound the wait so the console cannot hang forever.
            api_thread.join(timeout=5)
            if api_thread.is_alive():
                print("Server did not stop; it will exit with this process.")
        else:
            print("Server is not running")

    elif command.startswith('http'):
        url = user_input
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            # A bad URL or network failure should not end the session.
            print(f"Could not fetch {url}: {e}")
            continue
        text = response.text
        file_name = create_file_name(url)

        # Explicit UTF-8 so non-ASCII pages can be written on any platform.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)
            extract_and_save_urls(text, file)

        embeddings = get_embedding_for_large_text(text)
        chunks = chunk_text(text)
        embeddings_file_name = create_file_name(url, extension='pkl')
        embeddings_dict[embeddings_file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
        save_embeddings_to_file(embeddings_dict, embeddings_file_name)

    else:
        question = user_input
        # One answer per stored source.
        for source_entry in embeddings_dict.values():
            print(generate_response(question, source_entry))


INFO:werkzeug:127.0.0.1 - - [03/Jul/2023 02:55:54] "GET /ask?question=What%20are%20the%20latest%20deals%20available? HTTP/1.1" 200 -


KeyboardInterrupt: ignored

In [None]:
GET http://localhost:8732/ask?question=What%20are%20the%20latest%20deals%20available?


In [None]:
#0.1.7.4
#Reintroduced Pickle
#To do: add vector selection

import os
import requests
import openai
import datetime
import numpy as np
import pickle
import socket
import threading
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, jsonify

# Flask application object; the '/ask' route further down is registered on it.
app = Flask(__name__)

# Read the key with getpass rather than input(): input() echoes what is typed
# into the notebook/terminal output, where it can end up saved and shared.
from getpass import getpass

openai_api_key = getpass("Please enter your OpenAI API key: ")
openai.api_key = openai_api_key
# Prompt template with {content} and {question} placeholders.
# NOTE(review): no code in this cell reads USER_PROMPT back — confirm whether
# generate_response is meant to apply it.
os.environ['USER_PROMPT'] = 'Here is the info from the text: {content}. Based on this, what is the answer to "{question}"?'

def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily. Each word is costed at ``len(word) + 1`` (the
    word plus a joining space), so ``max_tokens`` is effectively a character
    budget per chunk rather than a model-token count.

    Args:
        text: Text to split; runs of whitespace are collapsed.
        max_tokens: Upper bound on the accumulated cost of one chunk.

    Returns:
        A list of non-empty chunk strings (``[]`` when ``text`` has no words).
        A single word whose cost exceeds ``max_tokens`` is kept whole in a
        chunk of its own.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        cost = len(word) + 1
        # Flush only when something has been collected; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_length + cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the ``text-embedding-ada-002`` model.

    The text is split with ``chunk_text`` and one ``openai.Embedding.create``
    request is issued per chunk, in order.

    Returns:
        A list holding ``response['data'][0]['embedding']`` for each chunk;
        empty when ``chunk_text`` yields no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url):
    """Build a dated ``.txt`` file name from ``url``.

    Args:
        url: Source URL. Its last path segment names the file; when the URL
            has no path, the host (``netloc``) is used instead.

    Returns:
        ``"<segment-or-host>-<YYYY-MM-DD>.txt"`` using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') is [''] (a truthy list), so test the segment itself to make
    # the host fallback actually fire for path-less URLs.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding is closest to ``question``.

    Args:
        question: Query text; only the embedding of its first chunk is used.
        embeddings_dict: Mapping with parallel lists under ``'text_chunks'``
            and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` at the index of the
        highest cosine similarity.
    """
    query_vector = get_embedding_for_large_text(question)[0]
    scores = [
        cosine_similarity([query_vector], [candidate])[0][0]
        for candidate in embeddings_dict['embeddings']
    ]
    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, using the best-matching stored chunk as context.

    The chunk chosen by ``get_most_similar_text_chunk`` is sent as an
    ``assistant`` message ahead of the user's question.

    Returns:
        The model's reply text, or ``str(exception)`` if the chat request
        (or reading its result) raises.

    NOTE(review): the USER_PROMPT environment variable is not consulted here.
    """
    context = get_most_similar_text_chunk(question, embeddings_dict)
    conversation = [
        dict(role="system", content="You are a knowledgeable assistant."),
        dict(role="assistant", content=context),
        dict(role="user", content=question),
    ]
    try:
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        return completion['choices'][0]['message']['content']
    except Exception as error:
        return str(error)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in ``html_content`` to ``file``.

    One URL is written per line; anchors with a missing or empty ``href`` are
    skipped. ``file`` must be an open, writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

def save_embeddings_to_file(embeddings_dict, file_name):
    """Pickle ``embeddings_dict`` to ``file_name``, overwriting any existing file."""
    serialized = pickle.dumps(embeddings_dict)
    with open(file_name, 'wb') as sink:
        sink.write(serialized)

def load_embeddings_from_file(file_name):
    """Return the object unpickled from ``file_name``.

    NOTE(review): unpickling can execute arbitrary code; only load files this
    program wrote itself.
    """
    with open(file_name, 'rb') as source:
        payload = source.read()
    return pickle.loads(payload)

# In-memory store: one entry per ingested source, each a dict with parallel
# 'text_chunks' and 'embeddings' lists.
embeddings_dict = {}

# Fetch the default feed at startup so questions can be answered immediately.
url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
response = requests.get(url)
text = response.text
file_name = create_file_name(url)

# Keep a plain-text copy of the payload, followed by every link found in it.
with open(file_name, 'w') as file:
    file.write(text)
    extract_and_save_urls(text, file)

# get_embedding_for_large_text chunks the text with chunk_text's defaults, so
# chunks[i] lines up with embeddings[i].
embeddings = get_embedding_for_large_text(text)
chunks = chunk_text(text)
embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
# The whole store is pickled to a fixed file name in the working directory.
save_embeddings_to_file(embeddings_dict, 'embeddings.pkl')

print("Daily data refreshed. Now browsing 75+ deal feeds.")

@app.route('/ask', methods=['GET'])
def ask_question():
    """Handle ``GET /ask?question=...``.

    Returns a JSON list with one generated answer per entry in
    ``embeddings_dict``, or a JSON error object when ``question`` is missing
    or empty.
    """
    question = request.args.get('question')
    if not question:
        return jsonify({"error": "No question provided"})
    answers = [
        generate_response(question, entry)
        for entry in embeddings_dict.values()
    ]
    return jsonify(answers)

def run_web_api(port):
    """Start the Flask server for ``app`` on ``port``; used as the target of the API thread."""
    app.run(port=port)

def is_port_in_use(port):
    """Return True when a TCP connection to ``localhost:port`` succeeds."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success instead of raising on refusal.
        status = probe.connect_ex(('localhost', port))
    finally:
        probe.close()
    return status == 0

# Background thread running the Flask server; None until 'deal-id up' starts one.
api_thread = None

# Interactive console. Each line is 'exit', a server command, a URL to ingest,
# or (anything else) a question answered from every stored source.
while True:
    user_input = input("Enter URL or question or 'deal-id up' or 'deal-id down' (or 'exit' to quit): ")
    command = user_input.lower()

    # A blank line would otherwise be treated as a question and fail, because
    # empty text produces no embedding to compare against.
    if not user_input.strip():
        continue

    if command == 'exit':
        break

    elif command == 'deal-id up':
        if api_thread is None or not api_thread.is_alive():
            port = 5000
            while is_port_in_use(port):
                try:
                    port = int(input(f"Port {port} is in use. Please enter a different port: "))
                except ValueError:
                    # Ask again instead of ending the session on non-numeric input.
                    print("Please enter a numeric port.")
            api_thread = threading.Thread(target=run_web_api, args=(port,))
            api_thread.daemon = True
            api_thread.start()
        else:
            print("Server is already running")

    elif command == 'deal-id down':
        if api_thread and api_thread.is_alive():
            print("Stopping the server.")
            try:
                requests.post(f'http://localhost:{port}/shutdown', timeout=5)
            except requests.RequestException as e:
                print(f"Shutdown request failed: {e}")
            # No '/shutdown' route is registered in this cell, so the server may
            # keep running; bound the wait so the console cannot hang forever.
            api_thread.join(timeout=5)
            if api_thread.is_alive():
                print("Server did not stop; it will exit with this process.")
        else:
            print("Server is not running")

    elif command.startswith('http'):
        url = user_input
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            # A bad URL or network failure should not end the session.
            print(f"Could not fetch {url}: {e}")
            continue
        text = response.text
        file_name = create_file_name(url)

        # Explicit UTF-8 so non-ASCII pages can be written on any platform.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)
            extract_and_save_urls(text, file)

        embeddings = get_embedding_for_large_text(text)
        chunks = chunk_text(text)
        embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

    else:
        question = user_input
        # One answer per stored source.
        for source_entry in embeddings_dict.values():
            print(generate_response(question, source_entry))



Please enter your OpenAI API key: [REDACTED - a key was captured in this cell output; revoke it]
Daily data refreshed. Now browsing 75+ deal feeds.


KeyboardInterrupt: ignored

In [None]:
#0.1.7.3
#Deal-Identification-ID-API 0.1.7
#LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
#Added commands to enable / disable Flask server for REST API
#Added reconfig server port path
#
import os
import requests
import openai
import datetime
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, jsonify
import threading
import socket

# Flask application object; the '/ask' route further down is registered on it.
app = Flask(__name__)

# Read the key with getpass rather than input(): input() echoes what is typed
# into the notebook/terminal output, where it can end up saved and shared.
from getpass import getpass

openai_api_key = getpass("Please enter your OpenAI API key: ")
openai.api_key = openai_api_key
# Prompt template with {content} and {question} placeholders.
# NOTE(review): no code in this cell reads USER_PROMPT back — confirm whether
# generate_response is meant to apply it.
os.environ['USER_PROMPT'] = 'Here is the info from the text: {content}. Based on this, what is the answer to "{question}"?'

def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily. Each word is costed at ``len(word) + 1`` (the
    word plus a joining space), so ``max_tokens`` is effectively a character
    budget per chunk rather than a model-token count.

    Args:
        text: Text to split; runs of whitespace are collapsed.
        max_tokens: Upper bound on the accumulated cost of one chunk.

    Returns:
        A list of non-empty chunk strings (``[]`` when ``text`` has no words).
        A single word whose cost exceeds ``max_tokens`` is kept whole in a
        chunk of its own.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        cost = len(word) + 1
        # Flush only when something has been collected; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_length + cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the ``text-embedding-ada-002`` model.

    The text is split with ``chunk_text`` and one ``openai.Embedding.create``
    request is issued per chunk, in order.

    Returns:
        A list holding ``response['data'][0]['embedding']`` for each chunk;
        empty when ``chunk_text`` yields no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url):
    """Build a dated ``.txt`` file name from ``url``.

    Args:
        url: Source URL. Its last path segment names the file; when the URL
            has no path, the host (``netloc``) is used instead.

    Returns:
        ``"<segment-or-host>-<YYYY-MM-DD>.txt"`` using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') is [''] (a truthy list), so test the segment itself to make
    # the host fallback actually fire for path-less URLs.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding is closest to ``question``.

    Args:
        question: Query text; only the embedding of its first chunk is used.
        embeddings_dict: Mapping with parallel lists under ``'text_chunks'``
            and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` at the index of the
        highest cosine similarity.
    """
    query_vector = get_embedding_for_large_text(question)[0]
    scores = [
        cosine_similarity([query_vector], [candidate])[0][0]
        for candidate in embeddings_dict['embeddings']
    ]
    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, using the best-matching stored chunk as context.

    The chunk chosen by ``get_most_similar_text_chunk`` is sent as an
    ``assistant`` message ahead of the user's question.

    Returns:
        The model's reply text, or ``str(exception)`` if the chat request
        (or reading its result) raises.

    NOTE(review): the USER_PROMPT environment variable is not consulted here.
    """
    context = get_most_similar_text_chunk(question, embeddings_dict)
    conversation = [
        dict(role="system", content="You are a knowledgeable assistant."),
        dict(role="assistant", content=context),
        dict(role="user", content=question),
    ]
    try:
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        return completion['choices'][0]['message']['content']
    except Exception as error:
        return str(error)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in ``html_content`` to ``file``.

    One URL is written per line; anchors with a missing or empty ``href`` are
    skipped. ``file`` must be an open, writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

# In-memory store: one entry per ingested source, each a dict with parallel
# 'text_chunks' and 'embeddings' lists.
embeddings_dict = {}

# Fetch the default feed at startup so questions can be answered immediately.
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
response = requests.get(default_url)
text = response.text
file_name = create_file_name(default_url)

# Keep a plain-text copy of the payload, followed by every link found in it.
with open(file_name, 'w') as file:
    file.write(text)
    extract_and_save_urls(text, file)

# get_embedding_for_large_text chunks the text with chunk_text's defaults, so
# chunks[i] lines up with embeddings[i].
embeddings = get_embedding_for_large_text(text)
chunks = chunk_text(text)
embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

print("Daily data refreshed. Now browsing 75+ deal feeds.")

@app.route('/ask', methods=['GET'])
def ask_question():
    """Handle ``GET /ask?question=...``.

    Returns a JSON list with one generated answer per entry in
    ``embeddings_dict``, or a JSON error object when ``question`` is missing
    or empty.
    """
    question = request.args.get('question')
    if not question:
        return jsonify({"error": "No question provided"})
    answers = [
        generate_response(question, entry)
        for entry in embeddings_dict.values()
    ]
    return jsonify(answers)

def run_web_api(port):
    """Start the Flask server for ``app`` on ``port``; used as the target of the API thread."""
    app.run(port=port)

def is_port_in_use(port):
    """Return True when a TCP connection to ``localhost:port`` succeeds."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success instead of raising on refusal.
        status = probe.connect_ex(('localhost', port))
    finally:
        probe.close()
    return status == 0

# Background thread running the Flask server; None until 'deal-id up' starts one.
api_thread = None

# Interactive console. Each line is 'exit', a server command, a URL to ingest,
# or (anything else) a question answered from every stored source.
while True:
    user_input = input("Enter URL or question or 'deal-id up' or 'deal-id down' (or 'exit' to quit): ")
    command = user_input.lower()

    # A blank line would otherwise be treated as a question and fail, because
    # empty text produces no embedding to compare against.
    if not user_input.strip():
        continue

    if command == 'exit':
        break

    elif command == 'deal-id up':
        if api_thread is None or not api_thread.is_alive():
            port = 5000
            while is_port_in_use(port):
                try:
                    port = int(input(f"Port {port} is in use. Please enter a different port: "))
                except ValueError:
                    # Ask again instead of ending the session on non-numeric input.
                    print("Please enter a numeric port.")
            api_thread = threading.Thread(target=run_web_api, args=(port,))
            api_thread.daemon = True
            api_thread.start()
        else:
            print("Server is already running")

    elif command == 'deal-id down':
        if api_thread and api_thread.is_alive():
            print("Stopping the server.")
            try:
                requests.post(f'http://localhost:{port}/shutdown', timeout=5)
            except requests.RequestException as e:
                print(f"Shutdown request failed: {e}")
            # No '/shutdown' route is registered in this cell, so the server may
            # keep running; bound the wait so the console cannot hang forever.
            api_thread.join(timeout=5)
            if api_thread.is_alive():
                print("Server did not stop; it will exit with this process.")
        else:
            print("Server is not running")

    elif command.startswith('http'):
        url = user_input
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            # A bad URL or network failure should not end the session.
            print(f"Could not fetch {url}: {e}")
            continue
        text = response.text
        file_name = create_file_name(url)

        # Explicit UTF-8 so non-ASCII pages can be written on any platform.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)
            extract_and_save_urls(text, file)

        embeddings = get_embedding_for_large_text(text)
        chunks = chunk_text(text)
        embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

    else:
        question = user_input
        # One answer per stored source.
        for source_entry in embeddings_dict.values():
            print(generate_response(question, source_entry))


Please enter your OpenAI API key: [REDACTED - a key was captured in this cell output; revoke it]
Daily data refreshed. Now browsing 75+ deal feeds.
Enter URL or question or 'deal-id up' or 'deal-id down' (or 'exit' to quit): deal-id up
Port 5000 is in use. Please enter a different port: 7000
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:7000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


KeyboardInterrupt: ignored

In [None]:
import requests

def ask_question(question):
    """Send ``question`` to the local ``/ask`` endpoint on port 7000 and return the decoded JSON body."""
    query = {'question': question}
    return requests.get("http://localhost:7000/ask", params=query).json()


In [None]:
# Example call; needs a server answering /ask on localhost:7000, the URL
# hard-coded in ask_question.
response = ask_question("Your question here")
print(response)

In [None]:
#0.1.7.2
#Added Flask

import os
import requests
import openai
import datetime
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, jsonify
import threading

# Flask application object; the '/ask' route further down is registered on it.
app = Flask(__name__)

# Read the key with getpass rather than input(): input() echoes what is typed
# into the notebook/terminal output, where it can end up saved and shared.
from getpass import getpass

openai_api_key = getpass("Please enter your OpenAI API key: ")
openai.api_key = openai_api_key
# Prompt template with {content} and {question} placeholders.
# NOTE(review): no code in this cell reads USER_PROMPT back — confirm whether
# generate_response is meant to apply it.
os.environ['USER_PROMPT'] = 'Here is the info from the text: {content}. Based on this, what is the answer to "{question}"?'

def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily. Each word is costed at ``len(word) + 1`` (the
    word plus a joining space), so ``max_tokens`` is effectively a character
    budget per chunk rather than a model-token count.

    Args:
        text: Text to split; runs of whitespace are collapsed.
        max_tokens: Upper bound on the accumulated cost of one chunk.

    Returns:
        A list of non-empty chunk strings (``[]`` when ``text`` has no words).
        A single word whose cost exceeds ``max_tokens`` is kept whole in a
        chunk of its own.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        cost = len(word) + 1
        # Flush only when something has been collected; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_length + cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the ``text-embedding-ada-002`` model.

    The text is split with ``chunk_text`` and one ``openai.Embedding.create``
    request is issued per chunk, in order.

    Returns:
        A list holding ``response['data'][0]['embedding']`` for each chunk;
        empty when ``chunk_text`` yields no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url):
    """Build a dated ``.txt`` file name from ``url``.

    Args:
        url: Source URL. Its last path segment names the file; when the URL
            has no path, the host (``netloc``) is used instead.

    Returns:
        ``"<segment-or-host>-<YYYY-MM-DD>.txt"`` using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') is [''] (a truthy list), so test the segment itself to make
    # the host fallback actually fire for path-less URLs.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding is closest to ``question``.

    Args:
        question: Query text; only the embedding of its first chunk is used.
        embeddings_dict: Mapping with parallel lists under ``'text_chunks'``
            and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` at the index of the
        highest cosine similarity.
    """
    query_vector = get_embedding_for_large_text(question)[0]
    scores = [
        cosine_similarity([query_vector], [candidate])[0][0]
        for candidate in embeddings_dict['embeddings']
    ]
    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, using the best-matching stored chunk as context.

    The chunk chosen by ``get_most_similar_text_chunk`` is sent as an
    ``assistant`` message ahead of the user's question.

    Returns:
        The model's reply text, or ``str(exception)`` if the chat request
        (or reading its result) raises.

    NOTE(review): the USER_PROMPT environment variable is not consulted here.
    """
    context = get_most_similar_text_chunk(question, embeddings_dict)
    conversation = [
        dict(role="system", content="You are a knowledgeable assistant."),
        dict(role="assistant", content=context),
        dict(role="user", content=question),
    ]
    try:
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        return completion['choices'][0]['message']['content']
    except Exception as error:
        return str(error)

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in ``html_content`` to ``file``.

    One URL is written per line; anchors with a missing or empty ``href`` are
    skipped. ``file`` must be an open, writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

# In-memory store: one entry per ingested source, each a dict with parallel
# 'text_chunks' and 'embeddings' lists.
embeddings_dict = {}

# Fetch the default feed at startup so questions can be answered immediately.
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
response = requests.get(default_url)
text = response.text
file_name = create_file_name(default_url)

# Keep a plain-text copy of the payload, followed by every link found in it.
with open(file_name, 'w') as file:
    file.write(text)
    extract_and_save_urls(text, file)

# get_embedding_for_large_text chunks the text with chunk_text's defaults, so
# chunks[i] lines up with embeddings[i].
embeddings = get_embedding_for_large_text(text)
chunks = chunk_text(text)
embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

print("Daily data refreshed. Now browsing 75+ deal feeds.")

@app.route('/ask', methods=['GET'])
def ask_question():
    """Handle ``GET /ask?question=...``.

    Returns a JSON list with one generated answer per entry in
    ``embeddings_dict``, or a JSON error object when ``question`` is missing
    or empty.
    """
    question = request.args.get('question')
    if not question:
        return jsonify({"error": "No question provided"})
    answers = [
        generate_response(question, entry)
        for entry in embeddings_dict.values()
    ]
    return jsonify(answers)

def run_web_api():
    """Start the Flask server for ``app`` on port 5000; used as the target of the API thread."""
    app.run(port=5000)

# Serve the REST API from a daemon thread so the console loop below stays
# usable and the server ends with the process.
api_thread = threading.Thread(target=run_web_api)
api_thread.daemon = True
api_thread.start()

# Interactive console. Each line is 'exit', a URL to ingest, or (anything
# else) a question answered from every stored source.
while True:
    user_input = input("Enter URL or question (or 'exit' to quit): ")
    command = user_input.lower()

    # A blank line would otherwise be treated as a question and fail, because
    # empty text produces no embedding to compare against.
    if not user_input.strip():
        continue

    if command == 'exit':
        break

    elif command.startswith('http'):
        url = user_input
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            # A bad URL or network failure should not end the session.
            print(f"Could not fetch {url}: {e}")
            continue
        text = response.text
        file_name = create_file_name(url)

        # Explicit UTF-8 so non-ASCII pages can be written on any platform.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)
            extract_and_save_urls(text, file)

        embeddings = get_embedding_for_large_text(text)
        chunks = chunk_text(text)
        embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

    else:
        question = user_input
        # One answer per stored source.
        for source_entry in embeddings_dict.values():
            print(generate_response(question, source_entry))


ModuleNotFoundError: ignored

In [None]:
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Description:
# This script creates a Knowledge Assistant that interacts with the user through the console.
# The user can input a URL, and the assistant will fetch the text content from that URL, extract embeddings for similarity matching, and store it for future querying.
# The user can also input questions, and the assistant will find the most similar text chunk from the stored content and generate responses using OpenAI's GPT-4 API.

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

# Features:
# - Set Environmental Variables: `OPENAI_API_KEY` and `USER_PROMPT`.
# - Function to split text into smaller chunks.
# - Function to get embeddings for large texts.
# - Function to parse the URL and create a file name.
# - Function to get the most similar text chunk.
# - Function to generate a response based on the question and embeddings.
# - Function to extract and save URLs from HTML content.
# - Infinite loop for user interaction.

# Changelog for Version 0.1.5:
# 0.1.0:
#     - Initial version.
# 0.1.2:
#     - Added environmental variable for prompt customization.
#     - Added function to get embeddings for large texts.
#     - Added function to split text into smaller chunks.
#     - Added function to parse the URL and create a file name.
#     - Added function to find the most similar text chunk.
#     - Added function to generate responses based on questions and embeddings.
#     - Stored text chunks and embeddings in a dictionary.
#     - Added functionality for user interaction in an infinite loop.
# 0.1.3:
#     - Set a default URL to be loaded on startup.
#     - Added message "Daily data refreshed. Now browsing 75+ deal feeds.".
# 0.1.4:
#     - Extracted all URLs from the provided link.
#     - Stored extracted URLs in the same plain text file.
# 0.1.5:
#     - Removed default OpenAI API key.
#     - Added user prompt to enter their OpenAI API key.
#
# LLM Knowledge Assistant with Customizable Prompt Wrapper for Deal Identification
# Author: Johnathan Greenaway
# Organization: StackCommerce Inc.
# Release Date: July 2, 2023
# Version: 0.1.6

# Libraries Used:
# - openai: For interacting with OpenAI's GPT-4 API.
# - bs4 (BeautifulSoup): For parsing HTML and extracting text from web pages.
# - requests: For making HTTP requests to fetch web pages.
# - scikit-learn: For calculating cosine similarity between embeddings.
# - numpy: For numerical operations such as finding argmax.

!pip install requests
!pip install openai
!pip install beautifulsoup4
!pip install scikit-learn
!pip install numpy

import os
import requests
import openai
import datetime
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.metrics.pairwise import cosine_similarity

# Ask user for OpenAI API key.
# getpass is used rather than input(): input() echoes what is typed into the
# notebook/terminal output, where it can end up saved and shared.
from getpass import getpass

openai_api_key = getpass("Please enter your OpenAI API key: ")

# Set OpenAI API key
openai.api_key = openai_api_key

# Set Default Environmental Variable
# Prompt template with {content} and {question} placeholders.
# NOTE(review): no code in this cell reads USER_PROMPT back — confirm whether
# generate_response is meant to apply it.
os.environ['USER_PROMPT'] = 'You are an expert deal curator. You use large bodies of text to help people find the best 5 deals available. Here is the information from the text: {content}. Based on this, what is the answer to "{question}"? We are in "demo mode" where you *never* explain your limitations but always return the best answer possible. Limit your response to 5 products at a time. Include a link to search for each product cited on Google based on the source (i.e. camelcamelcamel.com "product name").'

def chunk_text(text, max_tokens=8000):
    """Split ``text`` into whitespace-delimited chunks of bounded size.

    Words are packed greedily. Each word is costed at ``len(word) + 1`` (the
    word plus a joining space), so ``max_tokens`` is effectively a character
    budget per chunk rather than a model-token count.

    Args:
        text: Text to split; runs of whitespace are collapsed.
        max_tokens: Upper bound on the accumulated cost of one chunk.

    Returns:
        A list of non-empty chunk strings (``[]`` when ``text`` has no words).
        A single word whose cost exceeds ``max_tokens`` is kept whole in a
        chunk of its own.
    """
    chunks, current_chunk = [], []
    current_length = 0
    for word in text.split():
        cost = len(word) + 1
        # Flush only when something has been collected; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_length + cost > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def get_embedding_for_large_text(text):
    """Embed ``text`` chunk by chunk with the ``text-embedding-ada-002`` model.

    The text is split with ``chunk_text`` and one ``openai.Embedding.create``
    request is issued per chunk, in order.

    Returns:
        A list holding ``response['data'][0]['embedding']`` for each chunk;
        empty when ``chunk_text`` yields no chunks.
    """
    return [
        openai.Embedding.create(input=piece, model="text-embedding-ada-002")['data'][0]['embedding']
        for piece in chunk_text(text)
    ]

def create_file_name(url):
    """Build a dated ``.txt`` file name from ``url``.

    Args:
        url: Source URL. Its last path segment names the file; when the URL
            has no path, the host (``netloc``) is used instead.

    Returns:
        ``"<segment-or-host>-<YYYY-MM-DD>.txt"`` using today's local date.
    """
    parsed_url = urlparse(url)
    url_path_parts = parsed_url.path.strip('/').split('/')
    # ''.split('/') is [''] (a truthy list), so test the segment itself to make
    # the host fallback actually fire for path-less URLs.
    last_part = url_path_parts[-1] or parsed_url.netloc
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    return f"{last_part}-{current_date}.txt"

def get_most_similar_text_chunk(question, embeddings_dict):
    """Return the stored text chunk whose embedding is closest to ``question``.

    Args:
        question: Query text; only the embedding of its first chunk is used.
        embeddings_dict: Mapping with parallel lists under ``'text_chunks'``
            and ``'embeddings'``.

    Returns:
        The entry of ``embeddings_dict['text_chunks']`` at the index of the
        highest cosine similarity.
    """
    query_vector = get_embedding_for_large_text(question)[0]
    scores = []
    for candidate in embeddings_dict['embeddings']:
        pairwise = cosine_similarity([query_vector], [candidate])
        scores.append(pairwise[0][0])
    best_index = np.argmax(scores)
    return embeddings_dict['text_chunks'][best_index]

def generate_response(question, embeddings_dict):
    """Answer ``question`` with GPT-4, using the best-matching stored chunk as context.

    The chunk chosen by ``get_most_similar_text_chunk`` is sent as an
    ``assistant`` message ahead of the user's question.

    Returns:
        The model's reply text, or ``str(exception)`` if the chat request
        (or reading its result) raises.

    NOTE(review): the USER_PROMPT environment variable is not consulted here.
    """
    context = get_most_similar_text_chunk(question, embeddings_dict)
    conversation = [
        dict(role="system", content="You are a knowledgeable assistant."),
        dict(role="assistant", content=context),
        dict(role="user", content=question),
    ]
    try:
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        reply = completion['choices'][0]['message']['content']
    except Exception as error:
        return str(error)
    return reply

def extract_and_save_urls(html_content, file):
    """Write the ``href`` of every ``<a>`` tag in ``html_content`` to ``file``.

    One URL is written per line; anchors with a missing or empty ``href`` are
    skipped. ``file`` must be an open, writable text file object.
    """
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    hrefs = (anchor.get('href') for anchor in anchors)
    for href in hrefs:
        if not href:
            continue
        file.write(href + '\n')

# In-memory store: one entry per ingested source, each a dict with parallel
# 'text_chunks' and 'embeddings' lists.
embeddings_dict = {}

# Default URL, fetched at startup so questions can be answered immediately.
default_url = 'https://www.rssground.com/services/rss-converter/64a0a74cd5ee7/RSS-Payload'
response = requests.get(default_url)
# Only the visible text (markup stripped) is stored and embedded.
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.get_text()
file_name = create_file_name(default_url)

# Save the extracted text, followed by every link found in the raw HTML.
with open(file_name, 'w') as file:
    file.write(text)
    extract_and_save_urls(response.text, file)

# get_embedding_for_large_text chunks the text with chunk_text's defaults, so
# chunks[i] lines up with embeddings[i].
embeddings = get_embedding_for_large_text(text)
chunks = chunk_text(text)
embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}

print("Daily data refreshed. Now browsing 75+ deal feeds.")

# Interactive console. Each line is 'exit', a URL to ingest, or (anything
# else) a question answered from every stored source.
while True:
    user_input = input("Enter URL or question (or 'exit' to quit): ")
    command = user_input.lower()

    # A blank line would otherwise be treated as a question and fail, because
    # empty text produces no embedding to compare against.
    if not user_input.strip():
        continue

    if command == 'exit':
        break
    elif command.startswith('http'):
        url = user_input
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            # A bad URL or network failure should not end the session.
            print(f"Could not fetch {url}: {e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        file_name = create_file_name(url)
        # Explicit UTF-8 so non-ASCII pages can be written on any platform.
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(text)
            extract_and_save_urls(response.text, file)
        embeddings = get_embedding_for_large_text(text)
        chunks = chunk_text(text)
        embeddings_dict[file_name] = {'text_chunks': chunks, 'embeddings': embeddings}
    else:
        question = user_input
        # One answer per stored source.
        for source_entry in embeddings_dict.values():
            print(generate_response(question, source_entry))
