In [32]:
import requests
from bs4 import BeautifulSoup
import json
import sys
import openai
import os

# Read the API key from the environment instead of hardcoding it in the notebook.
# os.environ.get returns None when OPENAI_API_KEY is unset, so a missing key is
# not reported here; presumably it only surfaces on the first OpenAI request.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY


In [57]:
# Taken from src/ml/openai_wrappers.py
from ctypes import Union
from typing import Optional


# Model identifiers. Of these, only CURRENT_OPENAI_LATEST_GPT_MODEL is used by
# the code visible in this notebook (see wrapped_chat_gpt_completion).
CURRENT_OPENAI_DAVINCI_MODEL = "text-davinci-003"
CURRENT_OPENAI_CHAT_GPT_MODEL = "gpt-3.5-turbo"
CURRENT_OPENAI_LATEST_GPT_MODEL = "gpt-4"
# Default request parameters; wrapped_chat_gpt_completion uses MAX_TOKENS,
# TEMPERATURE, TOP_P, N and FREQUENCY_PENALTY as its keyword defaults.
# DEFAULT_SUFFIX and DEFAULT_STOP are not referenced in the visible cells.
DEFAULT_SUFFIX = None
# Very small default; gpt4_interpret_html overrides it with max_tokens=500.
DEFAULT_MAX_TOKENS = 16
DEFAULT_TEMPERATURE = 1
DEFAULT_TOP_P = 1
DEFAULT_N = 1
DEFAULT_FREQUENCY_PENALTY = 0
DEFAULT_STOP = None

def wrapped_chat_gpt_completion(
    messages: list,
    history: Optional[list] = None,
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS,
    temperature: Optional[float] = DEFAULT_TEMPERATURE,
    top_p: Optional[float] = DEFAULT_TOP_P,
    n: Optional[int] = DEFAULT_N,
    frequency_penalty: Optional[float] = DEFAULT_FREQUENCY_PENALTY,
    model: str = CURRENT_OPENAI_LATEST_GPT_MODEL,
) -> tuple[list, str]:
    """
    Generates a chat completion via openai.ChatCompletion.create.

    The model defaults to CURRENT_OPENAI_LATEST_GPT_MODEL; pass `model` to
    use a different chat model.

    messages needs to be in the format:
    [
        {
            "role": "user",
            "content": "Hello, how are you?"
        },
        {
            "role": "assistant",
            "content": "I am doing well, how about you?"
        }
        ...
    ]

    Args:
        messages: New messages to send, in the format shown above.
        history: Optional earlier messages; when non-empty they are prepended
            to `messages`. Neither input list is modified.
        max_tokens, temperature, top_p, n, frequency_penalty: Forwarded
            unchanged to the API call.
        model: Chat model identifier forwarded to the API call.

    Returns:
        A `(messages, preview)` tuple. `preview` is the stripped text of the
        first choice (other choices are ignored even when n > 1) and
        `messages` is the sent conversation plus that reply as an
        "assistant" message. Returns `([], "")` when the response is None or
        has no choices.
    """
    # `history` defaults to None rather than a shared mutable list.
    if history:
        messages = history + messages

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        n=n,
        frequency_penalty=frequency_penalty,
    )
    choices = response["choices"] if response is not None else None
    if not choices:
        return [], ""

    # Guard against a null content field so .strip() cannot raise.
    content = choices[0]["message"]["content"]
    preview = (content or "").strip()

    messages = messages + [{"role": "assistant", "content": preview}]
    return messages, preview


In [58]:

def gpt4_interpret_html(html_content: str):
    """
    Ask the chat model to extract company details from scraped page content.

    `html_content` is substituted into a fixed prompt that requests a JSON
    object (description, summary, products, industries, target profiles,
    company type, location, highlights, plus optional social/contact fields).
    The prompt is sent as a single "user" message through
    wrapped_chat_gpt_completion, and that function's return value is passed
    back to the caller unchanged.
    """
    prompt_template = """Leverage this HTML content: 
{html_content}

----
INSTRUCTIONS:
Extract key details from the HTML content and return a JSON object with the following fields:
- description: (mandatory) a short description of the company ex. 'We are a CRM company'
- summary: (mandatory) a one-line summary of the company ex. 'We are a CRM company that helps small businesses manage their customers'
- products: (mandatory) a list of 3-4 main products offered by the company ex. 'CRM', 'ERP', 'HR'
- industries: (mandatory) a list of industries the company operates in ex. 'healthcare', 'finance', 'education'
- target_profiles: (mandatory) a list of target customer profiles for the company ex. 'small business owners', 'developers', 'hospital administrators'
- company type: (mandatory) the type of company. ex. 'B2B', 'B2C', 'B2B2C'
- location: (mandatory) the location the company targets. ex. 'United States', 'Canada', 'Global'
- highlights: (mandatory) list of any 2-3 notable facts about the company related to company-things like fundraises, recent news, etc. ex. 'raised $100M in Series A', 'recently acquired by Google'
- linkedin_url: the LinkedIn URL for the company ex. 'https://www.linkedin.com/company/acme'
- crunchbase_url: the Crunchbase URL for the company ex. 'https://www.crunchbase.com/organization/acme'
- twitter_url: the Twitter URL for the company ex. 'https://twitter.com/acme'
- instagram_url: the Instagram URL for the company ex. 'https://www.instagram.com/acme/'
- email: the email address for the company ex. 'johnny@acm.ecom'
- address: the address for the company ex. '123 Main St, San Francisco, CA 94105'

Fill in all the fields above with the correct values. If a field is not applicable, leave it blank.

You must fill in all the mandatory fields.
----
OUTPUT:"""

    # Only the placeholder is substituted; braces inside html_content are
    # inserted verbatim because str.format does not re-scan substituted values.
    user_message = {
        "role": "user",
        "content": prompt_template.format(html_content=html_content),
    }

    return wrapped_chat_gpt_completion(
        messages=[user_message],
        max_tokens=500,
        temperature=0.9,
        top_p=1.0,
        frequency_penalty=0.0,
    )

def get_website_details(url: str, timeout: float = 30.0, max_chars: int = 4000) -> str:
    """
    Fetch a web page, reduce it to visible text, and have GPT-4 summarise it.

    Args:
        url: Page to download.
        timeout: Seconds passed to requests.get so an unresponsive site
            cannot hang the notebook indefinitely.
        max_chars: Maximum number of characters of page text sent to the
            model; longer text is truncated.

    Returns:
        The second element of gpt4_interpret_html's result, i.e. the model's
        reply text (expected to be a JSON object, but not validated here).

    Raises:
        ValueError: If the response status code is not 200.
        Any exception raised by requests.get (connection errors, timeouts)
        propagates to the caller.
    """
    # Get raw HTML content
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch content from {url}. Status Code: {response.status_code}")

    # Parse the HTML and keep only the whitespace-stripped text nodes.
    # NOTE: link hrefs are not part of this text, so the social-profile URL
    # fields requested by the prompt generally cannot be filled from it.
    soup = BeautifulSoup(response.text, 'html.parser')
    clean_text = " ".join(soup.stripped_strings)

    # Cap the prompt size.
    clean_text = clean_text[:max_chars]

    # Use GPT-4 to interpret the page text
    print("Cleaned text:")
    print(clean_text)
    gpt4_results = gpt4_interpret_html(clean_text)

    return gpt4_results[1]

# Example: summarise one site and pretty-print the extracted details.
url = "https://www.moveworks.com"
raw_details = get_website_details(url)
try:
    # get_website_details returns the model's reply as text. Parse it so
    # `formatted` is real Python data; dumping and reloading the string was a
    # no-op round trip that never applied the indentation.
    formatted = json.loads(raw_details)
    data_str = json.dumps(formatted, indent=4)
except json.JSONDecodeError:
    # The reply is not guaranteed to be valid JSON (for example, it may be
    # cut off at the max_tokens=500 limit), so fall back to the raw text.
    formatted = raw_details
    data_str = raw_details
print("Formatted output:")
print(data_str)


Cleaned text:
Moveworks: The Enterprise Copilot Platform Platform Solutions Customer Stories Resources Company Request demo Overview Product Overview How it Works LLM Stack Enterprise Copilot Key Features Creator Studio Employee Experience Insights Multilingual Support Moveworks API Integration Partners Triage Performance Dashboards Experiences Answers Approvals Concierge Control Center Employee Communications Groups Access Software Access By Team IT HR Finance Facilities Employee Communications By Use Case HR Service Desk Identity Access Management IT Service Desk IT Service Management Knowledge Management By Initiative Cost Reduction Employee Onboarding Multilingual Support Self Service Resource Center Blog Help Center About us Careers Newsroom Contact us Trust Platform Overview Product Overview How it Works LLM Stack Enterprise Copilot Key Features Creator Studio Employee Experience Insights Multilingual Support Moveworks API Integration Partners Triage Performance Dashboards Experi

Job "run_sales_navigator_launches (trigger: interval[0:01:00], next run at: 2023-10-06 22:46:36 PDT)" raised an exception
Traceback (most recent call last):
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/venv/lib/python3.10/site-packages/apscheduler/executors/base.py", line 125, in run_job
    retval = job.func(*job.args, **job.kwargs)
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../src/utils/scheduler.py", line 175, in run_sales_navigator_launches
    from src.automation.phantom_buster.services import (
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../src/automation/phantom_buster/services.py", line 1, in <module>
    from app import db, celery
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../app.py", line 130, in <module>
    app.config.from_object(os.environ["APP_SETTINGS"])
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/p

Formatted output:
{
"description": "Moveworks is an AI company that provides an enterprise copilot platform powered by advanced large language models for work automation.",
"summary": "Moveworks is revolutionizing the way enterprises operate by automating work with generative AI, providing insights on service desk performance, and offering versatile solutions for IT and HR departments.",
"products": ["Enterprise Copilot", "Creator Studio", "Employee Experience Insights", "Moveworks API"],
"industries": ["IT", "HR", "Finance", "Facilities"],
"target_profiles": ["IT departments", "HR teams", "Finance professionals", "Facility Managers"],
"company_type": "B2B",
"location": "Global",
"highlights": ["Moveworks customers see a 256% three-year ROI.", "The platform is capable of answering more than 100K common HR questions.", "Broadcom is resolving 57%+ of IT issues while seeing a 40% reduction in incidents with Moveworks"],
"linkedin_url": "",
"crunchbase_url": "",
"twitter_url": "",
"instagr

Job "run_queued_gm_jobs (trigger: interval[0:00:30], next run at: 2023-10-06 22:46:36 PDT)" raised an exception
Traceback (most recent call last):
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/venv/lib/python3.10/site-packages/apscheduler/executors/base.py", line 125, in run_job
    retval = job.func(*job.args, **job.kwargs)
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../src/utils/scheduler.py", line 235, in run_queued_gm_jobs
    from src.message_generation.services import run_queued_gm_job
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../src/message_generation/services.py", line 2, in <module>
    from src.email_sequencing.models import EmailSequenceStep, EmailSubjectLineTemplate
  File "/Users/aakash/Documents/core-SellScale/sellscale-api/notebooks/website_scraper/../../src/email_sequencing/models.py", line 1, in <module>
    from app import db
  File "/Users/aakash/Documents/cor