In [22]:
import requests
from bs4 import BeautifulSoup
import json
import sys
import openai
import os
import datetime

# Read the OpenAI key from the environment instead of hard-coding it.
# os.environ.get returns None when the variable is unset; nothing here checks
# for that, so a missing key only surfaces on the first API call.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY


In [23]:
# Taken from src/ml/openai_wrappers.py
# NOTE(review): `ctypes.Union` is the base class for C union types, not the
# typing construct. It is not referenced in the visible cells — confirm whether
# `typing.Union` was intended or the import can go.
from ctypes import Union
from typing import Optional


# Model identifiers. Of these, only CURRENT_OPENAI_LATEST_GPT_MODEL is read by
# wrapped_chat_gpt_completion in the visible cells.
CURRENT_OPENAI_DAVINCI_MODEL = "text-davinci-003"
CURRENT_OPENAI_CHAT_GPT_MODEL = "gpt-3.5-turbo"
CURRENT_OPENAI_LATEST_GPT_MODEL = "gpt-4"
# Default request parameters. wrapped_chat_gpt_completion uses several of these
# as default argument values; Python evaluates defaults once, when the `def`
# runs, so rebinding these names afterwards does not change that function's
# defaults. DEFAULT_SUFFIX and DEFAULT_STOP are not referenced in the visible
# cells.
DEFAULT_SUFFIX = None
DEFAULT_MAX_TOKENS = 16
DEFAULT_TEMPERATURE = 1
DEFAULT_TOP_P = 1
DEFAULT_N = 1
DEFAULT_FREQUENCY_PENALTY = 0
DEFAULT_STOP = None

def wrapped_chat_gpt_completion(
    messages: list,
    history: Optional[list] = None,
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS,
    temperature: Optional[float] = DEFAULT_TEMPERATURE,
    top_p: Optional[float] = DEFAULT_TOP_P,
    n: Optional[int] = DEFAULT_N,
    frequency_penalty: Optional[float] = DEFAULT_FREQUENCY_PENALTY,
):
    """
    Generates a chat completion using the model named by
    CURRENT_OPENAI_LATEST_GPT_MODEL.

    messages needs to be in the format:
    [
        {
            "role": "user",
            "content": "Hello, how are you?"
        },
        {
            "role": "assistant",
            "content": "I am doing well, how about you?"
        }
        ...
    ]

    Args:
        messages: New messages to send, in the format above.
        history: Earlier messages to prepend to `messages`. None or an empty
            list means no history. Neither input list is modified.
        max_tokens, temperature, top_p, n, frequency_penalty: Forwarded
            unchanged to `openai.ChatCompletion.create`. Their defaults are the
            values the DEFAULT_* constants had when this function was defined.

    Returns:
        A `(messages, preview)` tuple. `preview` is the stripped text of the
        first choice, and `messages` is the full conversation that was sent
        with that reply appended as an assistant message. Returns `([], "")`
        when the API yields no response or no choices.
    """
    # `history` defaults to None rather than a shared mutable list.
    if history:
        messages = history + messages

    response = openai.ChatCompletion.create(
        model=CURRENT_OPENAI_LATEST_GPT_MODEL,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        n=n,
        frequency_penalty=frequency_penalty,
    )
    if response is None:
        return [], ""

    choices = response["choices"]
    if not choices:
        return [], ""

    # Only the first choice is surfaced, even when n > 1.
    # The content can be None (no text in the reply); treat that as an empty
    # reply instead of crashing on `.strip()`.
    content = choices[0]["message"]["content"] or ""
    preview = content.strip()

    messages = messages + [{"role": "assistant", "content": preview}]
    return messages, preview


In [24]:
import requests
from bs4 import BeautifulSoup
import json
import sys
import openai
import os
from typing import List, Optional

# Same API-key setup as the first cell, repeated here.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY

# NOTE(review): these rebind the names that wrapped_chat_gpt_completion used as
# default argument values, but that function captured the earlier values
# (max_tokens=16, temperature=1, top_p=1, n=1, frequency_penalty=0) when it was
# defined. Calls that omit these arguments therefore still use the old
# defaults, and nothing in the visible cells reads the values assigned here.
DEFAULT_MAX_TOKENS = 150
DEFAULT_TEMPERATURE = 0.7
DEFAULT_TOP_P = 1.0
DEFAULT_N = 1
DEFAULT_FREQUENCY_PENALTY = -0.5


def fetch_html(url: str, timeout: float = 10.0) -> str:
    """Download `url` and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. `requests`
            applies no timeout unless one is passed, so a single unresponsive
            site could otherwise stall the caller indefinitely.

    Returns:
        The response text, or "" when the request fails for any reason that
        `requests` reports (invalid URL, connection error, timeout, or a
        4xx/5xx status via `raise_for_status`). The failure is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Failed to fetch URL: {e}")
        return ""

def get_all_links(url: str, html: str) -> List[str]:
    """Collect the absolute http(s) links found in `html`.

    Args:
        url: URL the page was fetched from; relative hrefs are resolved
            against it.
        html: Page markup to scan for `<a href=...>` elements.

    Returns:
        Absolute URLs in document order with duplicates removed. Links whose
        resolved scheme is not http or https (for example `mailto:` or
        `javascript:`) are dropped, as are hrefs that cannot be parsed.
    """
    from urllib.parse import urljoin, urlparse

    soup = BeautifulSoup(html, 'html.parser')

    links: List[str] = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href'].strip()
        try:
            # urljoin handles every relative form ("careers", "/careers",
            # "../jobs", "//host/path") and leaves absolute URLs untouched;
            # plain string concatenation got all but "/path" on a bare host
            # wrong.
            absolute = urljoin(url, href)
            scheme = urlparse(absolute).scheme
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 brackets); skip it.
            continue
        if scheme not in ('http', 'https'):
            continue
        if absolute not in seen:
            seen.add(absolute)
            links.append(absolute)
    return links

def infer_jobs_url(links: List[str]) -> str:
    """Ask the chat model which of `links` is most likely the jobs/careers page.

    Args:
        links: Candidate URLs, typically every link found on a company's
            home page.

    Returns:
        The model's answer with surrounding whitespace, quotes, backticks and
        angle brackets removed. Returns "" without calling the model when
        there are no usable links.
    """
    char_budget = 8000       # same prompt-size cap the link list always had
    reply_token_budget = 200  # room for a full URL in the reply

    # Fit whole URLs into the budget. Slicing the joined string could cut the
    # last URL in half and hand the model a link that does not exist.
    kept: List[str] = []
    used = 0
    for link in links:
        cost = len(link) + 1  # +1 for the joining newline
        if used + cost > char_budget:
            continue  # too long to fit; a shorter later link may still fit
        kept.append(link)
        used += cost

    if not kept:
        return ""

    messages = [
        {"role": "user", "content": "Given the following list of URLs, identify the most likely URL for the jobs or careers page. IMPORTANT: ONLY include the link in your response:"},
        # The link list is input data, so it is sent as a user message rather
        # than attributed to the assistant.
        {"role": "user", "content": "\n".join(kept)},
    ]
    # Pass max_tokens explicitly: the wrapper's default was bound to 16 tokens
    # when it was defined, which can truncate a longer URL.
    response = wrapped_chat_gpt_completion(messages=messages, max_tokens=reply_token_budget)
    inferred_url = response[1]
    return inferred_url.strip().strip('`"\'<>')

def summarize_jobs_page(html: str, retries: int = 3) -> str:
    """Extract job titles from a jobs page as a JSON string.

    The page is reduced to its visible text (script and style elements
    removed, whitespace collapsed), the first 8000 characters are sent to the
    chat model, and the reply is accepted only if it parses as a JSON object.

    Args:
        html: Markup of the jobs page.
        retries: How many extra model calls to make after a failed attempt, so
            at most `retries + 1` calls are made. Values below 0 behave like 0.

    Returns:
        The model's reply when it is a valid JSON object (expected shape:
        `{"titles": [...]}`), otherwise the string "{}". The fallback is
        itself valid JSON, so callers can always pass the result to
        `json.loads`.
    """
    fallback = "{}"

    # Text extraction is deterministic, so it runs once, outside the retry loop.
    try:
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.extract()
        page_text = " ".join(soup.get_text().split())
    except Exception as e:
        print(f"Failed to summarize jobs page: {e}")
        return fallback

    messages = [
        {"role": "user", "content": f"List all the job titles found in this HTML in a JSON with the key 'titles' and the value as an array of job titles: {page_text[0:8000]}"}
    ]

    for _ in range(max(retries, 0) + 1):
        try:
            response = wrapped_chat_gpt_completion(messages=messages, max_tokens=400)
            summary = response[1]
            parsed = json.loads(summary)
            # A bare list or string is valid JSON but not the shape callers
            # read with `.get('titles')`; treat it as a failed attempt.
            if not isinstance(parsed, dict):
                raise ValueError("expected a JSON object with a 'titles' key")
            return summary
        except Exception as e:
            print(f"Failed to summarize jobs page: {e}")

    return fallback

def summarize_hiring_needs(titles: list[str]) -> str:
    """Describe, in one sentence, what kinds of roles a company is hiring for.

    Args:
        titles: Job titles found on the company's jobs page.

    Returns:
        The chat model's one-sentence summary, or "" when `titles` is empty.
        With no titles there is nothing to summarise, so the model is not
        called.
    """
    if not titles:
        return ""

    messages = [
        {"role": "user", "content": f"Given the following list of job titles, at a high level, describe what types of roles the company is hiring for in one sentence. (i.e. [company] is hiring for [roles] to [particular reason]): {titles}"}
    ]
    response = wrapped_chat_gpt_completion(messages=messages, max_tokens=400)
    return response[1]

def fetch_job_data(website_url: str) -> Optional[dict]:
    """Find a company's jobs page and summarise its open roles.

    Steps, each reported with a numbered progress line:
      1. Fetch the home page at `website_url`.
      2. Ask the chat model which link on it is the jobs/careers page.
      3. Fetch that page and extract job titles as JSON.
      4. Ask the chat model for a one-sentence summary of the hiring needs.

    Args:
        website_url: URL of the company's home page.

    Returns:
        A dict with keys "jobs_url", "titles", "hiring_needs_summary" and
        "date" (ISO-8601 timestamp of this run), or None when either the home
        page or the jobs page could not be fetched.
    """
    print(f"1. Analyzing website: {website_url}\n")

    html = fetch_html(website_url)
    if not html:
        print("Failed to fetch main page.")
        return None

    links = get_all_links(website_url, html)
    jobs_url = infer_jobs_url(links)
    print(f"2. Found jobs URL: {jobs_url}\n")

    jobs_html = fetch_html(jobs_url)
    if not jobs_html:
        print("Failed to fetch jobs page.")
        return None

    summary = summarize_jobs_page(jobs_html)
    print(f"3. Summary of active job listings: {summary}\n")

    # The summary comes from a model, so do not assume it parses or that it is
    # a JSON object; fall back to "no titles" rather than raising.
    try:
        parsed = json.loads(summary)
    except (TypeError, ValueError) as e:
        print(f"Could not parse job listing summary as JSON: {e}")
        parsed = {}
    titles = (parsed.get('titles') or []) if isinstance(parsed, dict) else []

    hiring_needs_summary = summarize_hiring_needs(titles)
    print(f"4. Summary of hiring needs: {hiring_needs_summary}\n")

    return {
        "jobs_url": jobs_url,
        "titles": titles,
        "hiring_needs_summary": hiring_needs_summary,
        "date": datetime.datetime.now().isoformat()
    }

In [25]:
# Example run against a single site; the progress lines it prints and the dict
# it returns are shown in the output below.
fetch_job_data('https://www.sellscale.com')

1. Analyzing website: https://www.sellscale.com

2. Found jobs URL: https://www.sellscale.com/careers

3. Summary of active job listings: {
  "titles": [
    "Founding Software Engineer"
  ]
}

4. Summary of hiring needs: The company is hiring for a Founding Software Engineer role, to possibly take the lead in building the foundation of the company's tech stack and driving initial product development.



{'jobs_url': 'https://www.sellscale.com/careers',
 'titles': ['Founding Software Engineer'],
 'hiring_needs_summary': "The company is hiring for a Founding Software Engineer role, to possibly take the lead in building the foundation of the company's tech stack and driving initial product development.",
 'date': '2023-10-24T20:30:10.883745'}

In [26]:
import csv
import time
import os

def _write_rows_atomically(csv_name, fieldnames, rows):
    """Rewrite `csv_name` with a header plus `rows`, swapping the file in only once fully written."""
    tmp_name = csv_name + '.tmp'
    with open(tmp_name, 'w', newline='') as out:
        # extrasaction='ignore' skips keys that are not columns (e.g. the None
        # key DictReader uses for surplus cells) instead of raising.
        writer = csv.DictWriter(out, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)
    os.replace(tmp_name, csv_name)


def scrape_website_urls_in_csv(csv_name: str):
    """Fill in job-board data for every unchecked row of a prospect CSV.

    For each row whose `ran_check` is not 'TRUE', `fetch_job_data` is called
    on the row's `company_url`. The result is stored in the 'Company Job
    Board URL', 'Open Positions' and 'Summary' columns, and the row is marked
    `ran_check = 'TRUE'`. The file is rewritten after every processed row, so
    an interrupted run can be resumed by calling this again.

    Row handling:
      * Already-checked rows are skipped. If nothing needs processing, the
        file is not touched.
      * Rows with no `company_url`, and rows for which `fetch_job_data`
        returns nothing, are left unchecked so a later run retries them.
      * Rows for which `fetch_job_data` raises are marked checked with the
        output columns left as they were.

    Args:
        csv_name: Path of the CSV to update in place. All of its columns are
            preserved; any missing output columns are added.
    """
    output_columns = ['Company Job Board URL', 'Open Positions', 'Summary', 'ran_check']

    # Load everything up front: writing to a file while a reader is still
    # iterating it feeds the new rows back into the loop and never updates
    # the originals.
    with open(csv_name, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    # Use the file's own header (so extra columns such as `title` survive) and
    # add whichever output columns it lacks.
    fieldnames += [column for column in output_columns if column not in fieldnames]

    for row in rows:
        if row.get('ran_check') == 'TRUE':
            continue
        company_url = row.get('company_url')
        if not company_url:
            continue
        print(f"Scraping {company_url}")
        try:
            job_data = fetch_job_data(company_url)
            if not job_data:
                continue
            row['Company Job Board URL'] = job_data['jobs_url']
            row['Open Positions'] = job_data['titles']
            row['Summary'] = job_data['hiring_needs_summary']
        except Exception as e:
            print(f"Failed to scrape {company_url}: {e}")
        row['ran_check'] = 'TRUE'
        # Persist progress after every row so a crash loses at most one result.
        _write_rows_atomically(csv_name, fieldnames, rows)
        time.sleep(5)

# Process the rows of grimes.csv whose ran_check is not 'TRUE'.
# The path is relative, so the file must be in the kernel's working directory.
scrape_website_urls_in_csv('grimes.csv')

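Use this query to export the input CSV (e.g. `grimes.csv`) for `scrape_website_urls_in_csv`; the last four columns are selected as empty strings because the scraper fills them in: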
```
select 
	linkedin_url,
	id,
	full_name,
	company,
	company_url,
	title,
	'' "Company Job Board URL",
	'' "Open Positions",
	'' "Summary",
	'' "ran_check"
	
from prospect
where client_sdr_id = 103
	and prospect.overall_status = 'PROSPECTED'
	and prospect.icp_fit_score > 2
	and company_url is not null
	and prospect.industry is not null
order by employee_count asc
limit 300;
```