In [47]:
from dotenv import load_dotenv
from pathlib import Path
import os
import requests
from bs4 import BeautifulSoup
import json
import sys
import openai
import os
import datetime


env_path = Path('../..') / '.envprod'
load_dotenv(dotenv_path=env_path)

from serpapi import GoogleSearch
import openai


serp_api_key = os.environ.get('SERP_API_KEY')
openai.api_key = os.environ.get('OPENAI_KEY')

def search_google_news(query):
    # https://support.google.com/websearch/answer/2466433?hl=en
    params = {
        "api_key": serp_api_key,
        "engine": "google",
        "q": query,
        "tbm": "nws",
        "gl": "us", # US only
        "hl": "en",
        "tbm": "nws",
    }
    search = GoogleSearch(params)
    results = search.get_dict()

    return results

In [48]:
# Taken from src/ml/openai_wrappers.py
from ctypes import Union
from typing import Optional


CURRENT_OPENAI_DAVINCI_MODEL = "text-davinci-003"
CURRENT_OPENAI_CHAT_GPT_MODEL = "gpt-3.5-turbo"
CURRENT_OPENAI_LATEST_GPT_MODEL = "gpt-4"
DEFAULT_SUFFIX = None
DEFAULT_MAX_TOKENS = 16
DEFAULT_TEMPERATURE = 1
DEFAULT_TOP_P = 1
DEFAULT_N = 1
DEFAULT_FREQUENCY_PENALTY = 0
DEFAULT_STOP = None

def wrapped_chat_gpt_completion(
    messages: list,
    history: Optional[list] = [],
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS,
    temperature: Optional[float] = DEFAULT_TEMPERATURE,
    top_p: Optional[float] = DEFAULT_TOP_P,
    n: Optional[int] = DEFAULT_N,
    frequency_penalty: Optional[float] = DEFAULT_FREQUENCY_PENALTY,
):
    """
    Generates a completion using the GPT-3.5-turbo model.

    messages needs to be in the format:
    [
        {
            "role": "user",
            "content": "Hello, how are you?"
        },
        {
            "role": "assistant",
            "content": "I am doing well, how about you?"
        }
        ...
    ]
    """
    if history:
        messages = history + messages

    response = openai.ChatCompletion.create(
        model=CURRENT_OPENAI_LATEST_GPT_MODEL,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        n=n,
        frequency_penalty=frequency_penalty,
    )
    if response is None or response["choices"] is None or len(response["choices"]) == 0:
        return [], ""

    choices = response["choices"]
    top_choice = choices[0]
    preview = top_choice["message"]["content"].strip()

    messages = messages + [{"role": "assistant", "content": preview}]
    return messages, preview


In [109]:
def extract_company_name_and_fundraise_details(data: str):
    messages = [
        {"role": "user", "content": "Given the following JSON payload, return a JSON with the keys: company_name, funding_stage, amount_raised, short_description.\n-company_name: the company's name\n-funding_stage: which stage is this fundraise (i.e. seed, series b)\n-amount_raised: total amount raised (i.e. $40m)\n-short_description: short 4-5 description of the raise event\n\n Only response with JSON.\n\nJSON Output:"},
        {"role": "assistant", "content": data}
    ]
    response = wrapped_chat_gpt_completion(messages=messages, max_tokens=100)
    inferred_dta = response[1]
    
    return json.loads(inferred_dta)

def human_readable_delta(delta):
    """
    Convert a datetime.timedelta object into a human-readable string.
    """
    if delta.days >= 365:
        years = delta.days // 365
        return f"{years} year{'s' if years > 1 else ''} ago"
    elif delta.days >= 30:
        months = delta.days // 30
        return f"{months} month{'s' if months > 1 else ''} ago"
    elif delta.days > 0:
        return f"{delta.days} day{'s' if delta.days > 1 else ''} ago"
    else:
        return "recently"

def get_top_fundraise_article(company_name):
    query = f'"{company_name}" fundraise venture funding round'

    results = search_google_news(query)
    data = []
    for i in results['news_results']:
        # give score based on how often the following words appear: (company name), fundraise, raises, round, venture
        score = 0
        for word in ["fundraise", "raises", "round", "venture"]:
            if word in i['title']:
                score += 1
            if word in i['snippet']:
                score += 1

        if company_name in i['title']:
            score += 5
        if company_name in i['snippet']:
            score += 5

        data.append({
            "title": i['title'],
            "link": i['link'],
            "source": i['source'],
            "date": i['date'],
            "snippet": i['snippet'],
            "score": score
        })

        # return top scored article
    top_result = sorted(data, key=lambda x: x['score'], reverse=True)[0] if len(data) > 0 else None


    d = {}
    try:
        d = extract_company_name_and_fundraise_details(json.dumps(top_result))
    except Exception as e:
        d = d

    d['top_result'] = top_result
    d['date'] = top_result['date'] if top_result else None

    # date structured like "Jul 13, 2021"
    structured_date = datetime.datetime.strptime(d['date'], '%b %d, %Y') if d['date'] else None

    # get human readable 'time since fundraise' like "2 months ago" or "2 years ago"
    time_since_fundraise = datetime.datetime.now() - structured_date if structured_date else None
    d['human_readable_time_since_fundraise'] = human_readable_delta(time_since_fundraise) if time_since_fundraise else None
    
    # summarize into a sentence like "Acme raised a $5m funding round from Sequoia Capital 2 months ago" using wrapped_chat_gpt_completion
    messages = [
        {"role": "user", "content": "Given the following JSON payload, summarize the fundraise into a sentence. Here is an example: 'Acme raised a $5m funding round from Sequoia Capital 2 months ago' or 'Juniper won a startup competition that led them to raising a Series B a couple days back'.\n\Sentence:"},
        {"role": "assistant", "content": json.dumps(d)}
    ]
    response = wrapped_chat_gpt_completion(messages=messages, max_tokens=100)
    inferred_dta = response[1]
    d['summary'] = inferred_dta

    # Verify that the summary is talking about the right company using wrapped_chat_gpt_completion with TRUE or FALSE
    messages = [
        {"role": "user", "content": "Given the following JSON payload, verify that the summary is talking this specific company: {company}. Here is an example: 'TRUE' or 'FALSE'.\n\result:".format(company=company_name)},
        {"role": "assistant", "content": json.dumps(d)}
    ]
    response = wrapped_chat_gpt_completion(messages=messages, max_tokens=10)
    inferred_dta = response[1]
    print(inferred_dta)
    
    if inferred_dta == "TRUE":
        return d
    else:
        print("Summary is not talking about the right company")
        return None


In [116]:
get_top_fundraise_article("Simpplr")

https://serpapi.com/search
TRUE


{'company_name': 'Simpplr',
 'funding_stage': 'venture equity',
 'amount_raised': '$70M',
 'short_description': "Simpplr, a startup developing an 'employee experience' platform, has raised $70 million in a venture equity round.",
 'top_result': {'title': 'Simpplr raises $70M for its AI-powered intranet platform',
  'link': 'https://techcrunch.com/2023/05/02/simpplr-intranet-raises-70m/',
  'source': 'TechCrunch',
  'date': 'May 2, 2023',
  'snippet': "Simpplr, a startup developing an 'employee experience' platform, has raised \n$70 million in a venture equity round.",
  'score': 13},
 'date': 'May 2, 2023',
 'human_readable_time_since_fundraise': '5 months ago',
 'summary': "Simpplr, a startup developing an 'employee experience' platform, raised $70M in a venture equity round 5 months ago."}

In [133]:
# Given a csv, with the columns: linkedin_url, id, full_name, company, company_url, title, industry, Fundraise Summary
# create a new CSV with Fundraise Summary column filled out

import pandas as pd

SOURCE_CSV = "hristina.csv"
DEST_CSV = "hristina_output.csv"

df = pd.read_csv(SOURCE_CSV)

def get_fundraise_summary(row):
    company_name = row['company']
    summary = get_top_fundraise_article(company_name)
    return summary['summary'] if summary else None

for i, row in df.iterrows():
    # skip if fundraise summary already exists or is pd Nan
    if row['Fundraise Summary'] and str(row['Fundraise Summary']) != "nan":
        print("Skipping row", row['Fundraise Summary'])
        continue

    summary = get_fundraise_summary(row)
    df.at[i, 'Fundraise Summary'] = summary

    df.to_csv(DEST_CSV, index=False)

https://serpapi.com/search
'FALSE'
Summary is not talking about the right company
https://serpapi.com/search
'FALSE'
Summary is not talking about the right company
https://serpapi.com/search
TRUE
https://serpapi.com/search


ValueError: time data '3 weeks ago' does not match format '%b %d, %Y'