In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults.
# Order matters: sns.set() is called first, then the axes style and the
# plotting context are overridden on top of whatever it configured.
sns.set()
sns.set_style("ticks")
sns.set_context('talk')

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import openai
import os

In [4]:
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the key is read
# (presumably from a local .env file -- confirm where the key is kept).
load_dotenv()
# Indexing os.environ (rather than .get) fails fast with a KeyError when
# OPENAI_API_KEY is not set, instead of silently configuring a missing key.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [7]:
# Load data at startup
df_metadata = pd.read_csv('metadata.csv')

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load embeddings.pkl when it comes from a trusted source.
with open('embeddings.pkl', 'rb') as f:
    embeddings_array = pickle.load(f)

# search_knowledgebase attaches one similarity score per embedding row to the
# metadata by position, so both inputs must describe the same number of items.
assert len(df_metadata) == len(embeddings_array), (
    f"metadata.csv has {len(df_metadata)} rows but embeddings.pkl has "
    f"{len(embeddings_array)}"
)

df_metadata.shape, embeddings_array.shape

((2100, 4), (2100, 1536))

In [14]:
def get_embeddings(text):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Value sent as the ``input`` of the embedding request.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.
    """
    reply = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002",
    )
    # Only the first returned item is used.
    first_item = reply['data'][0]
    return first_item['embedding']


def search_knowledgebase(query, min_score=0.75):
    """Rank knowledge-base entries by cosine similarity to ``query``.

    The query is embedded with ``get_embeddings`` and compared against every
    row of the module-level ``embeddings_array``. Scores are attached to a
    copy of ``df_metadata`` by row position, so the two globals must be
    aligned row-for-row.

    Args:
        query: Text to search for.
        min_score: Lowest similarity (inclusive) a row may have and still be
            returned.

    Returns:
        DataFrame with columns ``title``, ``url``, ``description`` and
        ``similarity``, sorted by descending similarity and limited to rows
        scoring at least ``min_score``. ``df_metadata`` itself is not modified.
    """
    # cosine_similarity wants 2-D input: shape the query as a single row.
    query_matrix = np.reshape(get_embeddings(query), (1, -1))

    # One score per knowledge-base row, as a flat vector.
    scores = cosine_similarity(query_matrix, embeddings_array).ravel()

    # assign() works on a copy, leaving the shared metadata frame untouched.
    ranked = (
        df_metadata
        .assign(similarity=scores)
        .sort_values(by='similarity', ascending=False)
    )

    # Keep only the presentation columns, then drop weak matches.
    wanted = ranked[['title', 'url', 'description', 'similarity']]
    return wanted[wanted['similarity'] >= min_score]

def search(text: str, top_k: int = 5) -> dict:
    """Search the knowledge base and package the best matches as a response.

    Args:
        text: Free-text query, passed unchanged to ``search_knowledgebase``
            (which applies its own default ``min_score``).
        top_k: Maximum number of results to return. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        ``{'results': [...]}`` where the list holds at most ``top_k`` record
        dicts (one per row), in the order ``search_knowledgebase`` returned
        them.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # DataFrame.head(-n) means "everything except the last n rows", which
        # is never what a caller asking for the top results intends.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Perform search
    results = search_knowledgebase(text)

    # Convert the leading rows to a list of plain dicts
    top_results_list = results.head(top_k).to_dict('records')

    # Prepare response
    return {'results': top_results_list}


In [15]:
# Try the search with a hand-assembled query: the user's sentence followed by
# a comma-separated list of related key terms.
search('I am an angel investor interested in understanding LLMs so I can perform due diligence on pre-seed startups. angel investor, due diligence, pre-seed startups, LLMs, understanding, machine learning, natural language processing, deep learning, neural networks, artificial intelligence, data analysis, predictive modeling, investment strategy, risk assessment, startup evaluation, decision making, venture capital, investment opportunities, technology trends, market research, industry analysis, startup ecosystem, investment portfolio, startup valuation, early-stage companies, investment thesis, startup funding, angel investing, startup growth, startup success, founder evaluation, business model, competitive advantage, market potential, scalability, product-market fit, revenue model, customer acquisition, traction, team assessment, leadership, market disruption, innovation, industry disruption, regulatory landscape, intellectual property, exit strategy, startup ecosystem, mentorship, networking, startup community, startup ecosystem, startup incubator, startup accelerator')

{'results': [{'title': 'LLM-Reading-List',
   'url': 'https://github.com/evanmiller/LLM-Reading-List',
   'description': "REPO: LLM-Reading-List [2023-07-26T15:16:28Z, stars: 254] | LLM papers I'm reading, mostly on inference and model compression",
   'similarity': 0.7985937347757681},
  {'title': 'Natural Language Processing in Electronic Health Records in Relation to\n  Healthcare Decision-making: A Systematic Review',
   'url': 'http://arxiv.org/abs/2306.12834v1',
   'description': 'PAPER: Natural Language Processing in Electronic Health Records in Relation to\n  Healthcare Decision-making: A Systematic Review [2023-06-22T12:10:41Z] |   Background: Natural Language Processing (NLP) is widely used to extract\nclinical insights from Electronic Health Records (EHRs). However, the lack of\nannotated data, automated tools, and other challenges hinder the full\nutilisation of NLP for EHRs. Various Machine Learning (ML), Deep Learning (DL)\nand NLP techniques are studied and compared to u

In [37]:
# Static configuration: canned "vibe" options plus the keyword-argument
# templates for the two chat-completion calls made in this notebook.
CONSTANTS = {
    # Example identities / interests / goals a user can be described with.
    "vibes": {
        "identities": ["an Investor", "a Tech Founder"],
        "interests": ["Generative AI", "AI Agents"],
        "goals": [
            "know what insiders are talking about",
            "assess early-stage tech startups",
            "build better products",
        ],
    },
    "prompts": {
        # Expands a user's self-description into a list of key terms.
        "vibeparse": {
            "max_tokens": 200,
            "model": "gpt-3.5-turbo",
            "temperature": 0.5,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant who helps users find articles, github repos, and arXiv papers they might be interested in. The user tells you about themself, their interests, and their goals. You return a comma-separated list of 50 key terms that are relevant to the user and their interests.",
                },
            ],
        },
        # Forces a call to `augment_resource` so the reply arrives as
        # structured arguments (short_summary + justification).
        "justify": {
            "max_tokens": 300,
            "model": "gpt-3.5-turbo-0613",
            "temperature": 0.2,
            "functions": [
                {
                    "name": "augment_resource",
                    "description": "Tailor a resource to the user's identity, interests, and goals.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "short_summary": {
                                "type": "string",
                                "description": "A one-sentence summary of the resource, tailored to the user's identity, interests, and goals.",
                            },
                            "justification": {
                                "type": "string",
                                "description": "A two-sentence justification for why the resource is relevant to the user, based on their goals.",
                            },
                        },
                        "required": ["short_summary", "justification"],
                    },
                },
            ],
            "function_call": {"name": "augment_resource"},
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant who helps the user make sense of articles, github repos, and arXiv papers they might be interested in given their identity, interests, and goals. Only use the functions you have been provided with.",
                },
            ],
        },
    },
}


In [44]:
# NOTE: `copy` is bound to deepcopy here (not the copy module); the
# justification cell further down relies on this name as well.
from copy import deepcopy as copy

# What the user typed into the frontend.
raw_vibe = "I am an angel investor interested in understanding LLMs so I can perform due diligence on pre-seed startups."
print(raw_vibe)

# Ask GPT to expand the vibe into key terms. Work on a private deep copy of
# the prompt template so the added user message never lands in CONSTANTS.
params = copy(CONSTANTS['prompts']['vibeparse'])
params['messages'] += [{"role": "user", "content": raw_vibe}]
vibe_ext = openai.ChatCompletion.create(**params).choices[0].message.content
print(vibe_ext)

# Search with the original sentence and its expansion joined into one query.
results = search(f"{raw_vibe}, {vibe_ext}")['results']
print(results)
print(len(results))

I am an angel investor interested in understanding LLMs so I can perform due diligence on pre-seed startups.
angel investor, due diligence, pre-seed startups, understanding LLMs, machine learning, natural language processing, deep learning, language models, artificial intelligence, investment, startups, venture capital, risk assessment, decision making, investment analysis, startup evaluation, predictive modeling, data analysis, investment strategy, investment portfolio, investment opportunities, investment trends, startup ecosystem, technology startups, early-stage startups, startup funding, investment thesis, investment criteria, startup valuation, market analysis, competitive analysis, industry research, investment landscape, startup growth, startup success, startup failure, investment risks, investment returns, startup scalability, market potential, market size, market demand, customer acquisition, revenue model, business model, competitive advantage, intellectual property, team as

In [63]:
import json

# Ask the chat model to summarise and justify each search hit for this user.
for result in results:
    # Start from a deep copy of the template (`copy` is deepcopy, imported in
    # the cell above) so messages appended here never accumulate in CONSTANTS.
    params = copy(CONSTANTS['prompts']['justify'])
    params['messages'].extend([
        {"role": "user", "content": raw_vibe},
        {"role": "user", "content": str(result)},
    ])
    raw_arguments = openai.ChatCompletion.create(
        **params,
    ).choices[0]['message']['function_call']['arguments']

    # The arguments are model-generated text, not guaranteed JSON: they can be
    # malformed, cut off by max_tokens, or missing a field. Report the problem
    # for this one resource and carry on instead of aborting the whole loop.
    try:
        justification = json.loads(raw_arguments)
        summary = justification['short_summary']
        reason = justification['justification']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        print(f"{result['title']}\n{result['url']}\n[could not parse justification: {err!r}]\n\n")
        continue

    print(f"{result['title']}\n{result['url']}\n{summary}\n{reason}\n\n")



LLM-Reading-List
https://github.com/evanmiller/LLM-Reading-List
LLM-Reading-List is a curated list of papers on LLMs, focusing on inference and model compression.
This resource is relevant to you as an angel investor interested in understanding LLMs for due diligence on pre-seed startups.


LLM-eval-survey
https://github.com/MLGroupJLU/LLM-eval-survey
A Survey on Evaluation of Large Language Models
This survey paper provides an overview of the evaluation methods and challenges for large language models, which can help you assess the quality and capabilities of pre-seed startups utilizing LLMs.


Natural Language Processing in Electronic Health Records in Relation to
  Healthcare Decision-making: A Systematic Review
http://arxiv.org/abs/2306.12834v1
Natural Language Processing in Electronic Health Records in Relation to Healthcare Decision-making: A Systematic Review
This paper provides a systematic review of the use of Natural Language Processing (NLP) in Electronic Health Records (EHR