In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults
sns.set()
sns.set_style("ticks")
sns.set_context('talk')

In [2]:
"""https://github.com/gianlucatruda/GPTools/blob/master/web2md/web2md.py
"""
import argparse
import os
import sys
from pathlib import Path
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from bs4 import BeautifulSoup
import html2text
import requests
from urllib.parse import urlparse, parse_qs
import feedparser


def extract_text(url: str, ignore_images: bool) -> str:
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("start-maximized")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--remote-debugging-port=9222")

    # Add the following line to set a user agent
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    )
    options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

    driver = webdriver.Chrome(
        service=ChromeService(executable_path=ChromeDriverManager().install()),
        options=options,
    )

    driver.get(url)
    html = driver.page_source
    driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    if ignore_images:
        for img in soup.find_all("img"):
            img.decompose()

    html_content = str(soup)

    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0
    return h.handle(html_content)


def get_arxiv_info(id):
    r = requests.get(f'http://export.arxiv.org/api/query?id_list={id}')
    feed = feedparser.parse(r.content)

    title = feed.entries[0].title
    summary = feed.entries[0].summary

    return title, summary

def get_github_info(user, repo):
    r = requests.get(f'https://api.github.com/repos/{user}/{repo}')
    data = r.json()

    name = data['name']
    description = data['description']

    return name, description



In [3]:
url_list = [
    "https://github.com/invoke-ai/InvokeAI",
    "https://github.com/microsoft/Data-Science-For-Beginners",
    "https://github.com/chathub-dev/chathub",
    "https://github.com/sweepai/sweep",
    "https://github.com/karpathy/llama2.c",
    "https://github.com/assafelovic/gpt-researcher",
    "https://github.com/floneum/floneum",
    "https://github.com/swyxio/chatgpt-mac",
    "https://github.com/jamesmurdza/agenteval",
    "https://github.com/langgenius/dify",
    "https://github.com/1rgs/jsonformer",
    "https://github.com/simonw/symbex",
    "https://github.com/Lightning-AI/lit-gpt",
    "https://github.com/AntonOsika/gpt-engineer",
    "https://github.com/gianlucatruda/GPTools",
    "https://github.com/imartinez/privateGPT",
    "https://arxiv.org/abs/2307.04492",
    "https://arxiv.org/abs/2307.04349",
    "https://arxiv.org/abs/2307.05074",
    "https://arxiv.org/abs/2307.02179",
    "https://arxiv.org/abs/2307.08678",
    "https://arxiv.org/abs/2307.08191",
    "https://arxiv.org/abs/2307.09909",
    "https://arxiv.org/abs/2307.04964",
    "https://arxiv.org/abs/2307.00184",
    "https://arxiv.org/abs/2307.02502",
    "https://arxiv.org/abs/2306.03809",
    "https://arxiv.org/abs/2306.03604",
    "https://arxiv.org/abs/2305.03819",
    "https://arxiv.org/abs/2303.11381",
    "https://arxiv.org/abs/2306.01499",
    "https://arxiv.org/abs/2306.17459",
    "https://arxiv.org/abs/2307.06917",

]
metadata = {"url": [], "title": [], "description": []}

for url in url_list:
    try:
        parsed_url = urlparse(url)
        if "github.com" in parsed_url.netloc:
            user, repo = parsed_url.path.strip("/").split("/")
            title, description = get_github_info(user, repo)
        elif "arxiv.org" in parsed_url.netloc:
            id = parsed_url.path.split('/')[-1]  # Get the last part of the URL path
            title, description = get_arxiv_info(id)
        else:
            print(f"Unknown domain for url: {url}")
            continue

        metadata["url"].append(url)
        metadata["title"].append(title)
        metadata["description"].append(description)
    except Exception as e:
        print(url, e)

https://github.com/Lightning-AI/lit-gpt Expecting value: line 1 column 1 (char 0)
https://github.com/imartinez/privateGPT Expecting value: line 1 column 1 (char 0)


In [4]:
metadata

{'url': ['https://github.com/invoke-ai/InvokeAI',
  'https://github.com/microsoft/Data-Science-For-Beginners',
  'https://github.com/chathub-dev/chathub',
  'https://github.com/sweepai/sweep',
  'https://github.com/karpathy/llama2.c',
  'https://github.com/assafelovic/gpt-researcher',
  'https://github.com/floneum/floneum',
  'https://github.com/swyxio/chatgpt-mac',
  'https://github.com/jamesmurdza/agenteval',
  'https://github.com/langgenius/dify',
  'https://github.com/1rgs/jsonformer',
  'https://github.com/simonw/symbex',
  'https://github.com/AntonOsika/gpt-engineer',
  'https://github.com/gianlucatruda/GPTools',
  'https://arxiv.org/abs/2307.04492',
  'https://arxiv.org/abs/2307.04349',
  'https://arxiv.org/abs/2307.05074',
  'https://arxiv.org/abs/2307.02179',
  'https://arxiv.org/abs/2307.08678',
  'https://arxiv.org/abs/2307.08191',
  'https://arxiv.org/abs/2307.09909',
  'https://arxiv.org/abs/2307.04964',
  'https://arxiv.org/abs/2307.00184',
  'https://arxiv.org/abs/2307.0

In [5]:
df_metadata = pd.DataFrame(metadata)
df_metadata

Unnamed: 0,url,title,description
0,https://github.com/invoke-ai/InvokeAI,InvokeAI,InvokeAI is a leading creative engine for Stab...
1,https://github.com/microsoft/Data-Science-For-...,Data-Science-For-Beginners,"10 Weeks, 20 Lessons, Data Science for All!"
2,https://github.com/chathub-dev/chathub,chathub,All-in-one chatbot client
3,https://github.com/sweepai/sweep,sweep,Sweep is an AI junior developer
4,https://github.com/karpathy/llama2.c,llama2.c,Inference Llama 2 in one file of pure C
5,https://github.com/assafelovic/gpt-researcher,gpt-researcher,GPT based autonomous agent that does online co...
6,https://github.com/floneum/floneum,floneum,A graph editor for local AI workflows
7,https://github.com/swyxio/chatgpt-mac,chatgpt-mac,"ChatGPT for Mac, living in your menubar."
8,https://github.com/jamesmurdza/agenteval,agenteval,Automated testing and benchmarking for code ge...
9,https://github.com/langgenius/dify,dify,"One API for plugins and datasets, one interfac..."


In [6]:
df_metadata.to_csv("metadata.csv", index=False)

In [25]:
df_metadata['embed_index'] = -1
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   url          31 non-null     object
 1   title        31 non-null     object
 2   description  31 non-null     object
 3   embed_index  31 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.1+ KB


In [26]:
import openai
import dotenv
openai.api_key = os.getenv("OPENAI_API_KEY")

def get_embeddings(text):
    response = openai.Embedding.create(
      model="text-embedding-ada-002",
      input=text,
    )
    return response['data'][0]['embedding']

embeddings = []
for i, row in df_metadata.iterrows():
    url, title, description = row['url'], row['title'], row['description']
    print(url, title)
    
    try:
        # Combine the title and description for embedding
        combined_text = title + " " + description
        embedding = get_embeddings(combined_text)
        embeddings.append(embedding)
        df_metadata.loc[i, "embed_index"] = i
    except Exception as e:
        print(url, e)
        # raise e

# Convert embeddings list to numpy array
embeddings_array = np.array(embeddings)

https://github.com/invoke-ai/InvokeAI InvokeAI
https://github.com/microsoft/Data-Science-For-Beginners Data-Science-For-Beginners
https://github.com/chathub-dev/chathub chathub
https://github.com/sweepai/sweep sweep
https://github.com/karpathy/llama2.c llama2.c
https://github.com/assafelovic/gpt-researcher gpt-researcher
https://github.com/floneum/floneum floneum
https://github.com/swyxio/chatgpt-mac chatgpt-mac
https://github.com/jamesmurdza/agenteval agenteval
https://github.com/langgenius/dify dify
https://github.com/1rgs/jsonformer jsonformer
https://github.com/simonw/symbex symbex
https://github.com/AntonOsika/gpt-engineer gpt-engineer
https://github.com/gianlucatruda/GPTools GPTools
https://arxiv.org/abs/2307.04492 Calculating Originality of LLM Assisted Source Code
https://arxiv.org/abs/2307.04349 RLTF: Reinforcement Learning from Unit Test Feedback
https://arxiv.org/abs/2307.05074 Retrieval-augmented GPT-3.5-based Text-to-SQL Framework with
  Sample-aware Prompting and Dynamic 

In [27]:
embeddings_array.shape

(31, 1536)

In [28]:
df_metadata

Unnamed: 0,url,title,description,embed_index
0,https://github.com/invoke-ai/InvokeAI,InvokeAI,InvokeAI is a leading creative engine for Stab...,0
1,https://github.com/microsoft/Data-Science-For-...,Data-Science-For-Beginners,"10 Weeks, 20 Lessons, Data Science for All!",1
2,https://github.com/chathub-dev/chathub,chathub,All-in-one chatbot client,2
3,https://github.com/sweepai/sweep,sweep,Sweep is an AI junior developer,3
4,https://github.com/karpathy/llama2.c,llama2.c,Inference Llama 2 in one file of pure C,4
5,https://github.com/assafelovic/gpt-researcher,gpt-researcher,GPT based autonomous agent that does online co...,5
6,https://github.com/floneum/floneum,floneum,A graph editor for local AI workflows,6
7,https://github.com/swyxio/chatgpt-mac,chatgpt-mac,"ChatGPT for Mac, living in your menubar.",7
8,https://github.com/jamesmurdza/agenteval,agenteval,Automated testing and benchmarking for code ge...,8
9,https://github.com/langgenius/dify,dify,"One API for plugins and datasets, one interfac...",9


In [45]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

def search_knowledgebase(query):
    # Get the embedding of the query
    query_embedding = get_embeddings(query)
    query_embedding = np.array(query_embedding).reshape(1, -1)

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings_array)
    similarities = similarities.flatten()

    # Create a DataFrame for easy manipulation
    df = df_metadata.copy()
    df['similarity'] = similarities

    # Sort by similarity
    df_sorted = df.sort_values(by='similarity', ascending=False)

    # Get the ranked list of titles and descriptions
    results = df_sorted[['title', 'description', 'similarity']]

    return results

search_knowledgebase("Python code search")


Unnamed: 0,title,description,similarity
11,symbex,Find the Python code for specified symbols,0.826333
5,gpt-researcher,GPT based autonomous agent that does online co...,0.788222
3,sweep,Sweep is an AI junior developer,0.776889
6,floneum,A graph editor for local AI workflows,0.774904
13,GPTools,Composable tools for doing useful things with ...,0.769932
9,dify,"One API for plugins and datasets, one interfac...",0.766427
8,agenteval,Automated testing and benchmarking for code ge...,0.765689
1,Data-Science-For-Beginners,"10 Weeks, 20 Lessons, Data Science for All!",0.762093
2,chathub,All-in-one chatbot client,0.761229
14,Calculating Originality of LLM Assisted Source...,The ease of using a Large Language Model (LLM)...,0.754616


In [46]:
# Write embeddings_array to pickle file
import pickle
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_array, f)
# Write df_metadata to csv
df_metadata.to_csv("metadata.csv", index=False)