In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Plot styling for the whole notebook: seaborn's default theme, switched to
# the "ticks" axes style and the larger "talk" scaling context.
sns.set_theme(style="ticks", context="talk")

In [4]:
# Read the merged metadata table (title / url / description per row) and
# print a structural summary of the resulting frame.
df_metadata = pd.read_csv(filepath_or_buffer='merged_data.csv')
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2100 non-null   object
 1   url          2100 non-null   object
 2   description  2100 non-null   object
dtypes: object(3)
memory usage: 49.3+ KB


In [5]:
# Add an integer `embed_index` column initialised to -1 for every row; the
# embedding loop further down overwrites it for each row it embeds.
df_metadata = df_metadata.assign(embed_index=-1)
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2100 non-null   object
 1   url          2100 non-null   object
 2   description  2100 non-null   object
 3   embed_index  2100 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 65.8+ KB


In [7]:
import openai
from dotenv import load_dotenv
import os
# Must run before the environment lookup below.
# NOTE(review): presumably loads variables from a local .env file into the
# process environment (python-dotenv) - confirm.
load_dotenv()
# Indexing os.environ raises KeyError when OPENAI_API_KEY is unset, so a
# missing key fails here instead of on the first API request.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    Despite the plural name, exactly one embedding is returned: the first
    entry of the response's ``data`` list.

    Parameters
    ----------
    text : str
        Text to embed; passed unchanged as the request ``input``.
    model : str, optional
        Embedding model name. Defaults to ``"text-embedding-ada-002"``.
        Vectors are only comparable with each other when they come from the
        same model, so use one model for both the corpus and the queries.

    Returns
    -------
    The ``embedding`` value of the first item in the response's ``data``.

    Raises
    ------
    Any exception raised by ``openai.Embedding.create`` is propagated.
    """
    response = openai.Embedding.create(
        model=model,
        input=text,
    )
    return response['data'][0]['embedding']

# Embed every row's description. After the loop, `embeddings[k]` belongs to
# the row whose `embed_index` equals k.
embeddings = []
for i, row in df_metadata.iterrows():
    url, title, description = row['url'], row['title'], row['description']
    print(url, title)

    try:
        # Only `description` is sent for embedding. Judging by the frame's
        # contents it already starts with the title ("PAPER: <title>...") -
        # TODO confirm this holds for every row.
        embedding = get_embeddings(description)
    except Exception as e:
        # Report which URL failed, then re-raise with the original traceback.
        print(url, e)
        raise

    # Store the position inside `embeddings`, not the index label `i`: the
    # two only coincide when the frame has a default 0..n-1 RangeIndex.
    df_metadata.loc[i, "embed_index"] = len(embeddings)
    embeddings.append(embedding)

# Convert embeddings list to numpy array
embeddings_array = np.array(embeddings)

http://arxiv.org/abs/2307.14328v1 Generation and Life Cycle of Solar Spicules
http://arxiv.org/abs/2307.14320v1 A new Low Gain Avalanche Diode concept: the double-LGAD
http://arxiv.org/abs/2307.14293v1 Non-chiral one-dimensional sates propagating inside AB/BA domain walls
  in bilayer graphene
http://arxiv.org/abs/2307.14263v1 High-speed plasma measurements with a plasma impedance probe
http://arxiv.org/abs/2307.14258v1 Floquet engineering of the Lifshitz phase transition in the Hubbard
  model
http://arxiv.org/abs/2307.14254v1 Deconfined pseudocriticality in a model spin-1 quantum antiferromagnet
http://arxiv.org/abs/2307.14248v1 Criteria and analytical results for the Pseudogap at the Van Hove point
  in two dimensions
http://arxiv.org/abs/2307.14196v1 Topological Insulators
http://arxiv.org/abs/2307.14191v1 Randomized higher-order tensor renormalization group
http://arxiv.org/abs/2307.14176v1 Extension of Judd-Ofelt theory: Application on Eu$^{3+}$, Nd$^{3+}$ and
  Er$^{3+}$
http://

In [8]:
# Sanity check: dimensions of the stacked embeddings array.
np.shape(embeddings_array)

(2100, 1536)

In [9]:
# Show the frame to check the `embed_index` column after the embedding loop.
df_metadata

Unnamed: 0,title,url,description,embed_index
0,Generation and Life Cycle of Solar Spicules,http://arxiv.org/abs/2307.14328v1,PAPER: Generation and Life Cycle of Solar Spic...,0
1,A new Low Gain Avalanche Diode concept: the do...,http://arxiv.org/abs/2307.14320v1,PAPER: A new Low Gain Avalanche Diode concept:...,1
2,Non-chiral one-dimensional sates propagating i...,http://arxiv.org/abs/2307.14293v1,PAPER: Non-chiral one-dimensional sates propag...,2
3,High-speed plasma measurements with a plasma i...,http://arxiv.org/abs/2307.14263v1,PAPER: High-speed plasma measurements with a p...,3
4,Floquet engineering of the Lifshitz phase tran...,http://arxiv.org/abs/2307.14258v1,PAPER: Floquet engineering of the Lifshitz pha...,4
...,...,...,...,...
2095,RedCloud-OS,https://github.com/RedTeamOperations/RedCloud-OS,"REPO: RedCloud-OS [2023-07-05T12:22:49Z, stars...",2095
2096,LLM-Reading-List,https://github.com/evanmiller/LLM-Reading-List,"REPO: LLM-Reading-List [2023-07-26T15:16:28Z, ...",2096
2097,prompts-royale,https://github.com/meistrari/prompts-royale,"REPO: prompts-royale [2023-07-12T03:27:22Z, st...",2097
2098,LLMDrift,https://github.com/lchen001/LLMDrift,"REPO: LLMDrift [2023-07-18T05:54:14Z, stars: 2...",2098


In [18]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

def search_knowledgebase(query, top_k=None):
    """Rank the knowledge-base rows by cosine similarity to ``query``.

    Reads the module-level ``df_metadata`` and ``embeddings_array``; neither
    is modified. Each row is matched to its embedding through its
    ``embed_index`` value rather than by row position, and rows whose
    ``embed_index`` is negative (never embedded) are left out.

    Parameters
    ----------
    query : str
        Free-text query; embedded with ``get_embeddings``.
    top_k : int or None, optional
        When given, only the ``top_k`` most similar rows are returned.
        The default ``None`` returns every embedded row.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description`` and ``similarity``, sorted by
        ``similarity`` in descending order, keeping the original index labels.

    Raises
    ------
    IndexError
        If an ``embed_index`` value is not a valid row of ``embeddings_array``.
    """
    # Embed the query as a single-row 2-D array, as cosine_similarity expects.
    query_embedding = np.asarray(get_embeddings(query)).reshape(1, -1)

    # One similarity score per row of embeddings_array.
    similarities = cosine_similarity(query_embedding, embeddings_array).ravel()

    # Keep only rows that have an embedding, and only the columns needed
    # (avoids copying the whole metadata frame on every query).
    has_embedding = df_metadata['embed_index'] >= 0
    embedded = df_metadata.loc[has_embedding, ['title', 'description', 'embed_index']]

    # Look each row's score up through embed_index instead of relying on the
    # frame and the array happening to be in the same order.
    scores = similarities[embedded['embed_index'].to_numpy()]
    ranked = embedded.assign(similarity=scores).sort_values(
        by='similarity', ascending=False
    )

    results = ranked[['title', 'description', 'similarity']]
    if top_k is not None:
        results = results.head(top_k)
    return results

# Example query; the ranked result frame is the cell's displayed output.
search_knowledgebase(query="Writing code using AI")


Unnamed: 0,title,description,similarity
2087,mentat,"REPO: mentat [2023-07-25T18:00:13Z, stars: 273...",0.836336
2053,code-review-gpt,"REPO: code-review-gpt [2023-07-06T22:15:26Z, s...",0.820377
2094,openai-quickstart,"REPO: openai-quickstart [2023-07-17T15:43:10Z,...",0.818189
2047,CodeGeeX2,"REPO: CodeGeeX2 [2023-07-23T18:26:53Z, stars: ...",0.812625
366,Artificial Intelligence for Science in Quantum...,PAPER: Artificial Intelligence for Science in ...,0.808637
...,...,...,...
624,Porous CrO$_2$: a ferromagnetic half-metallic ...,PAPER: Porous CrO$_2$: a ferromagnetic half-me...,0.660380
1211,Gilbert damping in metallic ferromagnets from ...,PAPER: Gilbert damping in metallic ferromagnet...,0.659543
25,Magnetotransport and Berry phase Tuning in Gd-...,PAPER: Magnetotransport and Berry phase Tuning...,0.659084
297,Lowest-order QED radiative corrections in unpo...,PAPER: Lowest-order QED radiative corrections ...,0.658636


In [10]:
import pickle

# Persist both artefacts side by side: the embedding matrix as a pickle and
# the metadata frame (including `embed_index`) as a CSV without its index.
with open('embeddings.pkl', mode='wb') as pkl_file:
    pickle.dump(embeddings_array, pkl_file)

df_metadata.to_csv(path_or_buf="metadata.csv", index=False)