In [1]:
import os

os.environ["OPENAI_API_KEY"] = "sk-**************"


In [2]:
import pandas as pd
import json
import numpy as np

In [3]:
from openai import OpenAI

openai_client = OpenAI()


## Preprocess the dataset:

Preprocess the dataset to clean and prepare the text for indexing.

In [4]:
# Load the dataset
with open('arxiv-metadata-oai-snapshot.json', 'r') as file:
    data = [json.loads(line) for line in file]

In [5]:
# Convert to DataFrame
df = pd.DataFrame(data)

# Select a subset for demonstration purposes
df = df[['title', 'abstract']].dropna().head(1000)

# Combine title and abstract
df['text'] = df['title'] + ". " + df['abstract']

# Save the preprocessed dataset
documents = df['text'].tolist()

In [6]:
# create embedding function
def emb_text(text):
    return (
        openai_client.embeddings.create(input=text, model="text-embedding-3-small")
        .data[0]
        .embedding
    )


create milvus collection

In [7]:
# try on local host
from pymilvus import MilvusClient

milvus_client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")

collection_name = "my_rag_collection"

In [8]:
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [9]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
#print(embedding_dim)
#print(test_embedding[:10])


In [10]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)


In [11]:
from tqdm import tqdm

data = []

for i, line in enumerate(tqdm(documents, desc="Creating embeddings")):
    data.append({"id": i, "vector": emb_text(line), "text": line})

milvus_client.insert(collection_name=collection_name, data=data)


Creating embeddings:  23%|██▎       | 231/1000 [00:50<03:02,  4.22it/s]

In [None]:
milvus_client

build RAG

In [None]:
question = "Who is Alan Turing?"


In [None]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[
        emb_text(question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=2,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)


In [None]:
retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))


In [None]:
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)


In [None]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""


In [None]:
response = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(response.choices[0].message.content)
