In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Read the local .env file into the process environment before any setting is used.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Both settings are required: indexing os.environ raises KeyError when one is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

data_dir = os.environ["DATA_DIR"]

# Small Example

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client for the small example, built with the class defaults
# (no model name is passed here).
# NOTE(review): the recorded cell output echoes this line, which looks like a
# warning pointing at it (possibly a deprecation) — confirm and migrate if so.
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [3]:
# Three sample sentences for the similarity demo: the first two are about
# books, the third is about something else.
sentence1, sentence2, sentence3 = (
    "I like books",
    "I enjoy reading books",
    "It has been a hectic day",
)

In [4]:
# One embed_query call per sentence, issued in the order sentence1..sentence3.
embeddings1, embeddings2, embeddings3 = (
    embeddings.embed_query(text)
    for text in (sentence1, sentence2, sentence3)
)

In [5]:
import numpy as np
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: Sequence or array of numbers.
        b: Sequence or array of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has norm 0
        the ratio is undefined (0/0); ``0.0`` is returned in that case
        instead of the ``nan`` (and RuntimeWarning) a bare division gives.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # Guard the division: an all-zero vector has no direction to compare.
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product

In [6]:
# For every pair of sentences, print the raw dot product next to the cosine
# similarity. In the recorded run the two columns are identical.
sentence_pairs = (
    (embeddings1, embeddings2),
    (embeddings1, embeddings3),
    (embeddings2, embeddings3),
)
for left_vec, right_vec in sentence_pairs:
    print(np.dot(left_vec, right_vec), cosine_similarity(left_vec, right_vec))

0.9589619330182793 0.9589619330182793
0.7619478769782573 0.7619478769782573
0.7729534751301653 0.7729534751301653


# Generate Embeddings for CS229 ML lectures

In [7]:
from langchain.document_loaders import PyPDFLoader

In [8]:
# File names of the lecture PDFs to index. Lecture 01 is deliberately listed
# twice to simulate messy, duplicated source data.
pdfs = [
    "machinelearning-lecture01.pdf",
    "machinelearning-lecture01.pdf",
    "MachineLearning-Lecture02.pdf",
    "MachineLearning-Lecture03.pdf"
]

In [9]:
# Load every listed PDF and gather the loaded documents in one flat list.
# The path is assembled with os.path.join so the separator matches the host
# OS instead of being hard-coded to Windows backslashes.
docs = []
for pdf in pdfs:
    loader = PyPDFLoader(os.path.join(data_dir, "RAG", "PDF", pdf))
    docs.extend(loader.load())

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings kept as named constants so they are easy to find and tune.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded documents; the chunk count is the cell's displayed output.
chunks = text_splitter.split_documents(docs)
len(chunks)

208

In [11]:
# NOTE: this rebinds `embeddings`, which the small example created with the
# class defaults; from here on the model is named explicitly. The import
# repeats the one already made for the small example.
from langchain.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [12]:
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from pathlib import Path
# On-disk location handed to the Qdrant client as its storage path. Built
# with Path's "/" operator so the separator follows the host OS rather than
# being hard-coded to Windows backslashes.
persist_directory = Path(data_dir) / "RAG" / "VectorStore" / "Qdrant"

In [13]:
# Names shared by the collection definition and the vector store wrapper.
COLLECTION_NAME = "cs229_ml_lectures"
VECTOR_NAME = "vector"
# Vector size declared for the collection.
# NOTE(review): expected to equal the output dimension of the embedding model
# configured in `embeddings` — confirm whenever the model is changed.
EMBEDDING_DIM = 1536

# Qdrant client storing its data under `persist_directory`.
client = QdrantClient(
    path=persist_directory,
    prefer_grpc=False,
)

# Start from an empty collection on every run: drop it when it already
# exists, then create it again. This spells out what the single
# `recreate_collection` call did; that call's line is echoed in the recorded
# cell output, which looks like a deprecation warning.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        VECTOR_NAME: {
            "size": EMBEDDING_DIM,
            "distance": "Cosine",
        }
    },
)

# LangChain wrapper over the collection; `vector_name` must be the same named
# vector declared in `vectors_config` above.
vectorstore = Qdrant(
    client=client,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    vector_name=VECTOR_NAME,
)

  client.recreate_collection(
  vectorstore = Qdrant(


In [14]:
# Add every chunk to the vector store. The call's return value (a list of id
# strings in the recorded run) is the last expression, so it becomes the
# cell output.
vectorstore.add_documents(chunks)

['80cc80e6fd8e41318c4bfdf2e2bd5538',
 'a844e1648dba4360935539299b96aebc',
 '9f986d396fb34eec820826b91f0d348e',
 '8e590b8726d94bca920b0b33167221e8',
 '3be956617cc6489881fadb5f1be42347',
 'fdc44732c93845308febbeb1b749d68e',
 '482fe5b55e604580b63bfe52ff8f4a6a',
 '53dad13105074992b124de4feb7d16f7',
 '8ab3139f9c30481186343fdc21347e01',
 '78800096d71f43ba918fb2c86717d0e0',
 '82535f9e35ed4f6d925fac5103550449',
 'e0ab6ee28cd14d49b35e282a32623517',
 '8c3b88f395a24cd99b37c6b49ededed5',
 '7bf4a521f1df45a5a9985d151f8fddf6',
 '821e8e2e5a23458bb3bf2490e8b3c8fa',
 'dc3f215492554a399df77436df9966b6',
 'af52642d98854c918833fcf3403c1689',
 '2ccb617b3241453881de677954852fbc',
 'bbe2551abbc04ca38a428be73640bffa',
 '0d274271e52446e5a9162dc78fc96953',
 'f66d31b8c63b45a2a91a5eb8ca81ae03',
 'fcdf00d04c764e789e6e0693c28454ff',
 'c766f6b41f13490d9f616d4c4d41d40c',
 '3c6dec6d467942beafe0e22c6836a8f3',
 '240aa5f72afc4035946b3aa120b87be1',
 'd17eef392e6e4fe88991e7f0223d9e78',
 '22af7dcbe63b4e06bac9e11b566955d4',
 

In [15]:
# Sanity check: the printed count should equal len(chunks)
# (both are 208 in the recorded run).
print(client.count(collection_name="cs229_ml_lectures"))

count=208


In [16]:
# Ask for the 3 stored chunks most similar to the question; the returned
# list of Document objects is the cell output.
question = "Is there an email I can ask for help?"
vectorstore.similarity_search(query=question, k=3)

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'C:\\\\Users\\\\gunit\\\\OneDrive\\\\Documents\\\\Study Material\\\\Practice Projects\\\\remote\\\\artificial-intelligence\\\\data\\RAG\\PDF\\machinelearning-lecture01.pdf', 'total_pages': 22, 'page': 5, 'page_label': '6', '_id': '821e8e2e5a23458bb3bf2490e8b3c8fa', '_collection_name': 'cs229_ml_lectures'}, page_content="cs229-qa@cs.stanford.edu. This goes to an account that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework problems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \ny

In [17]:
# Same search with k=5.
# NOTE(review): lecture 01 is listed twice in `pdfs`, so identical chunks may
# occupy several of the returned slots — check the results for duplicates.
question = "What did they say about matlab?"
vectorstore.similarity_search(query=question, k=5)

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'C:\\\\Users\\\\gunit\\\\OneDrive\\\\Documents\\\\Study Material\\\\Practice Projects\\\\remote\\\\artificial-intelligence\\\\data\\RAG\\PDF\\machinelearning-lecture01.pdf', 'total_pages': 22, 'page': 8, 'page_label': '9', '_id': 'be389ebc903d4a359a089d460399b197', '_collection_name': 'cs229_ml_lectures'}, page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot da

In [19]:
# Same search with k=10. No metadata filter is passed, so "the third lecture"
# is only part of the query text.
# NOTE(review): results are presumably not restricted to Lecture03 — verify
# the `source` metadata of the returned documents.
question = "What did they say about regression in the third lecture?"
vectorstore.similarity_search(query=question, k=10)

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:03-07:00', 'author': '', 'moddate': '2008-07-11T11:25:03-07:00', 'title': '', 'source': 'C:\\\\Users\\\\gunit\\\\OneDrive\\\\Documents\\\\Study Material\\\\Practice Projects\\\\remote\\\\artificial-intelligence\\\\data\\RAG\\PDF\\MachineLearning-Lecture03.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1', '_id': '2cf15b9bd6614b309daf9ed95f28459d', '_collection_name': 'cs229_ml_lectures'}, page_content='MachineLearning-Lecture03  \nInstructor (Andrew Ng):Okay. Good morning and welcome back to the third lecture of \nthis class. So here’s what I want to do today, and some of the topics I do today may seem \na little bit like I’m jumping, sort of, from topic to topic, but here’s, sort of, the outline for \ntoday and the illogical flow of ideas. In the last lecture, we talked about linear regression \nand today I want to talk about sort of an ad