# Indexing: Text Embedding with Hugging Face (in Place of OpenAI)

In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import numpy as np

In [2]:
# Load the course transcript; only the first Document returned by the loader is used below.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# First pass: split on Markdown headers, storing "#" and "##" headings
# under the metadata keys "Course Title" and "Lecture Title".
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Collapse every run of whitespace (including newlines) into a single space.
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

# Second pass: character-based splitting on "." with the chunk size and overlap set here.
char_splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=50)
pages_char_split = char_splitter.split_documents(pages_md_split)

In [3]:
# Display the chunks produced by the character splitter (metadata and page_content).
pages_char_split

[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'),
 Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individu

In [4]:
# Replaces the earlier OpenAI setup:
#   embedding = OpenAIEmbeddings(model = "text-embedding-ada-002")
# with a free, open-source sentence-transformers model from Hugging Face.
#
# Optional arguments that can be passed to HuggingFaceEmbeddings:
#   model_kwargs={'device': 'cpu'}                 -> run on CPU if GPU memory is an issue
#   encode_kwargs={'normalize_embeddings': False}  -> True or False, depending on the model and your needs
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

print(f"Successfully loaded embedding model: {embedding_model_name}")


Successfully loaded embedding model: sentence-transformers/all-MiniLM-L6-v2


In [5]:
# Inspect the chunk at index 18 on its own.
pages_char_split[18]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='More importantly, it will be sufficient for your need to create quick and accurate analyses. However, if your theoretical preparation is strong enough, you will find yourself restricted by software. Knowing a programming language such as R and Python, gives you the freedom to create specific, ad-hoc tools for each project you are working on')

In [6]:
# Embed three chunks and compare them via dot products and norms.
#
# Errors are deliberately NOT caught here: if embedding fails, the cell should
# stop with a full traceback instead of printing a message and letting later
# cells run against missing (or stale) vector1/vector2/vector3.

# Indices of the chunks to embed; the length guard below is derived from them.
chunk_indices = (3, 5, 18)

# Fail early with a clear message when the splitting cell has not produced
# enough chunks for the largest index requested.
if not pages_char_split or len(pages_char_split) <= max(chunk_indices):
    raise ValueError("pages_char_split is not populated correctly or is too short.")

vectors = []
for idx in chunk_indices:
    chunk_text = pages_char_split[idx].page_content
    # Preview only the first 100 characters to keep the output short.
    print(f"Embedding content from document chunk {idx}: '{chunk_text[:100]}...'")
    vectors.append(embedding.embed_query(chunk_text))

# Keep the individual names: later cells refer to vector1, vector2 and vector3.
vector1, vector2, vector3 = vectors

print(f"Successfully generated embeddings. Vector lengths: {len(vector1)}, {len(vector2)}, {len(vector3)}")

# Pairwise dot products between the three embeddings.
dot_product_1_2 = np.dot(vector1, vector2)
dot_product_1_3 = np.dot(vector1, vector3)
dot_product_2_3 = np.dot(vector2, vector3)

# Norm of each embedding (np.linalg.norm with default arguments).
norm_1 = np.linalg.norm(vector1)
norm_2 = np.linalg.norm(vector2)
norm_3 = np.linalg.norm(vector3)

print(f"Dot products: (1,2): {dot_product_1_2}, (1,3): {dot_product_1_3}, (2,3): {dot_product_2_3}")
print(f"Norms: V1: {norm_1}, V2: {norm_2}, V3: {norm_3}")

Embedding content from document chunk 3: 'Analytics is essentially the application of logical and computational reasoning to the component par...'
Embedding content from document chunk 5: 'You may use this intuition to decide on which styles of clothing to start selling. This would be qua...'
Embedding content from document chunk 18: 'More importantly, it will be sufficient for your need to create quick and accurate analyses. However...'
Successfully generated embeddings. Vector lengths: 384, 384, 384
Dot products: (1,2): 0.3715553261076441, (1,3): 0.3603741994879535, (2,3): 0.13479663912594328
Norms: V1: 0.9999999688589891, V2: 1.0000000051301188, V3: 0.9999999951808224


In [7]:
# Number of components in each embedding vector.
tuple(len(vec) for vec in (vector1, vector2, vector3))

(384, 384, 384)

In [8]:
# Pairwise dot products, in the order (1,2), (1,3), (2,3).
tuple(
    np.dot(left, right)
    for left, right in ((vector1, vector2), (vector1, vector3), (vector2, vector3))
)

(0.3715553261076441, 0.3603741994879535, 0.13479663912594328)

In [9]:
# Norm of each embedding vector (np.linalg.norm with default arguments).
tuple(np.linalg.norm(vec) for vec in (vector1, vector2, vector3))

(0.9999999688589891, 1.0000000051301188, 0.9999999951808224)