# Indexing: Text Embedding with OpenAI

In [1]:
# Add the parent directory to the module search path so that the
# `import sks_config` below can resolve a module located there
# (presumably where sks_config.py lives — confirm).
import sys
sys.path.append("..")
# `sks_config` exposes SKS_OPENAI_API_KEY, which is passed to the embedding client later on.
import sks_config
# Do not display sks_config.SKS_OPENAI_API_KEY in a cell: outputs are saved with the notebook.

In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import numpy as np

In [3]:
# Load the course transcript; only the first loaded Document's text is split below.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science-2.docx")
pages = loader_docx.load()

# Stage 1: split on Markdown headers. The "#" and "##" headings end up in each
# chunk's metadata under the keys "Course Title" and "Lecture Title".
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Collapse every run of whitespace (newlines included) into a single space,
# modifying each header-level chunk in place.
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

# Stage 2: split the header-level chunks further on "." with the size and
# overlap settings given here.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)

pages_char_split = char_splitter.split_documents(pages_md_split)
len(pages_char_split)

21

In [4]:
# Embedding client shared by every cell below. The API key is read from the
# local config module instead of being written into the notebook.
embedding = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=sks_config.SKS_OPENAI_API_KEY,
)

In [20]:
# Inspect one chunk from the two-stage split: its text plus the header metadata attached to it.
pages_char_split[10]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'As you have probably guessed, analytics generally'}, page_content='In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end. Of course, R, and Python do have their limitations. They are not able to address problems specific to some domains. One example is ‘relational database management systems’—there, SQL is king. It was specifically created for that purpose. SQL is at its most advantageous when working with traditional, historical data')

In [6]:
# Embed three sample chunks (indices 3, 5 and 18) so their vectors can be
# compared in the cells that follow. Calls are issued in that order.
vector1, vector2, vector3 = (
    embedding.embed_query(pages_char_split[chunk_index].page_content)
    for chunk_index in (3, 5, 18)
)

In [7]:
# Dimensionality of each of the three sample embeddings.
tuple(len(vec) for vec in (vector1, vector2, vector3))

(1536, 1536, 1536)

In [8]:
# Pairwise dot products, in the order (1,2), (1,3), (2,3). For unit-length
# vectors the dot product equals the cosine similarity.
tuple(
    np.dot(left, right)
    for left, right in (
        (vector1, vector2),
        (vector1, vector3),
        (vector2, vector3),
    )
)

(np.float64(0.8723196380287559),
 np.float64(0.8096156669724561),
 np.float64(0.8128024768485249))

In [9]:
# Euclidean norm of each sample embedding, to check whether the vectors are unit-length.
tuple(np.linalg.norm(vec) for vec in (vector1, vector2, vector3))

(np.float64(1.0000000678197294),
 np.float64(0.9999999432048748),
 np.float64(0.9999999243456454))

# Store in ChromaDB Vector Store

In [10]:
# Embed every chunk and build a Chroma collection backed by ./sks-chromadb.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again under new ids — confirm, or clear the directory
# before a full Restart & Run All.
vectorstore = Chroma.from_documents(
    documents=pages_char_split,
    embedding=embedding,
    persist_directory="./sks-chromadb",
)

In [11]:
# Open the collection stored in ./sks-chromadb, using the same embedding
# client for any text added or updated through this handle.
# NOTE(review): the truncated output under this cell looks like a warning
# raised by this call — check whether it is a deprecation notice.
vectorstore_from_directory = Chroma(
    embedding_function=embedding,
    persist_directory="./sks-chromadb",
)

  vectorstore_from_directory = Chroma(persist_directory = "./sks-chromadb",


In [12]:
# Look up a single record by id and ask only for its stored embedding vector.
# NOTE(review): this id belongs to one particular build of the store; ids are
# presumably auto-generated, so a freshly built store would need a different
# value here — confirm.
vectorstore_from_directory.get(
    ids="ba3c29da-3593-452b-a7bd-62dfc0909d51",
    include=["embeddings"],
)

{'ids': ['ba3c29da-3593-452b-a7bd-62dfc0909d51'],
 'embeddings': array([[ 0.00479949, -0.01397011,  0.0238505 , ...,  0.02041112,
         -0.01171889, -0.00556553]], shape=(1, 1536)),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

In [13]:
# An extra chunk to append to the store, carrying the same two metadata keys
# that the header splitter attaches to the other chunks.
added_document = Document(
    metadata={
        "Course Title": "Introduction to Data and Data Science",
        "Lecture Title": "As you have probably guessed, analytics generally",
    },
    page_content="In terms of predictive analytics, EViews is mostly used for working with econometric time-series models, and Stata—for academic statistical and econometric research, where techniques like regression, cluster, and factor analysis are constantly applied. As a final note, remember the following. Should you have the relevant business and theoretical knowledge, learning a software tool is relatively easy as opposed to learning a programming language",
)

In [15]:
# Add the new chunk to the store; the displayed result is the list of ids assigned to it.
# NOTE(review): each run of this cell presumably stores another copy under a fresh id — confirm.
vectorstore_from_directory.add_documents([added_document])

['30edb7cf-e3b6-425f-8cad-6408c1e65f26']

In [16]:
# Read back the record that was just added, using the id returned by add_documents.
vectorstore_from_directory.get(ids="30edb7cf-e3b6-425f-8cad-6408c1e65f26")

{'ids': ['30edb7cf-e3b6-425f-8cad-6408c1e65f26'],
 'embeddings': None,
 'documents': ['In terms of predictive analytics, EViews is mostly used for working with econometric time-series models, and Stata—for academic statistical and econometric research, where techniques like regression, cluster, and factor analysis are constantly applied. As a final note, remember the following. Should you have the relevant business and theoretical knowledge, learning a software tool is relatively easy as opposed to learning a programming language'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Course Title': 'Introduction to Data and Data Science',
   'Lecture Title': 'As you have probably guessed, analytics generally'}]}

In [21]:
# Replacement content for the record added above; the metadata is unchanged.
updated_document = Document(
    metadata={
        "Course Title": "Introduction to Data and Data Science",
        "Lecture Title": "As you have probably guessed, analytics generally",
    },
    page_content="In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end. Of course, R, and Python do have their limitations. They are not able to address problems specific to some domains. One example is ‘relational database management systems’—there, SQL is king. It was specifically created for that purpose. SQL is at its most advantageous when working with traditional, historical data",
)

In [22]:
# Overwrite the record stored under the previously added id with the replacement document.
vectorstore_from_directory.update_document(
    document=updated_document,
    document_id="30edb7cf-e3b6-425f-8cad-6408c1e65f26",
)

In [24]:
# Fetch the same id again to confirm that it now holds the replacement text.
vectorstore_from_directory.get(ids="30edb7cf-e3b6-425f-8cad-6408c1e65f26")

{'ids': ['30edb7cf-e3b6-425f-8cad-6408c1e65f26'],
 'embeddings': None,
 'documents': ['In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end. Of course, R, and Python do have their limitations. They are not able to address problems specific to some domains. One example is ‘relational database management systems’—there, SQL is king. It was specifically created for that purpose. SQL is at its most advantageous when working with traditional, historical data'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'As you have probably guessed, analytics generally',
   'Course Title': 'Introduction to Data and Data Science'}]}