## Creating Text Embeddings From a Text File
- Loading data using TextLoader
- Splitting into chunks using CharacterTextSplitter
- Converting chunks into vector embeddings and creating a vectorstore
- Retrieving, reducing dimensions to 2D and displaying text embeddings

In [None]:
# imports

import os
from dotenv import load_dotenv

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Presumably the chat model name; none of the cells visible here reference it.
MODEL = "gpt-4o-mini"
# Directory handed to Chroma as `persist_directory` for the vector store.
db_name = "my_vector_db"

In [None]:
# Load variables from a local .env file; override=True lets them replace
# values already present in the process environment.
load_dotenv(override=True)

# os.environ only accepts strings, so assigning os.getenv(...) directly raises
# an opaque TypeError when the key is missing. Fail with an actionable message.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment"
    )
os.environ['OPENAI_API_KEY'] = openai_api_key

In [None]:
# Read the text file as UTF-8 through the loader.
loader = TextLoader("data.txt", encoding="utf-8")
data = loader.load()

# Collect the loaded items into a fresh list for the splitting step.
documents = list(data)

In [None]:
# Split the loaded documents into chunks of at most ~20 characters,
# with 5 characters of overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=20,
    chunk_overlap=5,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

In [None]:
# Embedding client (constructed with its defaults) used to build the vector store.
embeddings = OpenAIEmbeddings()

In [None]:
# Start from a clean slate: if a persisted store already exists on disk,
# open it and drop its collection before rebuilding.
if os.path.exists(db_name):
    stale_store = Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    )
    stale_store.delete_collection()

In [None]:
# Embed every chunk and persist the resulting vectors under `db_name`.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Fetch a single stored vector and report its dimensionality.

collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]  # one embedding vector
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Display the raw values of the sampled embedding vector.
sample_embedding

In [None]:
# Pull every stored embedding together with its source text.
result = collection.get(include=["embeddings", "documents"])
vectors = np.array(result["embeddings"])
# NOTE: this rebinds `documents` (previously the loaded file contents) to the
# texts returned by the collection; the plotting cell relies on the new value.
documents = result["documents"]

In [None]:
# Reduce dimensionality to 2D using t-SNE.
# Perplexity is capped below the sample count so a small vector store does not
# trip scikit-learn's "perplexity must be less than n_samples" check; with six
# or more vectors this is still 5, exactly as before.
perplexity = min(5, len(vectors) - 1)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering a marker shows up to the first
# 200 characters of the corresponding text.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=[f"Text: {d[:200]}..." for d in documents],
    hoverinfo='text'
)])

# Axis titles are set at the top level of the layout: `scene` only configures
# 3D subplots, so titles nested under it never appear on this 2D chart.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()
