In [None]:
import os
import sys
from google.colab import userdata

# Copy the API keys stored in Colab secrets into environment variables.
# Mapping: Colab secret name -> environment variable populated from it.
SECRET_TO_ENV = {
    "eduhkkey": "EDUHK_API_KEY",
    "hugging": "HUGGING_API_KEY",
    "pinecone": "PINECONE_API_KEY",
}

for secret_name, env_name in SECRET_TO_ENV.items():
    os.environ[env_name] = userdata.get(secret_name)
    # Only confirm that the key was loaded. Printing the value itself would store
    # the secret in the notebook output, where it leaks when the file is shared.
    print(f"{env_name} loaded from Colab secret '{secret_name}'.")

In [None]:
# Install required libraries
!pip install -q langchain langchain-community langchain-huggingface langchain-pinecone pinecone-client

# LangChain Tutorial: Document Loading, Splitting, Vectorstores, Embeddings, and Retrieval

This notebook provides a step-by-step tutorial on key LangChain concepts using a simple example. We'll use a sample text document to demonstrate:

1. Document Loading
2. Document Splitting with RecursiveCharacterTextSplitter
3. Vectorstores and Embeddings
4. Retrieval

We'll use a free, open-source Hugging Face model to compute the embeddings, and Pinecone as the vectorstore.

**Note:** Before running this notebook, you need to:
- Sign up for a free Pinecone account at https://www.pinecone.io/.
- Create a new index in your Pinecone dashboard named "tutorial-index" with dimension 384 (matching the Hugging Face model's output dimension) and metric "cosine".
- Store your Pinecone API key in Colab secrets under the name 'pinecone' (the first code cell also reads secrets named 'eduhkkey' and 'hugging', so those must exist as well).

## Step 1: Document Loading

LangChain provides various loaders to ingest documents from different sources. Here, we'll create a sample text file and load it using `TextLoader`.

In [None]:
from langchain_community.document_loaders import TextLoader

# Sample text that stands in for a real document throughout this tutorial.
sample_text = """
LangChain is a framework for developing applications powered by large language models (LLMs).

It enables building applications that are:

- Data-aware: Connect a language model to other sources of data.

- Agentic: Allow a language model to interact with its environment.

The main value props of LangChain are:

1. Components: Abstractions for working with language models, along with a collection of implementations for each abstraction.

2. Off-the-shelf chains: A structured assembly of components for accomplishing specific higher-level tasks.

Off-the-shelf chains make it easy to get started. For more complex applications and nuanced use-cases, components make it easy to customize existing chains or build new ones.
"""

# Write the file with an explicit encoding so the write/read round trip does not
# depend on the platform's default text encoding.
with open("sample_document.txt", "w", encoding="utf-8") as f:
    f.write(sample_text)

# Load the file back as LangChain documents, using the same encoding it was written with.
loader = TextLoader("sample_document.txt", encoding="utf-8")
documents = loader.load()

print(f"Loaded {len(documents)} document(s).")
print("Sample content:")
# Show only the first 200 characters as a preview.
print(documents[0].page_content[:200] + "...")

## Step 2: Document Splitting

Large documents need to be split into smaller chunks for efficient embedding and retrieval. LangChain's `RecursiveCharacterTextSplitter` splits text recursively on characters like `\n\n`, `\n`, etc., trying to keep chunks coherent.

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk length is measured with len(), each chunk is at
# most 200 units long, and 50 units are configured as overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=200,
    chunk_overlap=50,
)

# Break the loaded documents into chunks.
split_docs = text_splitter.split_documents(documents)

print(f"Split into {len(split_docs)} chunks.")
# Print every chunk under a 1-based heading, followed by a separator line.
for chunk_number, doc in enumerate(split_docs, start=1):
    print(f"Chunk {chunk_number}:", doc.page_content, "-" * 80, sep="\n")

## Step 3: Vectorstores and Embeddings

Embeddings convert text into vector representations. We'll use Hugging Face's `sentence-transformers` for embeddings.

A vectorstore (like Pinecone) stores these embeddings and allows efficient similarity search.

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# Initialize the embedding model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Name of the Pinecone index to write to.
# Note: Ensure the index 'tutorial-index' exists in Pinecone with dimension 384 and cosine metric.
index_name = "tutorial-index"

# Give every chunk a deterministic ID derived from its position. Without explicit IDs,
# each run stores the chunks under new random IDs, so re-running this cell keeps adding
# duplicate vectors to the (persistent) index and retrieval starts returning repeats.
# With stable IDs, a re-run overwrites the existing vectors instead.
chunk_ids = [f"sample-document-chunk-{position}" for position in range(len(split_docs))]

# Embed the chunks and upsert them into the index.
vectorstore = PineconeVectorStore.from_documents(
    documents=split_docs,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

print("Vectorstore created with embeddings.")

## Step 4: Retrieval

Retrieval involves querying the vectorstore to find relevant documents based on semantic similarity.

We'll use the vectorstore as a retriever and query it.

In [None]:
# Wrap the vectorstore in a retriever that runs a similarity search
# and returns the 2 best matches per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)

# Ask a question and fetch the matching chunks.
query = "What are the main value props of LangChain?"
retrieved_docs = retriever.invoke(query)

print(f"Retrieved {len(retrieved_docs)} documents for query: '{query}'")
# Print every hit under a 1-based heading, followed by a separator line.
for result_number, doc in enumerate(retrieved_docs, start=1):
    print(f"Result {result_number}:", doc.page_content, "-" * 80, sep="\n")

This covers the basics! You can expand on this by using different loaders (e.g., `PyPDFLoader`), splitters, embeddings (e.g., OpenAI), or vectorstores (e.g., Chroma). Experiment with the parameters to see how they affect the results.