## Miscellaneous tutorials from docs to familiarize with library

In [13]:
import dotenv
import os
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [2]:
dotenv.load_dotenv()

True

### Basic Chat Model Usage

In [3]:
model = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai"
)

E0000 00:00:1760126742.166137 2674034 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [5]:
messages = [
    SystemMessage(content="Translate the following from English into Italian"),
    HumanMessage(content="hi!"),
]

In [6]:
model.invoke(messages)

AIMessage(content='Ciao!', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='run--763eafda-2665-4d0b-b1a0-12474a88ff57-0', usage_metadata={'input_tokens': 10, 'output_tokens': 36, 'total_tokens': 46, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 34}})

In [8]:
system_template = "Translate the following from English into {language}"

prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", "{text}")]
)

In [9]:
prompt = prompt_template.invoke({"language": "Italian", "text": "hi!"})

In [12]:
prompt.to_messages()

[SystemMessage(content='Translate the following from English into Italian', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='hi!', additional_kwargs={}, response_metadata={})]

In [11]:
model.invoke(prompt)

AIMessage(content='Ciao!', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='run--4f522f1a-9648-4e8e-b696-673878d71aa1-0', usage_metadata={'input_tokens': 10, 'output_tokens': 36, 'total_tokens': 46, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 34}})

In [13]:
joke_prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content="You are a world class comedian."),
    ("human", "Tell me a joke about {topic}")
])

In [15]:
chain = joke_prompt | model

In [16]:
chain.invoke({"topic": "beets"})

AIMessage(content='Alright, alright, settle down folks! We\'re talking about beets tonight!\n\n(Adjusts mic, squints at the audience)\n\nBeets. The only vegetable that, after you eat it, makes you think you\'re either bleeding internally... or you\'ve accidentally solved a murder in your own toilet bowl.\n\nI mean, the first time it happened to me, I genuinely panicked! I looked down, saw that crimson swirl, and my mind went straight to, "This is it. This is how it ends. My colon has finally unionized and gone on strike."\n\nThen, about thirty seconds later, the little voice in my head goes, "Oh, right. Beets. The vegetable that makes your pee look like a crime scene and your poop look like a Jackson Pollock painting."\n\nIt\'s an emotional rollercoaster, people! From "I\'m dying!" to "I\'m an artist!" all thanks to a root vegetable. And they say *I\'m* dramatic!\n\nThank you, I\'ll be here all week! Try the pickled beets! ...Or don\'t. Your toilet\'s choice.', additional_kwargs={}, re

In [18]:
str_chain = chain | StrOutputParser()

In [19]:
str_chain.invoke({"topic": "cars"})



### Semantic Search

In [41]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores.utils import DistanceStrategy
from pinecone import Pinecone
from pinecone import (
    Metric,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    DeletionProtection,
    VectorType
)
import pprint

In [5]:
loader = PyPDFLoader('../data/paper.pdf')
docs = loader.load()

In [6]:
print(f"Number of documents: {len(docs)}")
print('=' * 20)
pprint.pp(f"{docs[0].page_content[:200]}\n")
print("=" * 20)
print(docs[0].metadata)

Number of documents: 12
('Less is More: Recursive Reasoning with Tiny Networks\n'
 'Alexia Jolicoeur-Martineau\n'
 'Samsung SAIL Montr´eal\n'
 'alexia.j@samsung.com\n'
 'Abstract\n'
 'Hierarchical Reasoning Model (HRM) is a\n'
 'novel approach using two sm\n')
{'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:)', 'creationdate': '', 'author': 'Alexia Jolicoeur-Martineau', 'doi': 'https://doi.org/10.48550/arXiv.2510.04871', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Less is More: Recursive Reasoning with Tiny Networks', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2510.04871v1', 'source': '../data/paper.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}


In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

In [8]:
split_docs = text_splitter.split_documents(docs)

In [9]:
print(f"Number of documents: {len(split_docs)}")
print('=' * 20)
pprint.pp(f"{split_docs[-1].page_content[:200]}\n")
print("=" * 20)
print(split_docs[-1].metadata)

Number of documents: 61
('3 3 3 6 5 6 6 4\n'
 '7 5 6 3 3 6 6\n'
 '4 3 4 8 3 6 6 4\n'
 'Tokenizedz L (denotedzin TRM)\n'
 'Figure 6.This Sudoku-Extreme example shows an input, ex-\n'
 'pected output, and the tokenized zH and zL (after reversing\n'
 'the emb\n')
{'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:)', 'creationdate': '', 'author': 'Alexia Jolicoeur-Martineau', 'doi': 'https://doi.org/10.48550/arXiv.2510.04871', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'Less is More: Recursive Reasoning with Tiny Networks', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2510.04871v1', 'source': '../data/paper.pdf', 'total_pages': 12, 'page': 11, 'page_label': '12', 'start_index': 2382}


In [10]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [11]:
vector_1 = embeddings.embed_query(split_docs[0].page_content)
vector_2 = embeddings.embed_query(split_docs[1].page_content)

In [12]:
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:5])

Generated vectors of length 768

[-0.0119935879483819, 0.023104334250092506, -0.024577949196100235, 0.03807579725980759, -0.03337205573916435]


In [30]:
pc = Pinecone(
    api_key=os.getenv('PINECONE_API_KEY')
)

In [31]:
index_name='playground'

if not pc.has_index(index_name):
    print("Creating index...")
    idx_model = pc.create_index(
        name=index_name,
        dimension=len(vector_1),
        metric=Metric.COSINE,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        ),
        deletion_protection=DeletionProtection.DISABLED,
        vector_type=VectorType.DENSE,
        tags={
            "env": "test"
        }
    )
else:
    print("Index already exists, skipping creation...")
    idx_model = pc.describe_index(index_name)

Creating index...


In [33]:
index = pc.Index(host=idx_model.host)

In [58]:
namespace = "langchain-course"

In [69]:
vector_store = PineconeVectorStore(
    embedding=embeddings,
    index=index,
    namespace=namespace,
    distance_strategy=DistanceStrategy.COSINE
)

In [40]:
ids = vector_store.add_documents(documents=split_docs)

In [42]:
with open('./output/ids.txt', 'w') as f:
    for _id in ids:
        f.write(f"{_id}\n")

In [None]:
results = vector_store.similarity_search(
    "Test accuracy on ARC-AGI-1 and ARC-AGI-2",
    k=2,
    namespace=namespace
)

In [56]:
print(results[0].page_content[:100])

accuracy on ARC-AGI-1, and 7.8% accuracy on ARC-
AGI-2 with 7M parameters. This is significantly hig


In [60]:
results = vector_store.similarity_search_with_score(
    "Test accuracy on ARC-AGI-1 and ARC-AGI-2",
    k=2,
    namespace=namespace
)
doc, score = results[0]
print(f"Score: {score}\n")
print(doc.page_content[:100])

Score: 0.433127433

accuracy on ARC-AGI-1, and 7.8% accuracy on ARC-
AGI-2 with 7M parameters. This is significantly hig


In [64]:
q_vector = embeddings.embed_query("Test accuracy on ARC-AGI-1 and ARC-AGI-2")

results = vector_store.similarity_search_by_vector(
    q_vector,
    k=2,
    namespace=namespace
)
print(results[0].page_content[:100])

accuracy on ARC-AGI-1, and 7.8% accuracy on ARC-
AGI-2 with 7M parameters. This is significantly hig


Create a Retriever using Pinecone as the vector store.

In [65]:
from langchain_core.documents import Document
from langchain_core.runnables import chain

In [89]:
@chain
def retriever(fields) -> list[Document]:
    # Check that query is provided
    if not fields.get('query'):
        raise ValueError("Query must be provided")
    # Use namespace if provided
    if fields.get('namespace'):
        return vector_store.similarity_search(
            fields.get('query'),
            k=1,
            namespace=fields.get('namespace')
        )
    # Default to no namespace
    return vector_store.similarity_search(
        fields.get('query'),
        k=1,
    )

In [90]:
retriever.batch(
    [
        {
            'query': q,
            'namespace': namespace
        } for q in [
            "Test accuracy on ARC-AGI-1 and ARC-AGI-2",
            "What are the four learnable components from HRM",
        ]
    ]
)

[[Document(id='f27d8b43-2277-4074-b5bd-624c89055950', metadata={'arxivid': 'https://arxiv.org/abs/2510.04871v1', 'author': 'Alexia Jolicoeur-Martineau', 'creationdate': '', 'creator': 'arXiv GenPDF (tex2pdf:)', 'doi': 'https://doi.org/10.48550/arXiv.2510.04871', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'page': 7.0, 'page_label': '8', 'producer': 'pikepdf 8.15.1', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'source': '../data/paper.pdf', 'start_index': 2360.0, 'title': 'Less is More: Recursive Reasoning with Tiny Networks', 'total_pages': 12.0, 'trapped': '/False'}, page_content='accuracy on ARC-AGI-1, and 7.8% accuracy on ARC-\nAGI-2 with 7M parameters. This is significantly higher\nthan the 74.5%, 40.3%, and 5.0% obtained by HRM us-\ning 4 times the number of parameters (27M).\nTable 4.% Test accuracy on Puzzle Benchmarks (Sudoku-\nExtreme and Maze-Hard)\nMethod # Params Sudoku Maze\nChain-of-tho

In [92]:
v_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 1,
        'namespace': namespace
    },
)

In [93]:
v_retriever.batch(
    [
        "Test accuracy on ARC-AGI-1 and ARC-AGI-2",
        "What are the four learnable components from HRM",
    ]
)

[[Document(id='f27d8b43-2277-4074-b5bd-624c89055950', metadata={'arxivid': 'https://arxiv.org/abs/2510.04871v1', 'author': 'Alexia Jolicoeur-Martineau', 'creationdate': '', 'creator': 'arXiv GenPDF (tex2pdf:)', 'doi': 'https://doi.org/10.48550/arXiv.2510.04871', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'page': 7.0, 'page_label': '8', 'producer': 'pikepdf 8.15.1', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'source': '../data/paper.pdf', 'start_index': 2360.0, 'title': 'Less is More: Recursive Reasoning with Tiny Networks', 'total_pages': 12.0, 'trapped': '/False'}, page_content='accuracy on ARC-AGI-1, and 7.8% accuracy on ARC-\nAGI-2 with 7M parameters. This is significantly higher\nthan the 74.5%, 40.3%, and 5.0% obtained by HRM us-\ning 4 times the number of parameters (27M).\nTable 4.% Test accuracy on Puzzle Benchmarks (Sudoku-\nExtreme and Maze-Hard)\nMethod # Params Sudoku Maze\nChain-of-tho