### Q2. Explore the core components of LangChain (LLMs, Prompt Templates, Chains, etc). Experiment with each and describe how they interact

In [6]:
%pip install -U langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.76 (from langchain-openai)
  Downloading langchain_core-0.3.76-py3-none-any.whl.metadata (3.7 kB)
Downloading langchain_openai-0.3.33-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.76-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-core, langchain-openai
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.75
    Uninstalling langchain-core-0.3.75:
      Successfully uninstalled langchain-core-0.3.75
Successfully installed langchain-core-0.3.76 langchain-openai-0.3.33


**1. LLM**

llm_cold = ChatOpenAI(model="gpt-4o-mini", temperature=0): This creates an instance of the ChatOpenAI model.
model="gpt-4o-mini": Specifies the particular OpenAI model to use

In [7]:
from google.colab import userdata
from langchain_openai import ChatOpenAI

import os
# Get the API key from Colab's secrets manager
os.environ["OPENAI_API_KEY"] = userdata.get('DC')

llm_cold = ChatOpenAI(model="gpt-4o-mini", temperature=0) ## low randomness Deterministic
llm_warm = ChatOpenAI(model="gpt-4o-mini", temperature=0.8) # more creative
print(llm_cold.invoke("In one line, what is LangChain?")) # Changed llm to llm_cold to use one of the created models
print(llm_warm.invoke("In one line, what is LangChain?"))

content='LangChain is a framework designed to facilitate the development of applications using language models by providing tools for chaining together various components and functionalities.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 16, 'total_tokens': 42, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CFv8khilPp0NurqMVBB7Z0GGcynmv', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--c465ee92-5373-4e05-ac84-a587acf76e6c-0' usage_metadata={'input_tokens': 16, 'output_tokens': 26, 'total_tokens': 42, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
content='LangChain is a framework

**Prompt Templates,**

In [8]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are concise."),
    ("human", "Summarize in 12 words: {text}")
])

print(prompt.format(text="LCEL lets you pipe prompt→model→parser cleanly."))

System: You are concise.
Human: Summarize in 12 words: LCEL lets you pipe prompt→model→parser cleanly.


**### Chains**

In [11]:
import time
from langchain_core.output_parsers import StrOutputParser

cold_chain = prompt | llm_cold | StrOutputParser()
warm_chain = prompt | llm_warm | StrOutputParser()

print(cold_chain.invoke({"text": "LangChain enables modular LLM apps."}))
time.sleep(21)
print(warm_chain.invoke({"text": "LangChain enables modular LLM apps."}))
time.sleep(21)

# Batch: many inputs at once (must be a list)
outs = warm_chain.batch([
    {"text": "LCEL composes steps."},
    {"text": "Prompt → Model → Parser."},
])
print(outs)

# Stream: iterate chunks as they arrive
for chunk in warm_chain.stream({"text": "Stream this reply in parts."}):
    print(chunk, end="")
print()


LangChain facilitates the development of modular applications using large language models.
LangChain facilitates the creation of modular applications using language models efficiently.
['LCEL outlines a structured approach for composing and organizing steps efficiently.', 'Input prompt guides model processing, output parsed for structured interpretation.']
Sure! Please provide the reply you want summarized.


**Tools (function calling)**

Give the model callable utilities; it decides when to use them.**

In [12]:
from langchain_core.tools import tool
from datetime import datetime
import pytz

@tool
def time_in_tz(tz: str) -> str:
    """Return current time in IANA timezone (e.g., 'Europe/Paris')."""
    return datetime.now(pytz.timezone(tz)).isoformat()

tool_llm = llm_cold.bind_tools([time_in_tz])
print(tool_llm.invoke("What's the exact time in Europe/Paris?"))


content='' additional_kwargs={'tool_calls': [{'id': 'call_Wq4o5G13W09oUHahdI5isaVa', 'function': {'arguments': '{"tz":"Europe/Paris"}', 'name': 'time_in_tz'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 64, 'total_tokens': 82, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CFvDgYAP4ndWcEnN8Xz9t5RiTlzd2', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--7d75f98c-4941-468d-b684-9a9bb117140a-0' tool_calls=[{'name': 'time_in_tz', 'args': {'tz': 'Europe/Paris'}, 'id': 'call_Wq4o5G13W09oUHahdI5isaVa', 'type': 'tool_call'}] usage_metadata={'input_tokens': 64, 'output_tokens': 18, 'total_tokens': 82, 'input_token_details': {'audio': 0, 'c

***Output Parser structured results***

In [13]:
!pip -q install pydantic

from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough # Import RunnablePassthrough

class CityPlan(BaseModel):
    city: str = Field(..., description="destination")
    attractions: list[str] = Field(..., description="top 3")

parser = JsonOutputParser(pydantic_object=CityPlan)

json_prompt = ChatPromptTemplate.from_messages([
    ("system", "Return valid JSON: {format_instructions}"), # Changed from format to format_instructions
    ("human", "Plan a short trip to {place}.")
])

json_chain = (
    RunnablePassthrough.assign(format_instructions=lambda x: parser.get_format_instructions()) # Use RunnablePassthrough
    | json_prompt | llm_cold | parser # Assuming llm_cold is defined
)
print(json_chain.invoke({"place": "Kyoto"}))

{'city': 'Kyoto', 'attractions': ['Kinkaku-ji (Golden Pavilion)', 'Fushimi Inari Taisha', 'Arashiyama Bamboo Grove']}


**###Q3. Explore following optimization techniques that can be performed while training and document your findings with a basic example code snippets:**


1.  Tensor Creation (CPU vs GPU)
2.  Weight Initialization
3.  Activation Checkpointing
4. Gradient Accumulation
5. Mixed Precision Training**

In [4]:
import time, math
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint_sequential
from contextlib import nullcontext

torch.manual_seed(42)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using device:', DEVICE)
if DEVICE == 'cuda':
    print('The CUDA capability:', torch.cuda.get_device_name(0))



Using device: cuda
The CUDA capability: Tesla T4


In [6]:
def bench_tensor_ops(n=4096, repeats=5):
    times = {}
    for dev in ['cpu'] + (['cuda'] if torch.cuda.is_available() else []):
        # warmup
        a = torch.randn((n,n), device=dev)
        b = torch.randn((n,n), device=dev)
        c = a @ b
        if dev=='cuda': torch.cuda.synchronize()
        # timed
        t_acc = 0.0
        for _ in range(repeats):
            t0 = time.time()
            a = torch.randn((n,n), device=dev)
            b = torch.randn((n,n), device=dev)
            c = a @ b
            if dev=='cuda': torch.cuda.synchronize()
            t_acc += time.time() - t0
        times[dev] = t_acc / repeats
    return times

times = bench_tensor_ops(n=1024, repeats=3)
print('Avg. Time: for 1024x1024 matmul):', times)

Avg. Time: for 1024x1024 matmul): {'cpu': 0.0417013963063558, 'cuda': 0.0009594758351643881}


Weight Initialization

In [11]:
import torch, torch.nn as nn, torch.nn.functional as F, torch.nn.init as init
import numpy as np

torch.manual_seed(0); np.random.seed(0)

class TinyMLP(nn.Module):
    def __init__(self, d_in=784, d_hidden=512, d_out=10, depth=4):
        super().__init__()
        dims = [d_in] + [d_hidden]*depth + [d_out]
        self.layers = nn.ModuleList([nn.Linear(dims[i], dims[i+1]) for i in range(len(dims)-1)])
    def forward(self, x):
        for lin in self.layers[:-1]: x = F.relu(lin(x))
        return self.layers[-1](x)

def init_weights(model, mode):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            if mode == "xavier": init.xavier_uniform_(m.weight)
            elif mode == "he":   init.kaiming_uniform_(m.weight, nonlinearity="relu")
            if m.bias is not None: init.zeros_(m.bias)

def activation_stats(model, n_samples=256, d_in=784):
    x = torch.randn(n_samples, d_in); out = []
    with torch.no_grad():
        for lin in model.layers[:-1]:
            x = F.relu(lin(x))
            a = x.detach().cpu().numpy()
            out.append((a.mean(), a.std()))
    return out

def print_block(title, stats):
    print(f"=== init: {title} ===")
    for i,(m,s) in enumerate(stats, 1):
        print(f"[ L{i:02d} ] μ={m: .4f} | σ={s: .4f}")
    print("-"*32)

for mode in ["default","xavier","he"]:
    mlp = TinyMLP()
    if mode != "default": init_weights(mlp, mode)
    stats = activation_stats(mlp)
    print_block(mode, stats)


=== init: default ===
[ L01 ] μ= 0.2292 | σ= 0.3372
[ L02 ] μ= 0.0963 | σ= 0.1396
[ L03 ] μ= 0.0418 | σ= 0.0605
[ L04 ] μ= 0.0207 | σ= 0.0299
--------------------------------
=== init: xavier ===
[ L01 ] μ= 0.4379 | σ= 0.6409
[ L02 ] μ= 0.2889 | σ= 0.4427
[ L03 ] μ= 0.2167 | σ= 0.3157
[ L04 ] μ= 0.1546 | σ= 0.2207
--------------------------------
=== init: he ===
[ L01 ] μ= 0.5619 | σ= 0.8228
[ L02 ] μ= 0.5702 | σ= 0.8382
[ L03 ] μ= 0.5666 | σ= 0.8259
[ L04 ] μ= 0.5153 | σ= 0.7817
--------------------------------


Activation Checkpointing

In [12]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint_sequential

torch.manual_seed(0)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def sync():
    if DEVICE == "cuda": torch.cuda.synchronize()

class DeepMLP(nn.Module):
    def __init__(self, depth=16, width=1024, d_in=1024, d_out=10):
        super().__init__()
        layers = [nn.Linear(d_in, width), nn.ReLU()]
        for _ in range(depth - 2):
            layers += [nn.Linear(width, width), nn.ReLU()]
        layers += [nn.Linear(width, d_out)]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        return self.seq(x)

def forward_ckpt(seq, x, segments):
    x = x.requires_grad_(True)
    return checkpoint_sequential(seq, segments, x, use_reentrant=False)

def train_step(model, x, y, segments=None, lr=1e-3):
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    opt.zero_grad(set_to_none=True)
    out = forward_ckpt(model.seq, x, segments) if segments else model(x)
    loss = F.cross_entropy(out, y)
    loss.backward()
    opt.step()
    return float(loss.detach())

def bench(model, x, y, segments=None):
    _ = train_step(model, x, y, segments)  # warmup
    sync()
    if DEVICE == "cuda": torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    loss = train_step(model, x, y, segments)
    sync()
    t1 = time.time()
    mb = torch.cuda.max_memory_allocated() / 1e6 if DEVICE == "cuda" else float("nan")
    return loss, (t1 - t0), mb

N, D, C = 2048, 1024, 10
x = torch.randn(N, D, device=DEVICE)
y = torch.randint(0, C, (N,), device=DEVICE)

model_a = DeepMLP().to(DEVICE)
loss_a, time_a, mem_a = bench(model_a, x, y, segments=None)

model_b = DeepMLP().to(DEVICE)
loss_b, time_b, mem_b = bench(model_b, x, y, segments=8)

print(f"Device: {DEVICE}")
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'Mode':<22}{'Loss':>10}{'Time(s)':>12}{'Peak MB':>12}")
print("-" * 56)
print(f"{'No checkpointing':<22}{loss_a:>10.4f}{time_a:>12.3f}{mem_a:>12.0f}")
print(f"{'With checkpointing':<22}{loss_b:>10.4f}{time_b:>12.3f}{mem_b:>12.0f}")


Device: cuda
GPU: Tesla T4
Mode                        Loss     Time(s)     Peak MB
--------------------------------------------------------
No checkpointing          2.3014       0.081         341
With checkpointing        2.3014       0.083         475


Gradient Accumulation

In [7]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint_sequential

torch.manual_seed(0)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def sync():
    if DEVICE == "cuda": torch.cuda.synchronize()

class DeepMLP(nn.Module):
    def __init__(self, depth=16, width=1024, d_in=1024, d_out=10):
        super().__init__()
        layers = [nn.Linear(d_in, width), nn.ReLU()]
        for _ in range(depth - 2):
            layers += [nn.Linear(width, width), nn.ReLU()]
        layers += [nn.Linear(width, d_out)]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        return self.seq(x)

def forward_ckpt(seq, x, segments):
    x = x.requires_grad_(True)
    return checkpoint_sequential(seq, segments, x, use_reentrant=False)

def train_step(model, x, y, segments=None, lr=1e-3):
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    opt.zero_grad(set_to_none=True)
    out = forward_ckpt(model.seq, x, segments) if segments else model(x)
    loss = F.cross_entropy(out, y)
    loss.backward()
    opt.step()
    return float(loss.detach())

def bench(model, x, y, segments=None):
    _ = train_step(model, x, y, segments)  # warmup
    sync()
    if DEVICE == "cuda": torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    loss = train_step(model, x, y, segments)
    sync()
    t1 = time.time()
    mb = torch.cuda.max_memory_allocated() / 1e6 if DEVICE == "cuda" else float("nan")
    return loss, (t1 - t0), mb

N, D, C = 2048, 1024, 10
x = torch.randn(N, D, device=DEVICE)
y = torch.randint(0, C, (N,), device=DEVICE)

model_a = DeepMLP().to(DEVICE)
loss_a, time_a, mem_a = bench(model_a, x, y, segments=None)

model_b = DeepMLP().to(DEVICE)
loss_b, time_b, mem_b = bench(model_b, x, y, segments=8)

print(f"Device: {DEVICE}")
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"{'Mode':<22}{'Loss':>10}{'Time(s)':>12}{'Peak MB':>12}")
print("-" * 56)
print(f"{'No checkpointing':<22}{loss_a:>10.4f}{time_a:>12.3f}{mem_a:>12.0f}")
print(f"{'With checkpointing':<22}{loss_b:>10.4f}{time_b:>12.3f}{mem_b:>12.0f}")


Mixed Precision Training

In [13]:
# assumes TinyMLP and DEVICE are already defined above
import torch
import torch.nn as nn

model = TinyMLP().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

BATCH = 128
MICRO = 32
acc_steps = BATCH // MICRO  # 4 micro-batches per update

def grad_acc_step():
    model.train()
    optimizer.zero_grad(set_to_none=True)
    avg_loss = 0.0
    for _ in range(acc_steps):
        xb = torch.randn(MICRO, 784, device=DEVICE)
        yb = torch.randint(0, 10, (MICRO,), device=DEVICE)
        logits = model(xb)
        loss = criterion(logits, yb) / acc_steps
        loss.backward()
        avg_loss += float(loss.item())  # sum of scaled losses == average unscaled loss
    optimizer.step()
    return avg_loss

loss_acc = grad_acc_step()
print('Gradient accumulation step loss (avg):', loss_acc)


Gradient accumulation step loss (avg): 2.3042796850204468


In [14]:
# assumes TinyMLP and DEVICE are already defined above
import time
import torch
import torch.nn as nn

use_amp = (DEVICE == "cuda")
print(f"Device={DEVICE} | AMP={'on' if use_amp else 'off'}")

model = TinyMLP().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

def amp_step():
    optimizer.zero_grad(set_to_none=True)
    x = torch.randn(256, 784, device=DEVICE)
    y = torch.randint(0, 10, (256,), device=DEVICE)
    with torch.amp.autocast("cuda", enabled=use_amp):
        logits = model(x)
        loss = criterion(logits, y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return float(loss)

t0 = time.time()
loss = amp_step()
if DEVICE == "cuda":
    torch.cuda.synchronize()
t1 = time.time()

print(f"[AMP] loss={loss:.4f} | elapsed={t1 - t0:.4f}s")


Device=cuda | AMP=on
[AMP] loss=2.2958 | elapsed=0.1565s


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  return float(loss)


/diabetes (1).csv