A "token" is typically a word, part of a word, or even a single character, depending on the encoding used.

Text is split into tokens before the model sees it: every word, punctuation mark, and sometimes even part of a word becomes one or more tokens. These tokens are what the model actually processes, and each model has a limit on how many tokens it can handle in one request.

In [4]:
import os
# NOTE(review): "sk-" looks like a truncated placeholder, not a usable key.
# Supply the real key at runtime and never commit it to the notebook.
os.environ["OPENAI_API_KEY"] = "sk-"

In [63]:
from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI


# Addition tool exposed to the model through the @tool decorator.
# NOTE(review): @tool presumably uses the docstring as the tool description
# sent to the model, so treat the docstring text as runtime behavior - confirm
# before rewording it.
@tool
def add(a: int, b: int) -> int:
    """Adds a and b."""
    return a + b


# Multiplication tool exposed to the model through the @tool decorator.
# NOTE(review): @tool presumably uses the docstring as the tool description
# sent to the model, so treat the docstring text as runtime behavior - confirm
# before rewording it.
@tool
def multiply(a: int, b: int) -> int:
    """Multiplies a and b."""
    return a * b


# The tools the model is allowed to request.
tools = [add, multiply]


# Chat model used for the tool-calling demo, created with temperature=0.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Model variant that has the tool list bound to it; used as the last chain step.
llm_with_tools = llm.bind_tools(tools)

# Few-shot demonstration of a two-step tool-calling conversation.
# The tool names and argument keys used here must match the bound tools
# exactly (`multiply(a, b)` and `add(a, b)`), and the tool results must be
# arithmetically correct; otherwise the examples teach the model a wrong
# calling convention or wrong maths.
examples = [
    HumanMessage(
        "What's the product of 317253 and 128472 plus four", name="example_user"
    ),
    # Step 1: the assistant asks for the multiplication.
    AIMessage(
        "",
        name="example_assistant",
        tool_calls=[
            {"name": "multiply", "args": {"a": 317253, "b": 128472}, "id": "1"}
        ],
    ),
    # 317253 * 128472 == 40758127416
    ToolMessage("40758127416", tool_call_id="1"),
    # Step 2: the assistant feeds the product into the addition.
    AIMessage(
        "",
        name="example_assistant",
        tool_calls=[{"name": "add", "args": {"a": 40758127416, "b": 4}, "id": "2"}],
    ),
    # 40758127416 + 4 == 40758127420
    ToolMessage("40758127420", tool_call_id="2"),
    # Final natural-language answer built from the tool results.
    AIMessage(
        "The product of 317253 and 128472 plus four is 40758127420",
        name="example_assistant",
    ),
]

# System prompt: pushes the model to rely on the tools and to imitate the
# example tool usage that follows it in the prompt.
system = """You are bad at math but are an expert at using a calculator. 

Use past tool usage as an example of how to correctly use the tools."""
# Prompt layout: system instructions, then the few-shot example messages,
# then the user's question substituted into {query}.
few_shot_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        *examples,
        ("human", "{query}"),
    ]
)

# The chain input is passed through unchanged as the "query" prompt variable.
chain = {"query": RunnablePassthrough()} | few_shot_prompt | llm_with_tools
# Only the requested tool calls are displayed, not the full AI message.
# NOTE(review): only `add` and `multiply` are bound, so "minus 20" has no
# matching tool; the recorded output shows the model falling back to
# add(a=-20, b=0).
chain.invoke("Whats 119 times 8 minus 20").tool_calls

[{'name': 'multiply',
  'args': {'a': 119, 'b': 8},
  'id': 'call_xgNN2zPIgY9iFhj8vI2uKlWM',
  'type': 'tool_call'},
 {'name': 'add',
  'args': {'a': -20, 'b': 0},
  'id': 'call_NLoPj1PUUFqctwuJ65KyVqDa',
  'type': 'tool_call'}]

In [64]:
from langchain_core.messages import (
    AIMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
    trim_messages,
)
from langchain_openai import ChatOpenAI

In [65]:
# Sample conversation used by the trimming cells: one system message followed
# by alternating human / AI turns, ending on a human message.
messages = [
    SystemMessage("you're a good assistant, you always respond with a joke."),
    HumanMessage("i wonder why it's called langchain"),
    AIMessage(
        'Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it!'
    ),
    HumanMessage("and who is harrison chasing anyways"),
    AIMessage(
        "Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office!"
    ),
    HumanMessage("what do you call a speechless parrot"),
]

In [66]:
# Trim the conversation to a 45-token budget, keeping the most recent turns.
trim_messages(
    messages,  # The conversation history to trim.
    strategy="last",  # Keep the most recent messages that fit within max_tokens.
    token_counter=ChatOpenAI(model="gpt-4o"),  # A chat model can be passed as the counter; its message token counting is used.
    max_tokens=45,  # Upper bound on the token count of the trimmed result (tokens, not words or characters).
    start_on="human",  # The kept history (after any retained system message) must begin with a human message.
    end_on=("human", "tool"),  # The kept history must end on a human or tool message; anything after the last such message is dropped.
    include_system=True,  # Keep the leading SystemMessage if there is one.
    allow_partial=True,  # True allows part of a message to be kept when the whole message does not fit; False keeps only whole messages.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [7]:
# Same trim with a larger 75-token budget; the recorded output now also keeps
# the previous human / AI exchange.
trim_messages(
    messages,  # The conversation history to trim.
    strategy="last",  # Keep the most recent messages that fit within max_tokens.
    token_counter=ChatOpenAI(model="gpt-4o"),  # A chat model can be passed as the counter; its message token counting is used.
    max_tokens=75,  # Upper bound on the token count of the trimmed result (tokens, not words or characters).
    start_on="human",  # The kept history (after any retained system message) must begin with a human message.
    end_on=("human", "tool"),  # The kept history must end on a human or tool message; anything after the last such message is dropped.
    include_system=True,  # Keep the leading SystemMessage if there is one.
    allow_partial=True,  # True allows part of a message to be kept when the whole message does not fit; False keeps only whole messages.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='and who is harrison chasing anyways', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office!", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [8]:
# Same 75-token trim with `strategy` left out, to show the default behavior.
trim_messages(
    messages,  # The conversation history to trim.
    # `strategy` is omitted on purpose: the recorded output matches the
    # strategy="last" run, i.e. "last" is what applies by default.
    token_counter=ChatOpenAI(model="gpt-4o"),  # A chat model can be passed as the counter; its message token counting is used.
    max_tokens=75,  # Upper bound on the token count of the trimmed result (tokens, not words or characters).
    start_on="human",  # The kept history (after any retained system message) must begin with a human message.
    end_on=("human", "tool"),  # The kept history must end on a human or tool message; anything after the last such message is dropped.
    include_system=True,  # Keep the leading SystemMessage if there is one.
    allow_partial=True,  # True allows part of a message to be kept when the whole message does not fit; False keeps only whole messages.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='and who is harrison chasing anyways', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office!", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [13]:
# Token-usage probe: send the joke text as a prompt and inspect
# response_metadata; token_usage.prompt_tokens is the cost of the prompt.
llm = ChatOpenAI(model="gpt-4o")
llm.invoke("""Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it!""").response_metadata

{'token_usage': {'completion_tokens': 83,
  'prompt_tokens': 32,
  'total_tokens': 115,
  'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0},
  'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}},
 'model_name': 'gpt-4o-2024-08-06',
 'system_fingerprint': 'fp_a20a4ee344',
 'finish_reason': 'stop',
 'logprobs': None}

In [16]:
# Same probe with " man" appended to the prompt; the recorded prompt_tokens
# rises from 32 to 33.
llm = ChatOpenAI(model="gpt-4o")
llm.invoke("""Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it! man""").response_metadata

{'token_usage': {'completion_tokens': 79,
  'prompt_tokens': 33,
  'total_tokens': 112,
  'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0},
  'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}},
 'model_name': 'gpt-4o-2024-08-06',
 'system_fingerprint': 'fp_a7d06e42a7',
 'finish_reason': 'stop',
 'logprobs': None}

In [15]:
# Same probe with " man " (note the trailing space); the recorded
# prompt_tokens is 34, one more than without the trailing space.
llm = ChatOpenAI(model="gpt-4o")
llm.invoke("""Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it! man """).response_metadata

{'token_usage': {'completion_tokens': 92,
  'prompt_tokens': 34,
  'total_tokens': 126,
  'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0},
  'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}},
 'model_name': 'gpt-4o-2024-08-06',
 'system_fingerprint': 'fp_a7d06e42a7',
 'finish_reason': 'stop',
 'logprobs': None}

In [17]:
# Same probe with a quoted word added after " man "; the recorded
# prompt_tokens is 37, so the quoted word and its quotes cost several tokens.
llm = ChatOpenAI(model="gpt-4o")
llm.invoke("""Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it! man "women" """).response_metadata

{'token_usage': {'completion_tokens': 92,
  'prompt_tokens': 37,
  'total_tokens': 129,
  'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0},
  'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}},
 'model_name': 'gpt-4o-2024-08-06',
 'system_fingerprint': 'fp_a7d06e42a7',
 'finish_reason': 'stop',
 'logprobs': None}

In [19]:
# Same conversation as before, but with a much longer second AI reply so that
# the token budget needed to keep that exchange goes up.
messages = [
    SystemMessage("you're a good assistant, you always respond with a joke."),
    HumanMessage("i wonder why it's called langchain"),
    AIMessage(
        'Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it!'
    ),
    HumanMessage("and who is harrison chasing anyways"),
    AIMessage(
        "Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office! and some times he also chages coffe even in the house as he is a coffe lover and might be he wanna be born as a cofee bean in the next birth, incase if he has got next birth byy god "
    ),
    HumanMessage("what do you call a speechless parrot"),
]

In [23]:
# 75-token trim of the longer conversation; in the recorded output the long AI
# reply no longer fits, so only the system message and the last human message
# remain.
trim_messages(
    messages,  # The conversation history to trim.
    strategy="last",  # Keep the most recent messages that fit within max_tokens.
    token_counter=ChatOpenAI(model="gpt-4o"),  # A chat model can be passed as the counter; its message token counting is used.
    max_tokens=75,  # Upper bound on the token count of the trimmed result (tokens, not words or characters).
    start_on="human",  # The kept history (after any retained system message) must begin with a human message.
    end_on=("human", "tool"),  # The kept history must end on a human or tool message; anything after the last such message is dropped.
    include_system=True,  # Keep the leading SystemMessage if there is one.
    allow_partial=True,  # True allows part of a message to be kept when the whole message does not fit; False keeps only whole messages.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [24]:
# 120-token trim of the longer conversation; in the recorded output the budget
# is now large enough to keep the long AI reply and the human turn before it.
trim_messages(
    messages,  # The conversation history to trim.
    strategy="last",  # Keep the most recent messages that fit within max_tokens.
    token_counter=ChatOpenAI(model="gpt-4o"),  # A chat model can be passed as the counter; its message token counting is used.
    max_tokens=120,  # Upper bound on the token count of the trimmed result (tokens, not words or characters).
    start_on="human",  # The kept history (after any retained system message) must begin with a human message.
    end_on=("human", "tool"),  # The kept history must end on a human or tool message; anything after the last such message is dropped.
    include_system=True,  # Keep the leading SystemMessage if there is one.
    allow_partial=True,  # True allows part of a message to be kept when the whole message does not fit; False keeps only whole messages.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='and who is harrison chasing anyways', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office! and some times he also chages coffe even in the house as he is a coffe lover and might be he wanna be born as a cofee bean in the next birth, incase if he has got next birth byy god ", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [28]:
# Trim by message count instead of by tokens.
trim_messages(
    messages,
    strategy="last",
    token_counter=len,  # len() of a message list: every message counts as one "token".
    max_tokens=2,  # With len as the counter this is a cap of 2 messages.
    start_on="human",
    end_on=("human", "tool"),
    include_system=True,  # The system message is kept, as the recorded output shows.
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [29]:
# Trim by message count with a cap equal to the conversation length, so the
# recorded output keeps all six messages.
trim_messages(
    messages,
    strategy="last",
    token_counter=len,  # len() of a message list: every message counts as one "token".
    max_tokens=6,  # With len as the counter this is a cap of 6 messages.
    start_on="human",
    end_on=("human", "tool"),
    include_system=True,
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content="i wonder why it's called langchain", additional_kwargs={}, response_metadata={}),
 AIMessage(content='Well, I guess they thought "WordRope" and "SentenceString" just didn\'t have the same ring to it!', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='and who is harrison chasing anyways', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Hmmm let me think.\n\nWhy, he's probably chasing after the last cup of coffee in the office! and some times he also chages coffe even in the house as he is a coffe lover and might be he wanna be born as a cofee bean in the next birth, incase if he has got next birth byy god ", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

In [31]:
# Minimal call: only a budget, a strategy and a counter are given.
# include_system, start_on and end_on are not passed, and the recorded output
# keeps just the final human message (the system message is not retained).
trim_messages(
    messages,
    max_tokens=75,
    strategy="last",
    token_counter=ChatOpenAI(model="gpt-4o"),
)

[HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

#Custom Token Counter !!

In [35]:
from typing import List

import tiktoken
from langchain_core.messages import BaseMessage, ToolMessage


def str_token_counter(text: str) -> int:
    """Return how many ``o200k_base`` tokens ``text`` encodes to."""
    token_ids = tiktoken.get_encoding("o200k_base").encode(text)
    return len(token_ids)


def tiktoken_counter(messages: List[BaseMessage]) -> int:
    """Sum the token counts of all ``messages`` using ``str_token_counter``.

    Each message contributes the tokens of its role label ("user",
    "assistant", "tool" or "system"), the tokens of its content, and the
    tokens of its name when the name is truthy.

    NOTE(review): no fixed per-message overhead is added, so the total is
    presumably lower than what a chat API would report - confirm that this
    is intended.

    Raises:
        ValueError: If a message is not a human, AI, tool or system message.
    """
    # Checked top to bottom, in the same order an if/elif chain would use.
    role_by_type = (
        (HumanMessage, "user"),
        (AIMessage, "assistant"),
        (ToolMessage, "tool"),
        (SystemMessage, "system"),
    )
    total = 0
    for message in messages:
        for message_type, role in role_by_type:
            if isinstance(message, message_type):
                break
        else:
            # No known message type matched.
            raise ValueError(f"Unsupported messages type {message.__class__}")
        total += str_token_counter(role) + str_token_counter(message.content)
        if message.name:
            total += str_token_counter(message.name)
    return total


# Same 45-token trim as earlier, but counting tokens with the custom
# tiktoken-based function instead of a chat model.
trim_messages(
    messages,
    token_counter=tiktoken_counter,  # Called with a list of messages; returns their total token count.
    strategy="last",
    max_tokens=45,
    start_on="human",
    end_on=("human", "tool"),
    include_system=True,
)

[SystemMessage(content="you're a good assistant, you always respond with a joke.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='what do you call a speechless parrot', additional_kwargs={}, response_metadata={})]

https://api.python.langchain.com/en/latest/messages/langchain_core.messages.ai.AIMessage.html

https://api.python.langchain.com/en/latest/messages/langchain_core.messages.human.HumanMessage.html

In [38]:
from langchain_core.messages import filter_messages

In [36]:
# Conversation used by the filtering cells; every message has an id and the
# human / AI messages also have a name, so they can be selected by either.
messages = [
    SystemMessage("you are a good assistant who can answer funny on sports", id="1"),
    HumanMessage("Why do people get addicted to badminton", id="2", name="praveen"),
    AIMessage("People when they get to enjoy the game then fall in love with that sport, ideally in the case of badminton, they get smash so, thye love badminton", id="3", name="chatgpt"),
    HumanMessage("you sound funny but, you are correct", id="4", name="praveen"),
    AIMessage("thanks for the appretiation", id="5", name="chatgpt"),
]

In [39]:
# Keep only the human messages.
filter_messages(messages, include_types="human")

[HumanMessage(content='Why do people get addicted to badminton', additional_kwargs={}, response_metadata={}, name='praveen', id='2'),
 HumanMessage(content='you sound funny but, you are correct', additional_kwargs={}, response_metadata={}, name='praveen', id='4')]

In [40]:
# Drop every message whose name is in the list. No message here is named
# "chinna", so that entry has no effect on this conversation.
filter_messages(messages, exclude_names=["praveen", "chinna"])

[SystemMessage(content='you are a good assistant who can answer funny on sports', additional_kwargs={}, response_metadata={}, id='1'),
 AIMessage(content='People when they get to enjoy the game then fall in love with that sport, ideally in the case of badminton, they get smash so, thye love badminton', additional_kwargs={}, response_metadata={}, name='chatgpt', id='3'),
 AIMessage(content='thanks for the appretiation', additional_kwargs={}, response_metadata={}, name='chatgpt', id='5')]

In [43]:
# Called without a message list, filter_messages gives back an object that is
# piped into the model here, so the filtering runs as the first chain step.
filter_ = filter_messages(exclude_names=["praveen", "chinna"])
# NOTE(review): `llm` is whichever model was assigned most recently. The
# recorded output reports gpt-4o-mini, while the nearest preceding assignment
# in this file uses gpt-4o - re-run the cells in order to confirm.
chain = filter_ | llm
chain.invoke(messages)

AIMessage(content="You're welcome! If you have any more questions or need a good laugh about sports, just let me know! Why did the football team go to the bank? Because they wanted to get their quarterback! 🏈😄", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 45, 'prompt_tokens': 72, 'total_tokens': 117, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_7693ae462b', 'finish_reason': 'stop', 'logprobs': None}, id='run-fd01d332-bc54-4392-a6bd-688830606ab0-0', usage_metadata={'input_tokens': 72, 'output_tokens': 45, 'total_tokens': 117, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [58]:
from langchain_core.messages import merge_message_runs

# Two consecutive system messages followed by two consecutive human messages.
messages = [
    SystemMessage("you're a good assistant with lot of humour."),
    SystemMessage("you always respond very funny."),
    HumanMessage("i wonder why people like sports"),
    HumanMessage("and why is badminton very addictive")
]

# Collapse each run of same-type messages into one message; the recorded
# output shows the contents joined with a newline.
merged = merge_message_runs(messages)

In [59]:
# Display the merged message list.
merged

[SystemMessage(content="you're a good assistant with lot of humour.\nyou always respond very funny.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='i wonder why people like sports\nand why is badminton very addictive', additional_kwargs={}, response_metadata={})]

In [60]:
# Send the merged messages (one system, one human) to the model.
llm.invoke(merged)

AIMessage(content='Ah, the age-old question: why do people like sports? It’s simple! Sports are like reality TV, but with fewer dramatic confessions and more sweat. People love the thrill, the competition, and the chance to yell at the TV as if the players can hear them. Plus, where else can you wear stretchy pants in public and call it “athletic wear”?\n\nAs for badminton being addictive, it’s probably because it’s the only sport where you can feel like a ninja while wielding a racket! You get to smash things (the shuttlecock, not your neighbor’s window), and it’s a great excuse to run around like a kid again. Plus, who doesn’t love a sport where you can say “I’m just going to play a quick game of badminton” and then end up playing for hours, completely forgetting about adult responsibilities? It’s like a mini-vacation from reality, with a side of cardio! 🏸😄', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 195, 'prompt_tokens': 39, 'total_