https://medium.com/@sridevi.gogusetty/from-rag-to-tag-leveraging-the-power-of-table-augmented-generation-tag-a-leap-beyond-bc894b2d34b9

In [15]:
import pandas as pd
import lotus
from lotus.models import LM, SentenceTransformersRM, CrossEncoderReranker
from qdrant_client import QdrantClient
from lotus.vector_store import QdrantVS
from langchain_openai import ChatOpenAI
from lotus.vector_store import FaissVS
import os

In [16]:
# lm = LM(model='groq/llama-3.1-70b-versatile')
# lm = LM(model='openai/gpt-4.1-nano')
lm = LM(model="ollama/llama3.2")
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1")

# client = QdrantClient(url="http://localhost:6333")
# vs = QdrantVS(client)
vs = FaissVS()

2025-07-21 12:33:04,191 - INFO - Use pytorch device_name: cuda
2025-07-21 12:33:04,191 - INFO - Load pretrained SentenceTransformer: intfloat/e5-base-v2
2025-07-21 12:33:09,729 - INFO - Use pytorch device: cuda


In [17]:
lotus.settings.configure(lm=lm, rm=rm, reranker=reranker, vs=vs)

In [18]:
courses_data = {
    "Course Name": [
        "History of the Atlantic World",
        "Riemannian Geometry",
        "Operating Systems",
        "Food Science",
        "Compilers",
        "Intro to computer science",
    ]
}
skills_data = {"Skill": ["Math", "Computer Science"]}
courses_df = pd.DataFrame(courses_data)
skills_df = pd.DataFrame(skills_data)

In [19]:
from IPython.display import display, HTML
df = pd.DataFrame(courses_data)
user_instruction = "What is a similar course to {Course Name}. Be concise."
df = df.sem_map(user_instruction)
display(df)

Mapping: 100%|██████████ 6/6 LM calls [00:03<00:00,  1.72it/s]


Unnamed: 0,Course Name,_map
0,History of the Atlantic World,"A similar course to ""History of the Atlantic W..."
1,Riemannian Geometry,"A similar course to ""Riemannian Geometry"" coul..."
2,Operating Systems,"A similar course to ""Operating Systems"" could ..."
3,Food Science,"A similar course to ""Food Science"" could be ""N..."
4,Compilers,"A similar course to ""Compilers"" could be ""Algo..."
5,Intro to computer science,"A similar course to ""Intro to Computer Science..."


In [20]:
df = pd.DataFrame(courses_data)
user_instruction = "{Course Name} requires a lot of math"
df = df.sem_filter(user_instruction)
display(df)

Filtering: 100%|██████████ 6/6 LM calls [00:05<00:00,  1.17it/s]


Unnamed: 0,Course Name
1,Riemannian Geometry


In [21]:
df = pd.DataFrame(courses_data)
df = df.sem_agg("Summarize all {Course Name}")
display(df)

Aggregating: 100%|██████████ 1/1 LM calls [00:04<00:00,  4.16s/it]


Unnamed: 0,_output
0,Based on the provided context from multiple do...


In [22]:
df

Unnamed: 0,_output
0,Based on the provided context from multiple do...


In [23]:
df = pd.DataFrame(courses_data)

for method in ["quick", "heap", "naive"]:
    sorted_df, stats = df.sem_topk(
        "Which {Course Name} requires the least math?",
        K=2,
        method=method,
        return_stats=True,
    )
    print(sorted_df)
    print(stats)

Quicksort comparisons: 100%|██████████| 5/5 LM calls [00:01<00:00]
Quicksort comparisons: 100%|██████████| 4/4 LM calls [00:00<00:00]


         Course Name
0       Food Science
1  Operating Systems
{'total_tokens': 1186, 'total_llm_calls': 9, 'explanations': {}}


Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.96it/s]
Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.84it/s]
Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.89it/s]
Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.72it/s]
Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.77it/s]
Heap comparisons: 100%|██████████ 1/1 LM calls [00:00<00:00,  2.62it/s]


                     Course Name
0            Riemannian Geometry
1  History of the Atlantic World
{'total_tokens': 806, 'total_llm_calls': 6, 'explanations': {}}


All-pairs comparisons: 100%|██████████| 15/15 LM calls [00:03<00:00]

                 Course Name
0  Intro to computer science
1                  Compilers
{'total_tokens': 1990, 'total_llm_calls': 15, 'explanations': {}}





In [24]:
skill_data = {"Skill": ["Math", "Computer Science"]}

df1 = pd.DataFrame(courses_data)
df2 = pd.DataFrame(skill_data)
join_instruction = "Taking {Course Name:left} will help me learn {Skill:right}"
res = df1.sem_join(df2, join_instruction)
print(res)

Join comparisons: 100%|██████████ 12/12 LM Calls [00:09<00:00,  1.29it/s]

                 Course Name             Skill
1        Riemannian Geometry              Math
2          Operating Systems              Math
2          Operating Systems  Computer Science
4                  Compilers              Math
4                  Compilers  Computer Science
5  Intro to computer science              Math
5  Intro to computer science  Computer Science





In [25]:
df = pd.DataFrame(courses_data)
df = df.sem_index("Course Name", "course_name_index").sem_cluster_by("Course Name", 2)
display(df)

100%|██████████| 1/1 [00:00<00:00, 50.93it/s]


Unnamed: 0,Course Name,cluster_id
0,History of the Atlantic World,0
1,Riemannian Geometry,0
2,Operating Systems,1
3,Food Science,1
4,Compilers,1
5,Intro to computer science,1


In [26]:
df = pd.DataFrame(courses_data)
df = df.sem_index("Course Name", "course_name_index").sem_cluster_by("Course Name", 2)
display(df)

100%|██████████| 1/1 [00:00<00:00, 34.10it/s]


Unnamed: 0,Course Name,cluster_id
0,History of the Atlantic World,0
1,Riemannian Geometry,0
2,Operating Systems,1
3,Food Science,1
4,Compilers,1
5,Intro to computer science,1
