# LOTUS demo

## APIs

In [1]:
import bigframes.pandas as bpd
from bigframes.ml.llm import GeminiTextGenerator, _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT

# Set the bigframes progress-bar display option to None for the whole session
# (presumably this silences per-query progress output in the cells below -- confirm).
bpd.options.display.progress_bar = None

In [2]:
# Build the small, single-column DataFrame of course names that the semantic
# operators in the following cells run against.
course_names = [
    "Probability and Random Processes",
    "Optimization Methods in Engineering",
    "Digital Design and Integrated Circuits",
    "Computer Security",
    "Operating Systems and Systems Programming",
    "Compilers",
    "Computer Networks",
    "Deep Learning",
    "Graphics",
    "Databases",
    "Art History",
]
data = {"Course Name": course_names}
df = bpd.DataFrame(data)

# Gemini text model passed to the semantic operators in the API demos below.
model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)

  return func(get_global_session(), *args, **kwargs)


### 1. `sem_filter`

In [3]:
# Filter the courses with a natural-language predicate. The {Course Name}
# placeholder names the DataFrame column the instruction refers to.
# With logprobs=True the displayed result carries a `confidence_scores` column
# (see the output of this cell).
math_instruction = "{Course Name} requires a lot of math"
predict_df = df.sem_filter(math_instruction, model, logprobs=True)
predict_df



Unnamed: 0,Course Name,confidence_scores
0,Probability and Random Processes,0.95
1,Optimization Methods in Engineering,0.95
2,Digital Design and Integrated Circuits,0.95
5,Compilers,0.8
7,Deep Learning,0.95
8,Graphics,0.8


### 2. `sem_join`

In [4]:
# Right-hand side of the join: a one-column frame of candidate skills.
skills_df = bpd.DataFrame({"Skill": ["Art", "Cryptography", "Baking"]})

# The instruction references one column from each frame ({Course Name} from
# `df`, {Skill} from `skills_df`); only the pairs the model accepts are kept.
join_instruction = "Taking {Course Name} will make me better at {Skill}"
join_df = df.sem_join(skills_df, join_instruction, model=model, logprobs=True)
join_df



Unnamed: 0,Course Name,Skill,confidence_scores
10,Computer Security,Cryptography,0.85
19,Computer Networks,Cryptography,0.8
24,Graphics,Art,0.85


### 3. `sem_map`

In [5]:
# Generate free-form text for every row. In the output of this cell the
# generated text appears in a new `_map` column next to `Course Name`.
study_plan_instruction = "Generate a short study plan to succeed in {Course Name}"
map_df = df.sem_map(study_plan_instruction, model=model)
map_df



Unnamed: 0,Course Name,_map
0,Probability and Random Processes,## Study Plan for Probability and Random Proce...
1,Optimization Methods in Engineering,## Study Plan for Optimization Methods in Engi...
2,Digital Design and Integrated Circuits,## Study Plan for Digital Design and Integrate...
3,Computer Security,## Computer Security Study Plan **Goal:** Ac...
4,Operating Systems and Systems Programming,## Study Plan for Operating Systems and Syste...
5,Compilers,## Short Study Plan for Compilers: **1. Funda...
6,Computer Networks,## Short Study Plan for Computer Networks **G...
7,Deep Learning,## Short Study Plan for Deep Learning: **Focu...
8,Graphics,## Graphics Course Study Plan: **Goal:** Achi...
9,Databases,## Short Study Plan for Databases: **1. Acti...


In [6]:
# Show the untruncated value at row position 0, column position 1 of `map_df`
# (the table above abbreviates long strings with "...").
map_df.iloc[0, 1]

"## Study Plan for Probability and Random Processes\n\n**Goal:**  Master the core concepts and build strong problem-solving skills in Probability and Random Processes.\n\n**Strategy:** \n\n1. **Understand the Fundamentals:**\n    * **Week 1-2:** Focus on probability basics: events, axioms, probability distributions, conditional probability, Bayes' Theorem.  \n    * **Week 3-4:** Dive deeper into random variables, expected value, variance, common distributions (Bernoulli, Binomial, Poisson, Normal). \n    * **Week 5-6:**  Explore fundamental concepts of random processes: stochastic processes, Markov Chains, Poisson process. \n\n2. **Practice Regularly:**\n    * **Daily:** Solve at least 5-10 problems from the textbook or previous exams. \n    * **Weekly:**  Review class notes, work on challenging problems, and try to explain concepts to yourself or a study partner.\n\n3. **Seek Help and Resources:**\n    * **Office Hours:**  Utilize your professor's office hours to clarify concepts and 

## Optimizations

### Cascade Models

In [7]:
# Cascade models: the smaller model runs first to save cost, and the larger
# model is kept for the rows the smaller one does not resolve (see the "Debug"
# output of the next cell for the split).
large_model = GeminiTextGenerator(model_name=_GEMINI_1P5_PRO_001_ENDPOINT)
small_model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)


In [8]:
# Same math filter as in the `sem_filter` demo, but run as a two-model cascade.
# Presumably rows whose small-model confidence falls below
# `confidence_threshold` are re-evaluated by the large model -- confirm against
# the sem_filter implementation.
# NOTE(review): this call passes the cheap model as `small_model=`, while the
# Hacker News cascade cell later in this notebook passes it as `helper_model=`
# to the same method. Confirm which keyword the API accepts and align both.
math_instruction = "{Course Name} requires a lot of math"
predict_df = df.sem_filter(
    math_instruction,
    model=large_model,
    small_model=small_model,
    confidence_threshold=0.9,
    logprobs=True,
)
predict_df



Debug:
5 rows resolved by helper model.
6 rows resolved by large model


Unnamed: 0,Course Name,helper_lm_results,helper_lm_confidence_scores,large_lm_results,large_lm_confidence_scores
0,Probability and Random Processes,True,0.95,,
1,Optimization Methods in Engineering,True,0.99,,
2,Digital Design and Integrated Circuits,True,0.95,,
7,Deep Learning,True,0.95,,


## Apply to the `bigquery-public-data.hacker_news.full` dataset

### 1. Import required packages

In [1]:
import bigframes.pandas as bpd
from bigframes.ml.llm import GeminiTextGenerator, _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT

# Models for the Hacker News walkthrough: the Pro endpoint is the large model,
# the Flash endpoint is the cheaper one used as the helper in the cascade cell.
# The execution count restarts at In [1] here, so this section was run in its
# own kernel session and has to construct the models itself.
large_model = GeminiTextGenerator(model_name=_GEMINI_1P5_PRO_001_ENDPOINT)
small_model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)
# Observed run time: ~6s

  return global_session.get_global_session()


### 2. Read table and select columns

- This dataset contains information on Hacker News stories and comments.
- The dataset contains about 41M rows in total.

In [4]:
# Read the public Hacker News table, then narrow it to the five columns used
# below and cap it at 5000 rows so the LLM calls stay affordable.
selected_columns = ["title", "text", "by", "score", "time"]
hacker_news_full = bpd.read_gbq("bigquery-public-data.hacker_news.full")
hacker_news = hacker_news_full[selected_columns].head(5000)
hacker_news
# Run 4s

Unnamed: 0,title,text,by,score,time
0,,"Well, most people aren&#x27;t alcoholics, so I...",slipframe,,1624675076
1,,"No, you don&#x27;t really <i>need</i> a smartp...",vetinari,,1681919794
2,,It&#x27;s for the late Paul Allen RIP. Should&...,lsr_ssri,,1539652075
3,,Yup they are dangerous. Be careful Donald Trump.,Sven7,,1439222754
4,,"Sure, it&#x27;s totally reasonable. Just point...",nicoburns,,1601896851
...,...,...,...,...,...
4995,,The report this article is based on says about...,jowea,,1695300352
4996,,This is what I do and it works great.,hacknat,,1396460951
4997,,Not every law is a restriction of freedom. My ...,throwaway248329,,1638452070
4998,,Give em an excuse to really switch to git and ...,zobzu,,1396451042


### 3. Filter titles related to "Art"

In [5]:
# Keep only the rows whose `title` is not null; the semantic filter below is
# applied to titles, so rows without one are dropped first.
# `notnull()` replaces the original `isnull() == False` comparison and yields
# the same boolean mask.
hacker_news_w_title = hacker_news[hacker_news["title"].notnull()]
hacker_news_w_title
# Run 6s

Unnamed: 0,title,text,by,score,time
6,The Impending NY Tech Apocalypse: Here's What ...,,gaoprea,3,1317163407
8,Eureca beta is live. A place for your business...,,ricardos,1,1350306572
15,Discord vs. IRC Rough Notes,,todsacerdoti,48,1720809592
21,Oh dear: new Yahoo anti-spoofing measures brea...,,joshreads,1,1396963790
22,How Much Warmer Was Your City in 2016?,,smb06,1,1487287594
...,...,...,...,...,...
4946,Using Social Media To Cover For Lack Of Origin...,,taytus,2,1353533202
4953,Design from Memory to UI,,wizardofmysore,1,1610036225
4958,How to keep your employees from joining the ‘G...,,CrankyBear,1,1631035622
4969,Marketplace for themes built with the highest ...,Looking for themes made with frameworks like T...,themebulk,1,1372027020


In [11]:
%time
art_hacker_news = hacker_news_w_title.sem_filter("{title} is related to Art", model=large_model)
#Run time: 55m 30.2s

CPU times: user 10 μs, sys: 0 ns, total: 10 μs
Wall time: 20.3 μs




In [12]:
# Display the rows kept by the large-model-only filter from the previous cell.
art_hacker_news
# Run time 7.1s

Unnamed: 0,title,text,by,score,time
144,Colorful JSON Visualization Representation,,hussein8844,1,1566767108
155,Making Blogger Blogs Prettier: Google Launches...,,andrimtd,1,1305447025
407,"GCH Guitar Academy, free online guitar lessons...",,evo_9,1,1647355546
655,The Beauty of Code,,cobralibre,63,1429980871
657,Top Free Programming Fonts,,Deprecated,2,1406562457
...,...,...,...,...,...
4391,Illustrating Your Life in Graphs and Charts,,rayboyd,1,1303385137
4410,Corey Haines and Dan North debate over Twitter...,,evolve2k,4,1297215698
4839,Matthias Buchinger,,gmargari,9,1684684807
4862,The Decline of Madness,,chippy,1,1659691153


### 4. Cascade model performance

In [13]:
%time 
art_hacker_news = hacker_news_w_title.sem_filter("Is {title} related to Art", model=large_model, helper_model=small_model)
#Run time 20m 10.2s

CPU times: user 6 μs, sys: 3 μs, total: 9 μs
Wall time: 16.2 μs






Debug:
553 rows resolved by helper model.
36 rows resolved by large model


In [14]:
%time
art_hacker_news
#Run 6.4s

CPU times: user 5 μs, sys: 3 μs, total: 8 μs
Wall time: 16.2 μs


Unnamed: 0,title,text,by,score,time
144,Colorful JSON Visualization Representation,,hussein8844,1,1566767108
155,Making Blogger Blogs Prettier: Google Launches...,,andrimtd,1,1305447025
235,Be Good (Not Less Bad),,yarapavan,1,1318436118
407,"GCH Guitar Academy, free online guitar lessons...",,evo_9,1,1647355546
655,The Beauty of Code,,cobralibre,63,1429980871
...,...,...,...,...,...
4410,Corey Haines and Dan North debate over Twitter...,,evolve2k,4,1297215698
4425,RESTArt: A Python library with good intentions...,,russellluo,1,1437320070
4839,Matthias Buchinger,,gmargari,9,1684684807
4862,The Decline of Madness,,chippy,1,1659691153


### 5. Automatically fill in the missing titles

In [None]:
# Take up to 100 rows that have no title and ask the large model to write one
# from each row's `text`.
hacker_news_wo_title = hacker_news[hacker_news["title"].isnull()].head(100)
generated_titles = hacker_news_wo_title.sem_map(
    "Generate a short title for the given context: {text}", model=large_model
)
# sem_map appears to write its text to a `_map` column and leave the existing
# columns alone (see the `sem_map` output earlier in this notebook), so the
# generated text is copied into `title` to actually fill the gaps.
# The original cell instead re-filtered on `title` being null; every input row
# already had a null title, so that filter kept all rows and filled nothing.
# NOTE(review): confirm the `_map` output column name against the sem_map API.
hacker_news_gai_title = generated_titles.assign(title=generated_titles["_map"])
hacker_news_gai_title