# LOTUS demo

## APIs

In [2]:
import bigframes.pandas as bpd
from bigframes.ml.llm import GeminiTextGenerator, _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT

# Set the bigframes display option `progress_bar` to None.
# NOTE(review): presumably this turns off job progress output in the cells below - confirm against the bigframes display options.
bpd.options.display.progress_bar = None

In [3]:
# Demo input: a single "Course Name" column holding the course titles that
# the semantic operators in the following cells are applied to.
data = {"Course Name": [
    "Probability and Random Processes",
    "Optimization Methods in Engineering",
    "Digital Design and Integrated Circuits",
    "Computer Security",
    "Operating Systems and Systems Programming",
    "Compilers",
    "Computer Networks",
    "Deep Learning",
    "Graphics",
    "Databases",
    "Art History",
]}
df = bpd.DataFrame(data)

# Text-generation model handed to the semantic operators below.
model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)

  return func(get_global_session(), *args, **kwargs)


### 1. `sem_filter`

In [3]:
# Filter the courses with a natural-language statement about {Course Name}.
# The result shown below carries a `confidence_scores` column (presumably
# because `logprobs=True` is requested - confirm).
predict_df = df.sem_filter(
    "{Course Name} requires a lot of math",
    model,
    logprobs=True,
)
predict_df



Unnamed: 0,Course Name,confidence_scores
0,Probability and Random Processes,0.95
1,Optimization Methods in Engineering,0.95
2,Digital Design and Integrated Circuits,0.95
5,Compilers,0.8
7,Deep Learning,0.95
8,Graphics,0.8


### 2. `sem_join`

In [4]:
# Right-hand side of the join: a small frame of candidate skills.
skills_df = bpd.DataFrame({"Skill": ["Art", "Cryptography", "Baking"]})

# The instruction references one column from each frame:
# {Course Name} from `df` and {Skill} from `skills_df`.
join_df = df.sem_join(
    skills_df,
    "Taking {Course Name} will make me better at {Skill}",
    model=model,
    logprobs=True,
)
join_df



Unnamed: 0,Course Name,Skill,confidence_scores
10,Computer Security,Cryptography,0.85
19,Computer Networks,Cryptography,0.8
24,Graphics,Art,0.85


### 3. `sem_map`

In [5]:
# Generate one piece of text per row from the {Course Name} column.
map_df = df.sem_map(
    "Generate a short study plan to succeed in {Course Name}",
    model=model,
)
map_df



Unnamed: 0,Course Name,_map
0,Probability and Random Processes,## Study Plan for Probability and Random Proce...
1,Optimization Methods in Engineering,## Study Plan for Optimization Methods in Engi...
2,Digital Design and Integrated Circuits,## Study Plan for Digital Design and Integrate...
3,Computer Security,## Computer Security Study Plan **Goal:** Ac...
4,Operating Systems and Systems Programming,## Study Plan for Operating Systems and Syste...
5,Compilers,## Short Study Plan for Compilers: **1. Funda...
6,Computer Networks,## Short Study Plan for Computer Networks **G...
7,Deep Learning,## Short Study Plan for Deep Learning: **Focu...
8,Graphics,## Graphics Course Study Plan: **Goal:** Achi...
9,Databases,## Short Study Plan for Databases: **1. Acti...


In [6]:
# Row 0, column 1: the untruncated `_map` text for the first course
# (the table preview above cuts it off).
map_df.iloc[0, 1]

"## Study Plan for Probability and Random Processes\n\n**Goal:**  Master the core concepts and build strong problem-solving skills in Probability and Random Processes.\n\n**Strategy:** \n\n1. **Understand the Fundamentals:**\n    * **Week 1-2:** Focus on probability basics: events, axioms, probability distributions, conditional probability, Bayes' Theorem.  \n    * **Week 3-4:** Dive deeper into random variables, expected value, variance, common distributions (Bernoulli, Binomial, Poisson, Normal). \n    * **Week 5-6:**  Explore fundamental concepts of random processes: stochastic processes, Markov Chains, Poisson process. \n\n2. **Practice Regularly:**\n    * **Daily:** Solve at least 5-10 problems from the textbook or previous exams. \n    * **Weekly:**  Review class notes, work on challenging problems, and try to explain concepts to yourself or a study partner.\n\n3. **Seek Help and Resources:**\n    * **Office Hours:**  Utilize your professor's office hours to clarify concepts and 

### 4. `sem_agg`

#### No optimizations

In [1]:
# Aggregate the whole "Course Name" column into a single generated document.
agg_df = df.sem_agg(
    "Generate a study plan for all {Course Name}s",
    model=model,
    num_batch=5,
)
agg_df

  return func(get_global_session(), *args, **kwargs)


Loop 0: aggregate 11 rows




Loop 1: aggregate 3 rows




0    A study plan for all courses should cover: 

*...
Name: _lotus_doc, dtype: string

In [2]:
# First (and, per the preview above, only) element of the aggregated
# `_lotus_doc` series, shown in full instead of truncated.
agg_df.iloc[0]

'A study plan for all courses should cover: \n\n**Core Engineering Concepts:** \n* Probability and Random Processes:  Understand probability, random variables, and distributions. Practice problem-solving.\n* Optimization Methods in Engineering:  Master optimization techniques like linear programming, gradient descent, and genetic algorithms. Apply them to engineering problems.\n* Digital Design and Integrated Circuits:  Gain expertise in digital logic, Boolean algebra, and circuit design. Understand different types of integrated circuits and their applications.\n\n**Computer Science Fundamentals:**\n* Compilers:  Learn how code is translated into machine-readable instructions.\n* Computer Networks:  Study network protocols, architecture, and communication methods.\n* Operating Systems and Systems Programming:  Explore operating system architecture, functionalities, and design principles. Learn about processes, memory management, and file systems.\n\n**Emerging Technologies:**\n* Deep L

#### Optimization

In [3]:
from bigframes.ml.llm import TextEmbeddingGenerator

# Use a dedicated name for the embedding model. Rebinding `model` here would
# replace the GeminiTextGenerator that the later `sem_agg(..., model=model)`
# call relies on when the notebook is run top to bottom.
embedding_model = TextEmbeddingGenerator()

# Embed every course name; the embeddings are clustered in the next cell.
predicted_embeddings = embedding_model.predict(df["Course Name"])
predicted_embeddings



Unnamed: 0,ml_generate_embedding_result,ml_generate_embedding_statistics,ml_generate_embedding_status,content
0,[-0.04662969 0.03035904 -0.02908228 -0.047811...,"{""token_count"":7,""truncated"":false}",,Probability and Random Processes
1,[ 1.23578347e-02 -6.41184822e-02 2.17545331e-...,"{""token_count"":5,""truncated"":false}",,Optimization Methods in Engineering
2,[-3.44864875e-02 -3.02195549e-02 -1.20360842e-...,"{""token_count"":7,""truncated"":false}",,Digital Design and Integrated Circuits
3,[-2.35837866e-02 4.53444012e-02 -2.02544555e-...,"{""token_count"":2,""truncated"":false}",,Computer Security
4,[ 3.36089656e-02 5.33087850e-02 -2.57000513e-...,"{""token_count"":6,""truncated"":false}",,Operating Systems and Systems Programming
5,[-0.01681837 -0.00725462 -0.02383776 0.004380...,"{""token_count"":3,""truncated"":false}",,Compilers
6,[-5.21815242e-03 2.79279947e-02 -4.04327549e-...,"{""token_count"":3,""truncated"":false}",,Computer Networks
7,[-1.14989486e-02 -3.65677141e-02 -5.42889163e-...,"{""token_count"":2,""truncated"":false}",,Deep Learning
8,[-3.00005227e-02 2.32829303e-02 -5.32173887e-...,"{""token_count"":2,""truncated"":false}",,Graphics
9,[-1.83971766e-02 2.91438457e-02 -1.71812531e-...,"{""token_count"":2,""truncated"":false}",,Databases


In [4]:
from bigframes.ml.cluster import KMeans

# Fit k-means on the embedding column, then predict on the same column to
# get a CENTROID_ID for each course (see the output table below).
cluster_model = KMeans(n_clusters=3) # We will divide our courses into 3 groups
cluster_model.fit(predicted_embeddings["ml_generate_embedding_result"])
clustered_result = cluster_model.predict(predicted_embeddings["ml_generate_embedding_result"])
clustered_result

Unnamed: 0,CENTROID_ID,NEAREST_CENTROIDS_DISTANCE,ml_generate_embedding_result
0,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.723336677947...",[-0.04662969 0.03035904 -0.02908228 -0.047811...
1,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.766976711295...",[ 1.23578347e-02 -6.41184822e-02 2.17545331e-...
2,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.707395546757...",[-3.44864875e-02 -3.02195549e-02 -1.20360842e-...
3,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.720433604160...",[-2.35837866e-02 4.53444012e-02 -2.02544555e-...
4,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.647857725730...",[ 3.36089656e-02 5.33087850e-02 -2.57000513e-...
5,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.750436294089...",[-0.01681837 -0.00725462 -0.02383776 0.004380...
6,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.643328944273...",[-5.21815242e-03 2.79279947e-02 -4.04327549e-...
7,3,"[{'CENTROID_ID': 3, 'DISTANCE': 2.388983096625...",[-1.14989486e-02 -3.65677141e-02 -5.42889163e-...
8,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.762420394460...",[-3.00005227e-02 2.32829303e-02 -5.32173887e-...
9,1,"[{'CENTROID_ID': 1, 'DISTANCE': 0.765362162524...",[-1.83971766e-02 2.91438457e-02 -1.71812531e-...


In [12]:
# Work on a copy so the original `df` keeps its single column.
df_copy = df.copy()
# The CENTROID_ID values shown above are 1 and 3; subtracting 1 gives the
# partition ids 0 and 2 seen in the output below.
# NOTE(review): presumably sem_agg reads `_lotus_partition_id` to aggregate
# per partition before combining groups - confirm against its implementation.
df_copy['_lotus_partition_id'] = clustered_result["CENTROID_ID"] - 1
df_copy

Unnamed: 0,Course Name,_lotus_partition_id
0,Probability and Random Processes,0
1,Optimization Methods in Engineering,0
2,Digital Design and Integrated Circuits,0
3,Computer Security,0
4,Operating Systems and Systems Programming,0
5,Compilers,0
6,Computer Networks,0
7,Deep Learning,2
8,Graphics,0
9,Databases,0


In [9]:
# Same aggregation as the unoptimized run, this time on the frame that
# carries the `_lotus_partition_id` column.
agg_df = df_copy.sem_agg(
    "Generate a study plan for all {Course Name}s",
    model=model,
    num_batch=5,
)
agg_df

Loop 0: aggregate 11 rows




Loop 1: aggregate 4 rows




Loop 2: aggregate 3 rows
Starting aggregation cross groups.




0    Answer:

The provided study plan already outli...
Name: _lotus_doc, dtype: string

In [11]:
# Show the first element of the aggregated series in full (the preview above is truncated).
agg_df.iloc[0]

'Answer:\n\nThe provided study plan already outlines a plan for the courses: "Operating Systems and Systems Programming," "Probability and Random Processes," "Computer Networks," "Optimization Methods in Engineering," "Digital Design and Integrated Circuits," "Databases," "Computer Security," "Compilers," and "Graphics." The plan is organized by week and prioritizes foundational courses before diving into more advanced topics. You can adjust the plan based on your individual needs and course requirements.  \n'

## Optimizations

### Cascade Models

In [7]:
# Cascade models: the smaller model runs first to save cost.
# `large_model` uses the Gemini 1.5 Pro endpoint, `small_model` the Flash endpoint.
large_model = GeminiTextGenerator(model_name=_GEMINI_1P5_PRO_001_ENDPOINT)
small_model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)


In [8]:
# Cascaded filter using both models; the Debug output below reports how many
# rows each model resolved.
# NOTE(review): presumably rows whose small-model confidence falls below
# `confidence_threshold` are re-evaluated by the large model - confirm.
predict_df = df.sem_filter(
    "{Course Name} requires a lot of math",
    small_model=small_model,
    model=large_model,
    logprobs=True,
    confidence_threshold=0.9,
)
predict_df



Debug:
5 rows resolved by helper model.
6 rows resolved by large model


Unnamed: 0,Course Name,helper_lm_results,helper_lm_confidence_scores,large_lm_results,large_lm_confidence_scores
0,Probability and Random Processes,True,0.95,,
1,Optimization Methods in Engineering,True,0.99,,
2,Digital Design and Integrated Circuits,True,0.95,,
7,Deep Learning,True,0.95,,


## Apply to the `bigquery-public-data.hacker_news.full` dataset

### 1. Import required packages

In [12]:
import bigframes.pandas as bpd
from bigframes.ml.llm import GeminiTextGenerator, _GEMINI_1P5_FLASH_001_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT

# Models for the Hacker News examples: Gemini 1.5 Pro as the large model and
# Gemini 1.5 Flash as the small one.
large_model = GeminiTextGenerator(model_name=_GEMINI_1P5_PRO_001_ENDPOINT)
small_model = GeminiTextGenerator(model_name=_GEMINI_1P5_FLASH_001_ENDPOINT)
# Run time: 6s

### 2. Read table and select columns

- This dataset contains information on Hacker News stories and comments.
- The dataset includes 41M rows in total

In [13]:
# Read the public Hacker News table, keep the five columns used below, and
# limit the working set to 10,000 rows.
hacker_news = (
    bpd.read_gbq("bigquery-public-data.hacker_news.full")[
        ["title", "text", "by", "score", "time"]
    ]
    .head(10000)
)
hacker_news
# Run 4s

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,title,text,by,score,time
0,,"Well, most people aren&#x27;t alcoholics, so I...",slipframe,,1624675076
1,,"No, you don&#x27;t really <i>need</i> a smartp...",vetinari,,1681919794
2,,It&#x27;s for the late Paul Allen RIP. Should&...,lsr_ssri,,1539652075
3,,Yup they are dangerous. Be careful Donald Trump.,Sven7,,1439222754
4,,"Sure, it&#x27;s totally reasonable. Just point...",nicoburns,,1601896851
...,...,...,...,...,...
9995,LinkedIn rival Viadeo acquires French startup ...,,jmfork,2,1358159590
9996,,"I think it&#x27;s true, as the blog author not...",borepop,,1608045566
9997,Nvidia to Android: We're Just Not That Into You,,kungfudoi,1,1245950208
9998,,How do you propose I&#x27;d be caught? This is...,TangoTrotFox,,1541520331


### 3. Filter titles related to "Art"

In [14]:
# Keep only the rows that have a title. `notna()` states the intent directly
# instead of comparing the `isnull()` mask with `== False`.
hacker_news_w_title = hacker_news[hacker_news["title"].notna()]
hacker_news_w_title
# Run 6s

Unnamed: 0,title,text,by,score,time
6,The Impending NY Tech Apocalypse: Here's What ...,,gaoprea,3,1317163407
8,Eureca beta is live. A place for your business...,,ricardos,1,1350306572
15,Discord vs. IRC Rough Notes,,todsacerdoti,48,1720809592
21,Oh dear: new Yahoo anti-spoofing measures brea...,,joshreads,1,1396963790
22,How Much Warmer Was Your City in 2016?,,smb06,1,1487287594
...,...,...,...,...,...
9977,Ask HN: Sharing a dedicated server,Had an idea that I would like some feedback on...,idiet,2,1401901111
9986,How to Launch on Product Hunt,,debdutmukherjee,3,1554441805
9993,"Show HN: Free, open source JavaScript and mong...",,jdawg77,3,1422419156
9995,LinkedIn rival Viadeo acquires French startup ...,,jmfork,2,1358159590


In [15]:
# Filter the titled rows with the large model only (no cascade), using a
# statement about {title}.
art_hacker_news = hacker_news_w_title.sem_filter("{title} is related to Art", model=large_model)
art_hacker_news
# Run time: TODO (not measured yet)

### 4. Cascade model performance

In [None]:
# Cascaded version of the Art filter. The small model is passed with the
# `small_model` keyword, matching the executed cascade example earlier in
# this notebook; no other call here uses `helper_model`.
# NOTE(review): the earlier cascade call also set `confidence_threshold` and
# `logprobs=True`, and the non-cascade cell above phrases the instruction as
# "{title} is related to Art" - confirm whether this cell should match.
art_hacker_news = hacker_news_w_title.sem_filter(
    "Is {title} related to Art",
    model=large_model,
    small_model=small_model,
)
art_hacker_news

### 5. Automatically fill in missing titles

In [None]:
# Take up to 100 rows whose title is null.
hacker_news_wo_title = hacker_news[hacker_news["title"].isnull()].head(100)
# Ask the large model for a short title based on each row's `text`.
# NOTE(review): presumably the generated text lands in a `_map` column, as in
# the sem_map example earlier in this notebook - confirm.
hacker_news_gai_title = hacker_news_wo_title.sem_map("Generate a short title for the given context: {text}", model=large_model)
# NOTE(review): every row selected above already has a null title, so this
# filter looks like a no-op; confirm whether the intent was to write the
# generated text back into `title`.
hacker_news_gai_title = hacker_news_gai_title[hacker_news_gai_title["title"].isnull()]
hacker_news_gai_title