## Preparation

In [1]:
import bigframes
import bigframes.pandas as bpd

Enable the semantic operator experiment

In [2]:
bigframes.options.experiments.semantic_operators = True



Prepare the LLM model. Here we are going to use Gemini 1.5 Flash.

In [3]:
import bigframes.ml.llm as llm
gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)
text_embedding_model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")

  return global_session.get_global_session()


## Semantic Filtering

In [4]:
df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})
df.semantics.filter("{city} is the capital of {country}", gemini_model)



Unnamed: 0,country,city
1,Germany,Berlin


## Semantic Mapping

In [5]:
df = bpd.DataFrame(
        data={"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]}
    )

In [6]:
df.semantics.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=gemini_model)



Unnamed: 0,ingredient_1,ingredient_2,food
0,Burger Bun,Beef Patty,Burger
1,Soy Bean,Bittern,Tofu


## Semantic Joining

In [7]:
cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})
continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})

In [8]:
cities.semantics.join(continents, "{city} is in {continent}", gemini_model)



Unnamed: 0,city,continent
0,Seattle,North America
1,Ottawa,North America
2,Shanghai,Asia
3,New Delhi,Asia


### Self Joins

In [9]:
animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})

In [10]:
animals.semantics.join(animals, "{animal_left} generally weighs heavier than {animal_right}", gemini_model)



Unnamed: 0,animal_left,animal_right
0,cow,cat
1,cow,spider
2,cat,spider
3,elephant,cow
4,elephant,cat
5,elephant,spider


## Semantic Search

In [11]:
df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]})
df

Unnamed: 0,creatures
0,salmon
1,sea urchin
2,baboons
3,frog
4,chimpanzee


In [12]:
df.semantics.search("creatures", "monkey", top_k = 2, model = text_embedding_model, score_column='similarity score')





Unnamed: 0,creatures,similarity score
2,baboons,0.773411
4,chimpanzee,0.781101


## Semantic Similarity Join

In [13]:
df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})
df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})

In [14]:
df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model, score_column='distance')





Unnamed: 0,animal,animal_1,distance
0,monkey,baboon,0.747665
1,spider,scorpion,0.890909
2,salmon,tuna,0.925461
3,giraffe,elephant,0.887858
4,sparrow,owl,0.932959


In [15]:
df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})
df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})

df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model)





Unnamed: 0,animal,animal_1
0,monkey,baboon
1,spider,scorpion


## Semantic Aggregation

In [16]:
df = bpd.DataFrame({
    "Movies": [
        "Titanic",
        "The Wolf of Wall Street",
        "Killers of the Flower Moon",
        "The Revenant",
        "Inception",
        "Shuttle Island",
        "The Great Gatsby",
    ],
    "Year": [1997, 2013, 2023, 2015, 2010, 2010, 2013],
})
df

Unnamed: 0,Movies,Year
0,Titanic,1997
1,The Wolf of Wall Street,2013
2,Killers of the Flower Moon,2023
3,The Revenant,2015
4,Inception,2010
5,Shuttle Island,2010
6,The Great Gatsby,2013


In [17]:
agg_df = df.semantics.agg("Find the shared first name of actors in {Movies}. One word answer.", model=gemini_model)
agg_df



0    Leonardo 

Name: Movies, dtype: string

## Semantic Cluster

In [19]:
df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})

df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)



Unnamed: 0,Product,Cluster ID
0,Smartphone,3
1,Laptop,3
2,Coffee Maker,1
3,T-shirt,2
4,Jeans,2


## Semantic TopK

In [4]:
df = bpd.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]})
df.semantics.top_k("{Animals} are more popular as pets", model=gemini_model, k=2)





Unnamed: 0,Animals
0,Dog
1,Cat
