# Michelin Restaurant Dataset: Encoding Text Data

In [0]:
import pandas as pd
import os
from pyspark.sql.functions import *
from transformers import AutoTokenizer, OpenAIGPTTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Catalog, Schema Set up

In [0]:
# Resolve the target catalog and schema from the environment.
catalog_ = os.getenv('CATALOG_NAME')
schema_ = os.getenv('SCHEMA_NAME')

# os.getenv returns None for unset variables, which would otherwise surface as
# an opaque TypeError in the string concatenation below. Fail early instead,
# naming the variable(s) that need to be set.
_missing_env = [name for name, value in (('CATALOG_NAME', catalog_), ('SCHEMA_NAME', schema_)) if not value]
if _missing_env:
  raise EnvironmentError("Missing required environment variable(s): " + ", ".join(_missing_env))

# Make the catalog/schema the session defaults so later cells can use bare table names.
spark.sql("USE CATALOG "+catalog_)
spark.sql("USE SCHEMA "+schema_)

### Read Silver Data

In [0]:
# Load every column of the silver table and preview it in the notebook.
silver_query = "SELECT * FROM silver_data"
silver_df = spark.sql(silver_query)
display(silver_df)

## Restaurant Descriptions

##### Tokenization
Create a Spark UDF that splits each description into text chunks whose size is measured in tokens, then explode each chunk into its own row.

In [0]:
## Base function for splitting text into chunks whose length is measured in tokens
## (original note: splitting not needed here -- presumably most descriptions fit in one chunk; TODO confirm)
max_chunk_size = 300
chunk_overlap = 50

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=max_chunk_size, chunk_overlap=chunk_overlap)

# Splitters keyed by chunk size, so a non-default max_chunk_size is honoured
# without rebuilding a splitter on every call.
_splitters_by_chunk_size = {max_chunk_size: text_splitter}

def tokz_(text, min_chunk_size = 1, max_chunk_size=max_chunk_size):
  """Split ``text`` into chunks and drop the ones that are too short.

  Args:
    text: The string to split. ``None`` and the empty string yield ``[]``.
    min_chunk_size: A chunk is kept only if it encodes to strictly more than
      this many tokens.
    max_chunk_size: Chunk size, in tokens, passed to the splitter. The default
      reuses the module-level ``text_splitter``; any other value uses a
      splitter built for that size with the same ``chunk_overlap``.

  Returns:
    A list of text chunks (strings, not token ids).
  """
  if not text:
    return []
  splitter = _splitters_by_chunk_size.get(max_chunk_size)
  if splitter is None:
    # Previously this argument was accepted but ignored; build (and cache) a
    # splitter that actually uses the requested size.
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=max_chunk_size, chunk_overlap=chunk_overlap)
    _splitters_by_chunk_size[max_chunk_size] = splitter
  chunks = splitter.split_text(text)
  return [c for c in chunks if len(tokenizer.encode(c)) > min_chunk_size]

In [0]:
## Pandas UDF wrapper around tokz_
@pandas_udf("array<string>")
def save_tokens(descr: pd.Series) -> pd.Series:
  """Apply ``tokz_`` to every description in the batch.

  Each element of the returned Series is the list of chunks produced for the
  description at the same position.
  """
  chunk_lists = descr.apply(tokz_)
  return chunk_lists

In [0]:
%sql
-- Recreate the table that receives the description chunks: drop any previous
-- version first, then create it empty.
DROP TABLE IF EXISTS rest_descr_tokenized;
CREATE TABLE IF NOT EXISTS rest_descr_tokenized (
  -- Identity column; a value is generated when the writer does not supply one.
  Id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  -- Carried over from silver_data by the generation cell.
  Res_ID STRING,
  -- One chunk of the description per row (the generation cell explodes the chunk list).
  Descr_Tokenized STRING
);

In [0]:
## Generating Tokens for all descriptions
# Keep only rows that have a description and emit one row per chunk,
# leaving just Res_ID and the chunk text.
descr_chunks_df = (
    spark.table("silver_data")
    .where('Description is not null')
    .select('Res_ID', explode(save_tokens('Description')).alias('Descr_Tokenized'))
)

# Replace the table contents with the freshly generated chunks.
descr_chunks_df.write.mode('overwrite').saveAsTable('rest_descr_tokenized')

display(spark.table('rest_descr_tokenized'))


##### Embedding
Starting from the description chunks produced above, embed each chunk with the BAAI/bge-m3 (BGE) model.

The encode method is explained [here](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html).

In [0]:
### Create a base function to embed sentences
from sentence_transformers import SentenceTransformer

# Instantiate the BAAI/bge-m3 sentence-transformer once; embed() reuses it.
model_name = "BAAI/bge-m3"
model = SentenceTransformer(model_name)

# Thin wrapper around the model's encode method
def embed(text_):
  """Return the embedding that ``model.encode`` produces for ``text_``."""
  embedding = model.encode(text_)
  return embedding

# Output example: embed one sentence and show the vector and its shape.
# The helper defined above is `embed`; the previous call used the undefined
# name `Embed_`, which raised a NameError.
emb = embed("This is an example")
print(emb)
print(emb.shape)

In [0]:
### Wrap the function into a Spark UDF
@pandas_udf("array<float>")
def save_embeddings(text_: pd.Series) -> pd.Series:
  """Embed every string in the batch and return one vector per input row.

  The whole batch is passed to ``embed`` in a single call so the model can
  batch the work internally, instead of being invoked once per row.

  Args:
    text_: Batch of strings to embed.

  Returns:
    A Series aligned with ``text_`` whose elements are the embedding vectors.
  """
  if text_.empty:
    # Nothing to encode; avoid calling the model with an empty list.
    return pd.Series([], dtype="object")
  # Encoding a list yields one row per input, in input order.
  vectors = embed(text_.tolist())
  return pd.Series(list(vectors), index=text_.index)

In [0]:
%sql
-- Recreate the table that receives the embeddings: drop any previous version
-- first, then create it empty.
DROP TABLE IF EXISTS rest_descr_embedded;
CREATE TABLE IF NOT EXISTS rest_descr_embedded (
  -- Identity column; a value is generated when the writer does not supply one.
  Id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  -- Carried over from rest_descr_tokenized by the generation cell.
  Res_ID STRING,
  -- The text chunk that was embedded.
  Descr_Tokenized STRING,
  -- Output of the save_embeddings UDF for that chunk.
  Descr_Embedded ARRAY<FLOAT>
);

In [0]:
## Generating Embeddings
# Read the chunks back and attach an embedding to each one.
descr_embedded_df = (
    spark.table("rest_descr_tokenized")
    .select('Res_ID', 'Descr_Tokenized', save_embeddings('Descr_Tokenized').alias('Descr_Embedded'))
)

# Replace the table contents with the freshly computed embeddings.
descr_embedded_df.write.mode('overwrite').saveAsTable('rest_descr_embedded')

display(spark.table('rest_descr_embedded'))