In [0]:
%pip install sentence_transformers

In [0]:
# TODO: resolve the most recent pinto export and PIM cycle instead of pinning
# the dates inside the paths below.

# Storage locations of the three parquet inputs.
pinto_path = 'abfss://data@sa8451entlakegrnprd.dfs.core.windows.net/source/third_party/prd/pinto/pinto_effo_kroger_export_20230413'
pim_path = "abfss://pim@sa8451posprd.dfs.core.windows.net/pim_core/by_cycle/cycle_date=20230408"
search_path = "abfss://personloyalty@sa8451dbxadhocprd.dfs.core.windows.net/relevancy/e451_query2concept2vec/20221007/query2concept2vec.parquet"

# Read each input into its own Spark DataFrame.
pinto_prods = spark.read.parquet(pinto_path)
pim_core = spark.read.parquet(pim_path)
search_emb = spark.read.parquet(search_path)

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.functions import collect_set, substring_index, concat_ws, concat, split, regexp_replace, size, expr
# Fields under `diets` whose `name` value is folded into the PIM diet sentence.
# Spellings are reproduced exactly as the columns are referenced by this notebook.
pim_diet_name_fields = [
    'AYERVEDIC.name', 'LOW_BACTERIA.name', 'COELIAC.name', 'DIABETIC.name',
    'FREE_FROM_GLUTEN.name', 'GLYCEMIC.name', 'GRAIN_FREE.name', 'HALAL.name',
    'HGC.name', 'HIGH_PROTEIN.name', 'KEHILLA.name', 'KETOGENIC.name',
    'KOSHER.name', 'LACTOSE_FREE.name', 'LOW_CALORIE.name', 'LOW_PROTEIN.name',
    'LOW_SALT.name', 'MACROBIOTIC.name', 'METABOLIC.name', 'NON_VEG.name',
    'PALEO.name', 'PECETARIAN.name', 'PLANT_BASED.name', 'RAW_FOOD.name',
    'VEGAN.name', 'VEGETARIAN.name', 'VEG_OVO.name', 'WITHOUT_BEEF.name',
    'WITHOUT_PORK.name',
]

# Keep the product identifiers/descriptions and flatten the `diets` struct
# so each diet becomes its own top-level column.
pim_diet = pim_core.select(
    f.col("upc_key"),
    f.col("krogerOwnedEcommerceDescription"),
    f.col("gtinName"),
    f.col("diets.*"),
)

# Build `pim_sentence`: the diet names joined with ', ', with ' ,' collapsed
# to ',', lower-cased, and trailing whitespace removed.
pim_diets = (
    pim_diet
    .withColumn('diet', concat_ws(', ', *[f.col(field) for field in pim_diet_name_fields]))
    .withColumn('clean_commas', f.regexp_replace(f.col('diet'), ' ,', ','))
    .withColumn('lowercase', f.lower(concat_ws(',', 'clean_commas')))
    .withColumn('no_trailing_space', regexp_replace(f.col('lowercase'), r'\s+$', ''))
    .withColumnRenamed('no_trailing_space', 'pim_sentence')
    .select('upc_key', 'krogerOwnedEcommerceDescription', 'gtinName', 'pim_sentence')
)

In [0]:
# Column expressions shared by both pinto projections below:
#   * one output row per entry of `upcs.standard`, named `gtin_no`
#   * the same value with its final character removed
exploded_gtin = f.explode(f.col('upcs.standard')).alias('gtin_no')
gtin_minus_last_char = expr("substring(gtin_no, 1, length(gtin_no)-1)")

# gtin_no -> diet slugs taken from `dietList.slug`.
# (A filter keeping only rows with at least one slug was left disabled here:
#  .filter(size("diet_slug") >= 1))
pinto_diets = (
    pinto_prods
    .select(exploded_gtin, f.col('dietList'))
    .withColumn("gtin_no", gtin_minus_last_char)
    .select(f.col('gtin_no'), f.col('dietList.slug').alias('diet_slug'))
)

# gtin_no -> product name, with exact duplicate rows removed.
pinto_names = (
    pinto_prods
    .select(exploded_gtin, f.col('name'))
    .withColumn("gtin_no", gtin_minus_last_char)
    .select('gtin_no', 'name')
    .distinct()
)

# Bring names and diet slugs together on the trimmed identifier.
pinto_data = pinto_names.join(pinto_diets, 'gtin_no', 'inner')

In [0]:
# Outer join: rows that exist in only one of the two sources are retained.
pinto_pim = pinto_data.join(pim_diets, pim_diets.upc_key == pinto_data.gtin_no, 'outer')

from pyspark.sql.functions import when

# Back-fill each text column from the other when it is null:
# first `name` from the PIM description, then the description from the
# (already back-filled) `name`.
pinto_pim = (
    pinto_pim
    .withColumn("name", f.coalesce(f.col("name"), f.col("krogerOwnedEcommerceDescription")))
    .withColumn(
        "krogerOwnedEcommerceDescription",
        f.coalesce(f.col("krogerOwnedEcommerceDescription"), f.col("name")),
    )
)

In [0]:
# Build one de-duplicated diet sentence per product identifier.
#
# Output columns:
#   gtin_no       - `upc_key` when present, otherwise the pinto `gtin_no`
#   name          - product name as carried on `pinto_pim`
#   diet_sentence - distinct, lower-cased diet terms joined with ', '
#
# Why this differs from a plain concat/split:
#   * pinto slugs and PIM names arrive with different separators and stray
#     spaces, so every token is whitespace-trimmed before de-duplication;
#     otherwise "vegan" and " vegan" would be treated as different terms.
#   * either source can be empty (null slugs, or a PIM row with no diets),
#     which would otherwise leave empty tokens and dangling commas; empty
#     tokens are removed before the sentence is re-joined.
from pyspark.sql.functions import array_distinct

sentences = (
    pinto_pim
    # concat_ws skips nulls, so a missing slug array simply yields ''.
    .withColumn('pinto_sentence', f.lower(concat_ws(',', f.col('diet_slug'))))
    .withColumn('pimto_sentence', concat_ws(',', f.col('pinto_sentence'), f.col('pim_sentence')))
    # Split on commas and swallow any whitespace around them in one pass.
    .withColumn('diet_tokens', split(f.trim(f.col('pimto_sentence')), r'\s*,\s*'))
    # Drop empty tokens, then remove duplicates (first occurrence is kept).
    .withColumn('diet_tokens', array_distinct(f.array_remove(f.col('diet_tokens'), '')))
    .select(
        f.coalesce(f.col('upc_key'), f.col('gtin_no')).alias('gtin_no'),
        f.col('name'),
        concat_ws(', ', f.col('diet_tokens')).alias('diet_sentence'),
    )
)

In [0]:
# Assemble the text that will be encoded, one row per gtin_no:
#   "<name>."                       when there is no diet sentence
#   "<name> is <diet sentence>."    otherwise
# Doubled commas in the assembled text are collapsed to a single comma.
product_name = f.col('name')
diet_terms = f.col('sentence_string')
text_without_diets = concat(product_name, f.lit('.'))
text_with_diets = concat(product_name, f.lit(' is '), diet_terms, f.lit('.'))

vector = (
    sentences
    .select('diet_sentence', 'gtin_no', 'name')
    .withColumn('sentence_string', concat_ws(',', f.col('diet_sentence')))
    .withColumn('full_string', when(diet_terms == '', text_without_diets).otherwise(text_with_diets))
    .withColumn('diet_string', f.regexp_replace(f.col('full_string'), ',,', ','))
    .select('gtin_no', 'diet_string')
)

# Collect the result to the driver as a pandas DataFrame.
pandas_df = vector.toPandas()

In [0]:
# Package that provides the encoder used below.
from sentence_transformers import SentenceTransformer, util

# Directory on DBFS that holds the pretrained transformer model.
model_dir = "/dbfs/dbfs/FileStore/users/s354840/pretrained_transformer_model"

# Load the model from that directory.
model = SentenceTransformer(model_dir)

In [0]:
from datetime import date

# Pull the text to encode out of the pandas frame as a plain Python list.
sentence = pandas_df['diet_string'].tolist()

# Current local date, captured once for this run.
today = date.today()

In [0]:
# Encode vectors from sentences
# You may need a GPU cluster to do this efficiently
# `sentence` is the list of diet strings extracted from pandas_df; the model is
# asked to return normalized embeddings (normalize_embeddings=True).
# NOTE(review): presumably this means unit-length vectors, so a dot product
# equals cosine similarity - confirm against the sentence-transformers docs.
vectors = model.encode(sentence, normalize_embeddings=True) 

In [0]:
# Store each embedding next to its row as a plain Python list.
# NOTE(review): assumes `vectors` has one entry per row of pandas_df, in the
# same order as pandas_df.diet_string from which it was encoded - confirm.
pandas_df['vector'] = vectors.tolist()

In [0]:
# Join together into output dataframe: one row per gtin_no with its embedding.
import pandas as pd
output_df = spark.createDataFrame(pd.DataFrame({"gtin_no": pandas_df["gtin_no"].to_list(), "vector": pandas_df["vector"].to_list()}))

# Write to a folder named after the run date. The path must be an f-string:
# without the `f` prefix the folder would literally be called "{today}" and
# every run would overwrite the same location. `today` is the datetime.date
# captured earlier in the notebook and renders as YYYY-MM-DD.
output_path = f'abfss://media@sa8451dbxadhocprd.dfs.core.windows.net/Users/s354840/embedded_dimensions/pimto_product_vectors_diet_description/{today}'
output_df.write.mode("overwrite").parquet(output_path)