In [65]:
# imports
import pandas as pd
import os
import openai
import json
from pymongo import MongoClient

from dotenv import load_dotenv
from openai.embeddings_utils import (
get_embedding,
distances_from_embeddings,
indices_of_nearest_neighbors_from_distances
)

In [66]:
# Authenticate with OpenAI, which is used below to compute the offer embeddings.
# Variables from a local .env file are loaded first, then the key is read from the environment.

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [67]:
# Load the Triple offers dataset.
# The CSV location can be overridden with the TRIPLE_OFFERS_CSV environment variable so the
# notebook is not tied to one machine's directory layout; the original local path stays the default.
# We need to split the dataset in smaller chunks for performance reasons
# (it is slow to write a full cache file with 12000 offers + embeddings).

dataset_path = os.environ.get(
    "TRIPLE_OFFERS_CSV",
    "/Users/gauthierrobe/Documents/DataSpell/triple_offers_sandbox_12022023_withID_12571.csv",
)
df = pd.read_csv(dataset_path)

# Preview the first rows of the dataframe
n_examples = 10
df.head(n_examples)

Unnamed: 0,id,external_id,title,merchant,description,category
0,10732,159067,5% back at Frank Anthony's Italian Restau,Frank Anthony's Italian Restaurant,"A welcoming family restaurant, Frank Anthony's...",FOOD
1,10733,158962,5% back at Fresh & Meaty Burgers Carson,Fresh & Meaty Burgers Carson,"Known for amazingly good burgers, Fresh & Meat...",FOOD
2,10734,158697,5% back at Ikko II Japanese Steak House,Ikko II Japanese Steak House,Check out the fabulous selection of authentic ...,FOOD
3,10735,158581,5% back at Marco's Pizza,Marco's Pizza,When you're craving amazing pizza made from hi...,FOOD
4,10736,159084,5% back at Monterey Halal Market,Monterey Halal Market,Monterey Halal Market is a convenient and quic...,FOOD
5,10737,158960,5% back at Soulshine Pizza Factory Franklin,Soulshine Pizza Factory Franklin,Soulshine Pizza Factory offers a terrific vari...,FOOD
6,10738,158582,5% back at Ploy Thai Cuisine,Ploy Thai Cuisine,Ploy Thai Cuisine is an award-winning family f...,FOOD
7,10739,159026,5% back at Smokey D'Z BBQ & Catering,Smokey D'Z BBQ & Catering,"Home of the backyard BBQ, Smokey D'Z BBQ & Cat...",FOOD
8,10740,158650,5% back at Al Pastor,Al Pastor,Al Pastor is a family-friendly Mexican eatery ...,FOOD
9,10741,157980,5% back at Tailgaters Sports Bar & Grill,Tailgaters Sports Bar & Grill,Get ready for some fun because it's always liv...,FOOD


In [68]:
# Print ID, external ID, title, merchant, description and category for each previewed offer.
# Each offer is emitted as one print call: a blank separator line followed by one line per field.

for _, offer in df.head(n_examples).iterrows():
    print(
        "",
        f"ID: {offer['id']}",
        f"External ID: {offer['external_id']}",
        f"Title: {offer['title']}",
        f"Merchant: {offer['merchant']}",
        f"Description: {offer['description']}",
        f"Category: {offer['category']}",
        sep="\n",
    )


ID: 10732
External ID: 159067
Title: 5% back at Frank Anthony's Italian Restau
Merchant: Frank Anthony's Italian Restaurant
Description: A welcoming family restaurant, Frank Anthony's Italian Restaurant offers delicious gourmet Italian favorites created with the best ingredients and time-honored recipes. Here you'll find helpful servers providing a great atmosphere that will have you coming back again and again. Frank Anthony's also offers full service fine catering and hosts private events. 
Category: FOOD

ID: 10733
External ID: 158962
Title: 5% back at Fresh & Meaty Burgers Carson
Merchant: Fresh & Meaty Burgers Carson
Description: Known for amazingly good burgers, Fresh & Meaty Burgers is all about perfecting the art of the humble hamburger. Starting with the very best quality ingredients, every made-to-order burgers is fresh, juicy, and topped just how you like it. There are breakfast picks and daily specials to ask about at this gem, so check it out today! 
Category: FOOD

ID: 1

In [73]:
# establish a cache of embeddings as JSON
# the cache format is {"offers": []}
# get the embeddings from OpenAI for each offer description in the CSV file

def get_json_embeddings(offer_id, offer_external_id, offer_string, model_id,
                        json_file="/Users/gauthierrobe/Documents/DataSpell/cache_6.json"):
    """Request the embedding for one offer and append it to a JSON cache file.

    Args:
        offer_id: Triple offer ID, stored under the "triple_id" key.
        offer_external_id: External offer ID, stored under the "external_id" key.
        offer_string: Text to embed (the caller passes the offer description).
        model_id: Embedding model name, forwarded to get_embedding as `engine`.
        json_file: Cache file the record is appended to. Defaults to the path that
            used to be hard-coded in the body, so existing four-argument calls behave
            as before; pass another path to fill a different intermediary cache file
            without editing this function.

    Returns:
        None. The record is persisted through write_json.
    """
    embedding = get_embedding(offer_string, engine=model_id)
    my_embeddings = {
        "triple_id": offer_id,
        "external_id": offer_external_id,
        "embeddings": embedding
    }
    write_json(my_embeddings, json_file)

In [74]:
# write the embeddings to the intermediary cache file

def write_json(new_data, filename):
    """Append one record to the "offers" list of a JSON cache file.

    The cache format is {"offers": [...]}. The file is read, the record is appended
    in memory, and the whole document is rewritten with a 4-space indent.

    Args:
        new_data: JSON-serialisable record to append.
        filename: Path of the cache file. If it does not exist yet it is created
            with an empty "offers" list before the record is appended.

    Raises:
        KeyError: if the existing file has no "offers" key.
        json.JSONDecodeError: if the existing file is not valid JSON.
        TypeError: if new_data is not JSON-serialisable (the file is left untouched).
    """
    try:
        with open(filename, "r") as file:
            file_data = json.load(file)
    except FileNotFoundError:
        # first write to this cache: start from the documented empty layout
        file_data = {"offers": []}

    file_data["offers"].append(new_data)

    # Serialise before touching the file so a serialisation error cannot leave a
    # half-written cache behind.
    payload = json.dumps(file_data, indent=4)

    # Mode "w" truncates the file. The previous "r+" / seek(0) / dump sequence never
    # truncated, so any bytes beyond the new content survived and corrupted the JSON
    # whenever the rewritten document was shorter than the old one.
    with open(filename, "w") as file:
        file.write(payload)

In [75]:
# create the embeddings

def create_embeddings_cache(model_id):
    """Embed the description of every row of the global `df` and cache the results.

    Each row's id, external_id and description are handed to get_json_embeddings
    together with `model_id`; a progress line is printed after each row.
    """
    for row_label, offer in df.iterrows():
        get_json_embeddings(
            offer['id'],
            offer['external_id'],
            offer['description'],
            model_id,
        )
        print(f"Done with row: {row_label} - {offer['id']}")

In [76]:
# Build the embeddings cache with the ADA-002 embedding model

create_embeddings_cache(model_id="text-embedding-ada-002")

Done with row: 0 - 10732
Done with row: 1 - 10733
Done with row: 2 - 10734
Done with row: 3 - 10735
Done with row: 4 - 10736
Done with row: 5 - 10737
Done with row: 6 - 10738
Done with row: 7 - 10739
Done with row: 8 - 10740
Done with row: 9 - 10741
Done with row: 10 - 10742
Done with row: 11 - 10743
Done with row: 12 - 10744
Done with row: 13 - 10745
Done with row: 14 - 10746
Done with row: 15 - 10747
Done with row: 16 - 10748
Done with row: 17 - 10749
Done with row: 18 - 10750
Done with row: 19 - 10751
Done with row: 20 - 10752
Done with row: 21 - 10753
Done with row: 22 - 10754
Done with row: 23 - 10755
Done with row: 24 - 10756
Done with row: 25 - 10757
Done with row: 26 - 10758
Done with row: 27 - 10759
Done with row: 28 - 10760
Done with row: 29 - 10761
Done with row: 30 - 10762
Done with row: 31 - 10763
Done with row: 32 - 10764
Done with row: 33 - 10765
Done with row: 34 - 10766
Done with row: 35 - 10767
Done with row: 36 - 10768
Done with row: 37 - 10769
Done with row: 38 - 10

In [77]:
# Combine all the intermediary cache files into a single one.
# The chunk files share one naming scheme (cache_1.json ... cache_6.json), so they are
# loaded in a loop rather than with one copy-pasted `with open(...)` stanza per file.

cache_dir = "/Users/gauthierrobe/Documents/DataSpell"
n_cache_files = 6

cache_frames = []
for cache_number in range(1, n_cache_files + 1):
    cache_path = os.path.join(cache_dir, f"cache_{cache_number}.json")
    with open(cache_path) as cache_file:
        # each parsed document is passed to pd.DataFrame exactly as before
        cache_frames.append(pd.DataFrame(json.load(cache_file)))

df_final = pd.concat(cache_frames, ignore_index=True)

# Serialise as a JSON array with one object per DataFrame row
out = df_final.to_json(orient="records")

In [78]:
# Save the merged cache (the JSON string held in `out`) to disk
merged_cache_path = "/Users/gauthierrobe/Documents/DataSpell/triple_cache.json"
with open(merged_cache_path, mode='w') as merged_cache_file:
    merged_cache_file.write(out)

In [86]:
# MongoDB info. We will write the offers and their embeddings + nearest neighbors to MongoDB.
# The connection string embeds the database username and password, so it is read from the
# MONGODB_URI environment variable instead of being stored in the notebook.
# SECURITY: the credentials that were previously hard-coded in this cell have been exposed
# to anyone with access to the notebook and should be rotated.

mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError("MONGODB_URI is not set; export it or add it to your .env file")

mongodb_client = MongoClient(mongodb_uri)
database = mongodb_client['offers-embeddings']
collection = database['offers-embeddings']

In [87]:
# for each offer in the cache file, we calculate the K nearest neighbors

# Parsed embeddings files, keyed by path -> (modification time, parsed JSON)
_EMBEDDINGS_FILE_CACHE = {}


def _load_embeddings_file(embeddings_file):
    """Return the parsed embeddings file, re-reading it only when it changed on disk."""
    modified_at = os.path.getmtime(embeddings_file)
    cached = _EMBEDDINGS_FILE_CACHE.get(embeddings_file)
    if cached is None or cached[0] != modified_at:
        with open(embeddings_file) as json_file:
            cached = (modified_at, json.load(json_file))
        _EMBEDDINGS_FILE_CACHE[embeddings_file] = cached
    return cached[1]


def calculate_recommendations(embeddings_file, source_offer_id, source_offer_external_id, nb_recommendations):
    """Find the nearest neighbours of one offer and store the result in MongoDB.

    The embeddings file is read as a JSON sequence of records, each holding an
    "offers" object with "triple_id" and "embeddings" keys. Cosine distances are
    computed between the source offer's embedding and every embedding in the file.

    Args:
        embeddings_file: Path of the merged embeddings cache.
        source_offer_id: "triple_id" of the offer to find neighbours for.
        source_offer_external_id: External ID, copied into the stored document.
        nb_recommendations: Maximum number of neighbour IDs to keep.

    Returns:
        None. The document (offer_id, offer_external_id, embeddings,
        nearest_neighbors) is handed to insert_in_mongo.

    Raises:
        ValueError: if no record in the file has the requested triple_id.
    """
    # This function is called once per offer; parsing the whole cache on every call
    # dominated the runtime, so the parsed file is memoised per path.
    data = _load_embeddings_file(embeddings_file)

    # Single pass: collect every embedding and pick out the source offer's vector.
    source_embeddings = None
    all_embeddings = []
    for record in data:
        offer = record['offers']
        all_embeddings.append(offer['embeddings'])
        if offer['triple_id'] == source_offer_id:
            # no early exit: if an ID appears twice the last occurrence wins, as before
            source_embeddings = offer['embeddings']

    if source_embeddings is None:
        # previously an empty vector was passed on to the distance computation
        raise ValueError(f"Offer {source_offer_id} not found in {embeddings_file}")

    # calculate the distances
    distances = distances_from_embeddings(source_embeddings, all_embeddings, distance_metric="cosine")

    # get index of nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    list_nearest_neighbors = []
    for i in indices_of_nearest_neighbors:
        # stop after k neighbors are found
        if len(list_nearest_neighbors) >= nb_recommendations:
            break
        neighbor_id = data[i]['offers']['triple_id']
        # skip if same as original offer
        if neighbor_id == source_offer_id:
            continue
        list_nearest_neighbors.append(neighbor_id)

    offer_details = {
        "offer_id": source_offer_id,
        "offer_external_id": source_offer_external_id,
        "embeddings": source_embeddings,
        "nearest_neighbors": list_nearest_neighbors
    }

    insert_in_mongo(offer_details)

In [88]:
def insert_in_mongo(offer_details):
    """Insert one offer document into the global `collection` and print its offer ID."""
    collection.insert_one(offer_details)
    inserted_offer_id = offer_details['offer_id']
    print(f"Inserted Offer ID: {inserted_offer_id!s}")

In [89]:
# This is a slow process (24h on local machine. Can likely be optimized)
# Every offer in the merged cache is sent through calculate_recommendations in turn.

em_cache = 'triple_cache.json'

with open(em_cache) as cache_file:
    datafile = json.load(cache_file)

for cached_record in datafile:
    cached_offer = cached_record['offers']
    calculate_recommendations(em_cache, cached_offer['triple_id'], cached_offer['external_id'], 10)
    print("Just sent Offer: " + str(cached_offer['triple_id']) + " for calculation")

print("All done with embeddings")

Inserted Offer ID: 1
Just sent Offer: 1for calculation
Inserted Offer ID: 2
Just sent Offer: 2for calculation
Inserted Offer ID: 3
Just sent Offer: 3for calculation
Inserted Offer ID: 4
Just sent Offer: 4for calculation
Inserted Offer ID: 5
Just sent Offer: 5for calculation
Inserted Offer ID: 6
Just sent Offer: 6for calculation
Inserted Offer ID: 7
Just sent Offer: 7for calculation
Inserted Offer ID: 8
Just sent Offer: 8for calculation
Inserted Offer ID: 9
Just sent Offer: 9for calculation
Inserted Offer ID: 10
Just sent Offer: 10for calculation
Inserted Offer ID: 11
Just sent Offer: 11for calculation
Inserted Offer ID: 12
Just sent Offer: 12for calculation
Inserted Offer ID: 13
Just sent Offer: 13for calculation
Inserted Offer ID: 14
Just sent Offer: 14for calculation
Inserted Offer ID: 15
Just sent Offer: 15for calculation
Inserted Offer ID: 16
Just sent Offer: 16for calculation
Inserted Offer ID: 17
Just sent Offer: 17for calculation
Inserted Offer ID: 18
Just sent Offer: 18for calc

In [14]:
# NOT USED
def calculate_recommendations_old(embeddings_file, source_offer_id, source_offer_external_id, nb_recommendations):
    """Legacy variant: print and return an offer's nearest neighbours.

    Loads the embeddings file, computes cosine distances from the source offer's
    embedding to every embedding in the file, prints each selected neighbour with
    its distance, and returns a dict with the keys offer_id, offer_external_id,
    embeddings and nearest_neighbors.
    """
    with open(embeddings_file) as cache_handle:
        records = json.load(cache_handle)

    # collect every embedding and pick out the source offer's vector along the way
    source_embeddings = []
    all_embeddings = []
    for record in records:
        offer = record['offers']
        if offer['triple_id'] == source_offer_id:
            source_embeddings = offer['embeddings']
        all_embeddings.append(offer['embeddings'])

    # distances to every offer, then positions ordered from nearest to farthest
    distances = distances_from_embeddings(source_embeddings, all_embeddings, distance_metric="cosine")
    ranked_positions = indices_of_nearest_neighbors_from_distances(distances)

    neighbor_ids = []
    for position in ranked_positions:
        candidate = records[position]['offers']
        # the source offer itself is never recommended
        if candidate['triple_id'] == source_offer_id:
            continue
        # enough neighbours collected
        if len(neighbor_ids) >= nb_recommendations:
            break
        rank = len(neighbor_ids) + 1

        # report the neighbour and its distance
        print(
            f"""
                --- Recommendation #{rank} (nearest neighbor {rank} of {nb_recommendations}) ---
                Offer ID: {candidate['triple_id']}
                External ID: {candidate['external_id']}
                Distance: {distances[position]:0.3f}"""
        )
        neighbor_ids.append(candidate['triple_id'])

    return {
        "offer_id": source_offer_id,
        "offer_external_id": source_offer_external_id,
        "embeddings": source_embeddings,
        "nearest_neighbors": neighbor_ids
    }