In [None]:
import os

import datadotworld as dw
import openai
import pandas as pd
import psycopg

In [None]:
# Database connection settings.
# Each value can be overridden through an environment variable so real
# credentials never have to be committed with the notebook; the literals
# below are only the fall-back defaults that were previously hard-coded.
DB_NAME = os.environ.get("PRODUCT_CATALOG_DB_NAME", "ai_product_catalog")
DB_USER = os.environ.get("PRODUCT_CATALOG_DB_USER", "ai_product_catalog")
DB_PASSWORD = os.environ.get("PRODUCT_CATALOG_DB_PASSWORD", "ai_product_catalog123")
DB_HOST = os.environ.get("PRODUCT_CATALOG_DB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("PRODUCT_CATALOG_DB_PORT", "5432")
# Key/value connection string handed to psycopg.connect() in the cells below.
DB_CONNECTION_STRING = f"host={DB_HOST} port={DB_PORT} dbname={DB_NAME} user={DB_USER} password={DB_PASSWORD}"

# Description of the single category row that every loaded product is attached to.
CATEGORY_DESC = "Shoes"

# Embedding provider name and model, stored next to each embedding row.
PROVIDER_OPENAI = "openai"
PROVIDER_OPENAI_EMBEDDINGS_MODEL = "text-embedding-ada-002"

In [None]:
# Fetch the product dataset through the data.world client and show its description.
dataset_key = 'data-hut/product-data-from-nike'
product_dataset = dw.load_dataset(dataset_key)
product_dataset.describe()

In [None]:
# Pick the one table of the dataset this notebook works with and preview it.
table_name = "nike_2020_04_13"
df = product_dataset.dataframes[table_name]
df.head()

In [None]:
# Quick profile of the columns that are loaded into the database
# (distinct counts and maximum text lengths).
def _distinct_count(column):
    """Number of distinct values in a dataframe column."""
    return len(df[column].drop_duplicates())


def _longest(column):
    """Length of the longest string in a dataframe column."""
    return df[column].str.len().max()


print(f"Shape = {df.shape}")
print(f"Number of unique Product IDs = {_distinct_count('product_id')}")
print(f"Maximum Length of Product ID Column = {_longest('product_id')}")
print(f"Number of unique Brands = {_distinct_count('brand')}")
print(f"Maximum Length of Product Name Column = {_longest('product_name')}")
print(f"Maximum Length of Product Description Column = {_longest('description')}")
print(f"Maximum Length of Brand Column = {_longest('brand')}")


In [None]:
# Insert the category row and keep the key generated by the database;
# category_id stays None when the INSERT returns no row.
category_id = None

with psycopg.connect(DB_CONNECTION_STRING) as db_connection, db_connection.cursor() as c:
    c.execute(
        """
              INSERT INTO categories(category_desc) VALUES (%s) RETURNING category_id;
          """,
        (CATEGORY_DESC,),
    )

    inserted_row = c.fetchone()
    category_id = inserted_row[0] if inserted_row else None

    db_connection.commit()

if category_id is not None:
    print(f"Category Created.  Category_ID = {category_id} for Category_Name = {CATEGORY_DESC}")
else:
    print("Issue with category creation.  Was not created.")

In [None]:
# One row per distinct brand, under the column name "Brand" that the
# brand INSERT statement binds as %(Brand)s.
distinct_brands = df['brand'].drop_duplicates()
brandsDF = distinct_brands.to_frame(name="Brand")
print(brandsDF.shape)
brandsDF.head()

In [None]:
# Insert every distinct brand and report the keys generated by the database.
# Rows without a brand (NaN) are skipped so no placeholder brand is created.
brand_records = brandsDF.dropna(subset=["Brand"]).to_dict(orient="records")
rows = []

if brand_records:
    with psycopg.connect(DB_CONNECTION_STRING) as db_connection:
        with db_connection.cursor() as c:
            c.executemany(
                query =
                  """
                      INSERT INTO brands ( brand_desc )
                      VALUES ( %(Brand)s )
                      RETURNING brand_id, brand_desc
                  """,
                params_seq = brand_records,
                returning=True
            )

            # With returning=True the cursor holds one result set per executed
            # statement. fetchall() only reads the current one, so every set
            # has to be visited with nextset() to see all inserted rows.
            while True:
                rows.extend(c.fetchall())
                if not c.nextset():
                    break

if rows:
    for row in rows:
        print ("Inserted Brand Row.  ID =", row[0], "Description =", row[1])
else:
    print ("No Brand Rows Created!")

In [None]:
# Derive the msrp column: sale_price as float divided by 100.
# NOTE(review): presumably sale_price is stored in cents - confirm against the dataset.
sale_price_as_float = df['sale_price'].astype('float')
df['msrp'] = sale_price_as_float / 100.0
df['msrp']

In [None]:
# Insert one products row per dataframe row and collect the generated product ids.
if category_id is None:
    # Previously a missing category surfaced as a SQL error because the value
    # was pasted into the statement text; fail early with a clear message instead.
    raise RuntimeError("category_id is not set - run the category creation cell first.")

# Only the columns bound in the statement are sent. pandas marks missing values
# with NaN, which would otherwise be sent as a float; map them to None so they
# are stored as SQL NULL.
product_columns = ["product_id", "brand", "product_name", "description", "msrp"]
product_records = [
    {
        **{column: (None if pd.isna(value) else value) for column, value in record.items()},
        "category_id": category_id,
    }
    for record in df[product_columns].to_dict(orient="records")
]

productIds = []

if product_records:
    with psycopg.connect(DB_CONNECTION_STRING) as db_connection:
        with db_connection.cursor() as c:
            c.executemany(
                query = """
                    INSERT INTO products
                    (
                        sku,
                        brand_id,
                        product_name,
                        product_desc,
                        size,
                        msrp,
                        category_id
                    )
                    VALUES
                    (
                          %(product_id)s,
                          (select brand_id from brands where brand_desc=%(brand)s fetch first 1 rows only),
                          %(product_name)s,
                          %(description)s,
                          null,
                          cast(%(msrp)s as double precision),
                          %(category_id)s
                    )
                    RETURNING product_id
                """,
                params_seq = product_records,
                returning = True
            )

            # One result set per executed statement: visit them all with
            # nextset(), a single fetchall() would only return the first id.
            while True:
                productIds.extend(c.fetchall())
                if not c.nextset():
                    break

print("Inserted", len(productIds), "product rows")
# Displayed at cell level (outside the with block) so the notebook renders it.
productIds[:5]


In [None]:
def get_embedding_from_db(productName, sku, price, brand, description, model):
    """Return the stored embedding of a product, or None when none is stored.

    The product is located by name, sku, brand and description, and the
    embedding must have been stored for engine PROVIDER_OPENAI and `model`.

    Parameters:
        productName: product name to match against products.product_name.
        sku: value to match against products.sku.
        price: unused by the lookup; kept so the signature matches create_embedding.
        brand: brand description used to resolve products.brand_id.
        description: product description; any non-string value (None, or the
            float NaN pandas uses for missing text) is matched against NULL.
        model: embedding model name to match against product_embeddings.model.

    Returns:
        The first column of the matching product_embeddings row, or None.
    """
    # A missing description arrives from pandas as float NaN, not as a string.
    description_param = description if isinstance(description, str) else None

    # Every value is bound as a parameter instead of being pasted into the SQL
    # text, so quotes in names or brands need no manual escaping.
    # "is not distinct from" is a null-safe equality, so a NULL description
    # matches rows whose product_desc is NULL. The description is compared
    # unmodified, which is how the product INSERT stores it.
    sql = """
        select embedding
        from product_embeddings
        where model = %(model)s
          and engine = %(engine)s
          and product_id =
            (
                select product_id
                from products
                where product_name = %(product_name)s
                  and sku = %(sku)s
                  and brand_id = (select brand_id from brands where brand_desc = %(brand)s fetch first 1 rows only)
                  and product_desc is not distinct from %(description)s
                fetch first 1 rows only
            )
    """
    params = {
        "model": model,
        "engine": PROVIDER_OPENAI,
        "product_name": productName,
        "sku": sku,
        "brand": brand,
        "description": description_param,
    }

    with psycopg.connect(DB_CONNECTION_STRING) as db_connection:
        with db_connection.cursor() as c:
            c.execute(sql, params)
            record = c.fetchone()

    if record is None:
        return None

    return record[0]

In [None]:
# Running count of products that needed a new embedding; create_embedding()
# updates it through `global counter`.
counter = 0

# OpenAI API client used by create_embedding() to request embeddings.
client = openai.OpenAI()

def create_embedding(productName, sku, price, brand, description, model, max_new_embeddings=1):
    """Create an embedding for a product unless one is already stored.

    Parameters:
        productName, sku, price, brand, description: product attributes that
            make up the text sent to the embedding model.
        model: name of the embedding model to call.
        max_new_embeddings: upper bound on how many products may get a new
            embedding, tracked across calls in the module-level `counter`.
            The default of 1 keeps the limit the notebook always had -
            presumably a development throttle, confirm before bulk runs.
            Pass None to remove the limit.

    Returns:
        The embedding vector returned by the OpenAI client, or None when an
        embedding already exists in the database or the limit was reached.
    """
    global counter

    # Nothing to do when the database already holds an embedding for this product.
    if get_embedding_from_db(productName, sku, price, brand, description, model) is not None:
        return None

    counter = counter + 1
    if max_new_embeddings is not None and counter > max_new_embeddings:
        return None

    # A missing description arrives from pandas as float NaN; embed it as
    # empty text rather than the literal word "nan".
    description_text = description if isinstance(description, str) else ""

    # f-string: the product values must be substituted into the text. Without
    # the prefix every product was embedded as the same literal template.
    text = f"""'{productName}', '{sku}', {price}, '{brand}', '{description_text}'"""
    text = text.replace("\n", " ")

    return client.embeddings.create(input = [text], model=model).data[0].embedding

def _embedding_for_row(row):
    """Call create_embedding with the columns of one dataframe row."""
    return create_embedding(
        row["product_name"],
        row["product_id"],
        row["msrp"],
        row["brand"],
        row["description"],
        model=PROVIDER_OPENAI_EMBEDDINGS_MODEL,
    )


# Row-wise call; the column holds None for every row that got no new embedding.
df['embeddingToStore'] = df.apply(_embedding_for_row, axis=1)

In [None]:
# Keep only the rows whose embeddingToStore value is not null.
embeddingsDF = df[df['embeddingToStore'].notnull()]
print("Shape:", embeddingsDF.shape)
# Report the vector length from the first stored embedding; guarded because
# the filtered frame can be empty.
if not embeddingsDF.empty:
    print("Vector Length:", len(embeddingsDF['embeddingToStore'].iloc[0]))
embeddingsDF.head()

In [None]:
# Rows with a non-null embeddingToStore value; this is the frame that the
# embedding INSERT reads.
has_embedding = df['embeddingToStore'].notna()
embeddingsDF = df.loc[has_embedding]
print("Shape:", embeddingsDF.shape)
embeddingsDF.head()

In [None]:
# Store each new embedding against the product it was created for.
# Only the values the statement binds are sent. A missing description (float
# NaN in pandas) is mapped to None so it is compared as SQL NULL instead of
# being sent as a float against a text column.
embedding_records = [
    {
        "product_name": record["product_name"],
        "product_id": record["product_id"],
        "brand": record["brand"],
        "description": record["description"] if isinstance(record["description"], str) else None,
        "embeddingToStore": record["embeddingToStore"],
        "engine": PROVIDER_OPENAI,
        "model": PROVIDER_OPENAI_EMBEDDINGS_MODEL,
    }
    for record in embeddingsDF.to_dict(orient="records")
]

if embedding_records:
    with psycopg.connect(DB_CONNECTION_STRING) as db_connection:
        with db_connection.cursor() as c:
            # "is not distinct from" is a null-safe equality: with "=" a product
            # whose description is NULL would never be found and the embedding
            # would be inserted without a product id.
            c.executemany(
                """
                    INSERT INTO product_embeddings
                    (
                      product_id,
                      engine,
                      model,
                      embedding
                    )
                    VALUES
                    (
                        (
                            select product_id
                            from products
                            where product_name=%(product_name)s
                              and sku=%(product_id)s
                              and brand_id=(select brand_id from brands where brand_desc=%(brand)s fetch first 1 rows only)
                              and product_desc is not distinct from %(description)s
                            fetch first 1 rows only
                        ),
                        %(engine)s,
                        %(model)s,
                        %(embeddingToStore)s
                    )
                """,
                embedding_records,
            )

print("Embedding rows submitted:", len(embedding_records))