In [None]:
import datadotworld as dw
import pandas as pd
import os

import ProductDataLib


In [None]:
# Notebook-wide configuration.

# data.world dataset key passed to the loader and to ProductDataSet.
DATASET_NAME = "data-hut/product-data-from-nike"

# Category label written to every imported row.
CATEGORY_DESC = "Shoes"

# Model identifier handed to ProductDataSet for embedding generation.
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [None]:
# Database connection settings.
#
# Each value is read from the environment variable of the same name and falls
# back to a local-development default when that variable is not set.
DB_HOST = os.environ.get("DB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "ai_product_catalog")
DB_USER = os.environ.get("DB_USER", "ai_product_catalog")
# NOTE(review): this default is a development-only credential; set DB_PASSWORD
# in the environment for any real deployment.
DB_PASSWORD = os.environ.get("DB_PASSWORD", "ai_product_catalog123")

# Full connection string (includes the password) used by the import cell below.
DB_CONNECTION_STRING = f"host={DB_HOST} port={DB_PORT} dbname={DB_NAME} user={DB_USER} password={DB_PASSWORD}"

# Notebook outputs are saved with the file, so never echo the real password:
# display a copy of the connection string with the secret masked instead.
DB_CONNECTION_STRING_REDACTED = (
    f"host={DB_HOST} port={DB_PORT} dbname={DB_NAME} user={DB_USER} password=***"
)
print("DB_CONNECTION_STRING:", DB_CONNECTION_STRING_REDACTED)

In [None]:
# Load the dataset identified by DATASET_NAME through the datadotworld client.
product_dataset = dw.load_dataset(DATASET_NAME)
# Last expression of the cell: the result of describe() is shown as cell output.
product_dataset.describe()

In [None]:
# Pick the "nike_2020_04_13" table out of the loaded dataset and drop rows
# that are exact duplicates of an earlier row.
df = product_dataset.dataframes["nike_2020_04_13"].drop_duplicates()

# Report (rows, columns) after de-duplication, then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Profile the columns that are imported further down: distinct-value counts
# and the longest string in each text column.

def _distinct_count(column):
    """Return the number of distinct values in df[column]."""
    return len(df[column].drop_duplicates())


def _longest_value(column):
    """Return the maximum string length found in df[column]."""
    return df[column].str.len().max()


# print() joins its arguments with a single space, which reproduces the
# "label = value" layout exactly.
print("Shape =", df.shape)
print("Number of unique Product IDs =", _distinct_count('product_id'))
print("Maximum Length of Product ID Column =", _longest_value('product_id'))
print("Number of unique Brands =", _distinct_count('brand'))
print("Maximum Length of Product Name Column =", _longest_value('product_name'))
print("Maximum Length of Product Description Column =", _longest_value('description'))
print("Maximum Length of Brand Column =", _longest_value('brand'))

In [None]:
# Derive the 'msrp' column from 'sale_price', converted to float and divided
# by 100.0 (presumably the source stores prices in cents -- TODO confirm).
sale_price_as_float = df['sale_price'].astype('float')
df['msrp'] = sale_price_as_float.div(100.0)

# Show the derived column as the cell output.
df['msrp']

In [None]:
# Tag every row with the single category this dataset represents.
df["category"] = CATEGORY_DESC

# Replace missing descriptions with empty strings.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate Series, which triggers a chained-assignment warning on recent
# pandas releases and leaves df unchanged when Copy-on-Write is enabled.
df["description"] = df["description"].fillna('')

In [None]:
# Build the product data set wrapper for this dataset, database and model.
productDataSet = ProductDataLib.ProductDataSet(
    DATASET_NAME,
    DB_CONNECTION_STRING,
    SENTENCE_TRANSFORMER_MODEL,
)

# Map each source dataframe column to the ProductColumns field it feeds.
product_columns = productDataSet.ProductColumns
column_mapping = {
    "product_id": product_columns.SKU,
    "msrp": product_columns.PRICE,
    "brand": product_columns.BRAND_DESC,
    "category": product_columns.CATEGORY_DESC,
    "product_name": product_columns.NAME,
    "description": product_columns.DESC,
}

# Import the prepared dataframe, then run the persistence and embedding steps
# in the order the library is called here: persist -> load -> refresh -> persist.
resultDF = productDataSet.import_df(df, column_mapping)
productDataSet.persist()
productDataSet.load_embeddings()
productDataSet.refresh_embeddings()
productDataSet.persist_embeddings()

# Plain-text preview of what import_df returned.
print(resultDF.head())
