# Generate data with AI

In [24]:
!pip install openai
import openai;
COMPLETION_MODEL = "text-davinci-003"

def generate_data_by_prompt(prompt):
    """Send ``prompt`` to the completion model and return the generated text.

    Args:
        prompt: The instruction text passed to the completion endpoint.

    Returns:
        The ``text`` of the first choice in the completion response.
    """
    # Request settings: model taken from COMPLETION_MODEL, moderate randomness,
    # and up to 2048 generated tokens.
    request = {
        "engine": COMPLETION_MODEL,
        "prompt": prompt,
        "temperature": 0.5,
        "max_tokens": 2048,
        "top_p": 1,
    }
    completion = openai.Completion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text

# Ask the model for 50 Taobao-style product titles in the 3C/digital category,
# about 30 characters each, with promotional wording, one title per line.
prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是3C数码产品，标题里往往也会有一些促销类的信息，每行一条。"""
data = generate_data_by_prompt(prompt=prompt)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
# handle data with pandas
import pandas as pd

# One generated title per line; skip blank lines so they never become rows.
product_names = [line.strip() for line in data.splitlines() if line.strip()]
df = pd.DataFrame({'product_name': product_names})
# Remove only the leading list number ("1.", "2、", "3)") that the model prepends.
# Splitting on '.' and taking element [1] raised IndexError for lines without a dot
# and truncated titles that contain their own dot (e.g. "6.1英寸").
df['product_name'] = (
    df['product_name']
    .str.replace(r'^\d+\s*[.．、)）]\s*', '', regex=True)
    .str.strip()
)
df.head()

Unnamed: 0,product_name
0,【新款】Apple/苹果 iPhone 11 Pro Max 智能手机
1,【限时特惠】华为 P30 Pro 全面屏手机
2,【超值热卖】小米 9 Pro 5G 双模旗舰
3,【热销爆款】三星 Galaxy S20 Ultra 5G 旗舰
4,【狂欢优惠】OPPO Reno 10X Zoom 全面屏


In [27]:
# Generate women's clothing and bag titles with the same prompt template.
clothes_prompt = """请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是女性的服饰箱包等等，标题里往往也会有一些促销类的信息，每行一条。"""
clothes_data = generate_data_by_prompt(clothes_prompt)
# One generated title per line; skip blank lines so they never become rows.
clothes_product_names = [line.strip() for line in clothes_data.splitlines() if line.strip()]
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
# Remove only the leading list number ("1.", "2、", "3)") that the model prepends.
# Splitting on '.' and taking element [1] raised IndexError for lines without a dot
# and truncated titles that contain their own dot.
clothes_df['product_name'] = (
    clothes_df['product_name']
    .str.replace(r'^\d+\s*[.．、)）]\s*', '', regex=True)
    .str.strip()
)
clothes_df.head()

Unnamed: 0,product_name
0,【新款】时尚潮流女士双肩包，折扣价格！
1,【热销】2020春夏季新款单肩手提包！
2,【促销】特价活动！拼色牛皮女士包包！
3,【特惠】超火爆！时尚百搭小方包！
4,【抢购】精选高档真皮女士手提包！


In [5]:
# Stack the 3C titles and the clothing titles into one frame with a fresh 0..n-1 index.
# Note: `df` is reassigned, so re-running this cell appends clothes_df again.
df = pd.concat([df, clothes_df], ignore_index=True)
display(df)

Unnamed: 0,product_name
0,【新款】iPhone XS Max 智能手机 全网通6G+128G
1,【热卖】小米MIX 3 全面屏拍照旗舰手机
2,【特价】华为P30Pro 旗舰手机 8G+128G 全网通
3,【精选】vivo NEX 3 5G 手机 8G+256G 超薄全面屏
4,【超值】苹果iPad Pro 11英寸 平板电脑
...,...
93,全新款式女士晚宴包，抢购价格低！
94,潮流时尚女士双肩包，优惠限量抢购！
95,精致新潮女士单肩斜挎包，超值特惠！
96,时尚小巧女士小熊包包，折扣限量抢购！


# Semantic search with embeddings

In [36]:
!pip install backoff;
from openai.embeddings_utils import get_embeddings
import openai, os, backoff;

embedding_model = "text-embedding-ada-002";

batch_size = 100

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    """Embed ``prompts`` in chunks of ``batch_size`` and return all vectors.

    The decorator re-invokes the whole function with exponential backoff
    whenever ``openai.error.RateLimitError`` is raised.

    Args:
        prompts: List of texts to embed.
        engine: Name of the embedding model passed to ``get_embeddings``.

    Returns:
        A flat list holding the embeddings of every chunk, in chunk order.
    """
    collected = []
    for start in range(0, len(prompts), batch_size):
        chunk = prompts[start:start + batch_size]
        collected.extend(get_embeddings(list_of_text=chunk, engine=engine))
    return collected

# Titles to embed, in DataFrame row order, pre-split into chunks of batch_size.
prompts = df.product_name.tolist()
prompt_batches = [
    prompts[start:start + batch_size]
    for start in range(0, len(prompts), batch_size)
]

# Embed chunk by chunk and flatten the per-chunk results in chunk order.
embeddings = [
    vector
    for chunk in prompt_batches
    for vector in get_embeddings_with_backoff(prompts=chunk, engine=embedding_model)
]

# Attach the vectors to the frame and persist it.
df["embedding"] = embeddings
df.to_parquet("taobao_product_title.parquet", index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# Rank product titles by embedding similarity to a free-text query.
def search_product(df, query, n=3, pprint=True):
    """Return the ``n`` product names whose embeddings are closest to ``query``.

    Side effect: writes (or overwrites) a ``similarity`` column on ``df`` holding
    the cosine similarity between each row's embedding and the query embedding.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        query: Free-text search string; embedded with ``embedding_model``.
        n: Number of top matches to return.
        pprint: When true, also print each returned product name.

    Returns:
        The ``product_name`` values of the ``n`` most similar rows, most similar first.
    """
    query_vector = get_embedding(query, engine=embedding_model)

    def score(row_embedding):
        return cosine_similarity(row_embedding, query_vector)

    df["similarity"] = df.embedding.apply(score)

    ranked = df.sort_values("similarity", ascending=False)
    results = ranked.head(n).product_name
    if pprint:
        for title in results:
            print(title)
    return results

# Example query ("natural, understated backpack"); pprint defaults to True,
# so the 3 closest titles are printed as well as returned.
results = search_product(df, "自然淡雅背包", n=3)

【时尚精选】荣耀 MagicBook Pro 笔记本
【夏日热卖】联想 ThinkPad X1 Carbon 笔记本
【超值特惠】华为 MateBook 13 英寸轻薄本


# Cold-start product recommendations using embeddings

In [38]:
def recommend_product(df, product_name, n=3, pprint=True):
    """Recommend the ``n`` products most similar to an existing product.

    Side effect: writes (or overwrites) a ``similarity`` column on ``df`` holding
    the cosine similarity between each row's embedding and the seed embedding.

    Args:
        df: DataFrame with ``product_name`` and ``embedding`` columns.
        product_name: Exact title of the seed product; must be present in ``df``.
        n: Number of recommendations to return.
        pprint: When true, also print each recommended product name.

    Returns:
        The ``product_name`` values of the ``n`` most similar *other* rows, most
        similar first, or ``None`` (after printing a message) when the seed
        product is not in ``df``.
    """
    is_seed = df['product_name'] == product_name
    if not is_seed.any():
        print(f"Product '{product_name}' not found in the dataframe.")
        return None
    product_embedding = df[is_seed].iloc[0].embedding
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    # The seed always scores 1.0 against itself, so without this filter it would
    # come back as its own top recommendation.
    results = (
        df[~is_seed]
        .sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

# The seed title must match a row in df exactly; otherwise recommend_product
# prints a "not found" message and returns None.
results = recommend_product(df, "【热卖】小米MIX 3 全面屏拍照旗舰手机", n=3)

Product '【热卖】小米MIX 3 全面屏拍照旗舰手机' not found in the dataframe.


# Speeding up search with FAISS

In [40]:
# !pip install faiss-cpu;
import faiss;
import numpy as np
def load_embeddings_to_faiss(df):
    """Build a flat L2 FAISS index from the ``embedding`` column of ``df``.

    Args:
        df: DataFrame whose ``embedding`` column holds equal-length vectors.

    Returns:
        A ``faiss.IndexFlatL2`` containing one float32 vector per row of ``df``,
        added in row order.
    """
    # Convert to a float32 matrix before handing the vectors to the index.
    matrix = np.asarray(df['embedding'].tolist()).astype('float32')
    dimension = matrix.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(matrix)
    return flat_index

# Build the index once from df; search_index maps the returned positions back to df rows.
index = load_embeddings_to_faiss(df)

In [41]:
def search_index(index, df, query, k=5):
    """Return the nearest product titles to ``query`` using a FAISS index.

    Args:
        index: FAISS index; the positions it returns are used as row positions
            in ``df``.
        df: DataFrame with a ``product_name`` column.
        query: Free-text search string; embedded with ``embedding_model``.
        k: Maximum number of neighbours to retrieve.

    Returns:
        A list with one ``(distances, product_names)`` tuple per query vector
        (a single tuple here). ``distances`` is a NumPy array and
        ``product_names`` a list of the same length, nearest first. Fewer than
        ``k`` entries are returned when the index holds fewer than ``k`` vectors.
    """
    query_vector = np.array(get_embedding(query, engine=embedding_model)).reshape(1, -1).astype('float32')
    distances, indexes = index.search(query_vector, k)

    results = []
    for row_distances, row_indexes in zip(distances, indexes):
        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; df.iloc[-1] would silently return the last row instead,
        # so drop the padded slots.
        found = row_indexes >= 0
        product_names = df.iloc[row_indexes[found]]['product_name'].values.tolist()
        results.append((row_distances[found], product_names))
    return results

# Look up the 3 nearest titles for the example query ("natural, understated backpack").
products = search_index(index, df, "自然淡雅背包", k=3)

# Print each hit next to its distance.
for distances, product_names in products:
    for name, distance in zip(product_names, distances):
        print(name, distance)

【时尚精选】荣耀 MagicBook Pro 笔记本 0.37121534
【夏日热卖】联想 ThinkPad X1 Carbon 笔记本 0.38607883
【超值特惠】华为 MateBook 13 英寸轻薄本 0.38635442
