In [None]:
!pip install -q "sentence-transformers>=3.0.0" "faiss-cpu>=1.8.0" "langchain>=0.2.0" langchain-community

import os, time, gc, numpy as np, torch, pandas as pd
from google.colab import drive
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m

In [None]:
# ---------------- Settings (edit as needed) ----------------
# Locations on the mounted Google Drive.
DRIVE_DATA_PATH = "/content/drive/MyDrive/data/articles.csv"  # path to the source CSV
SAVE_DIR = "/content/drive/MyDrive/data/rag_faiss_demo1"  # folder holding the cached embeddings / index

# Artefacts written inside SAVE_DIR: embedding matrix, document texts,
# and the directory used by the LangChain FAISS store.
EMB_NPY, TXT_NPY, FAISS_DIR = (
    os.path.join(SAVE_DIR, leaf)
    for leaf in ("embeddings.npy", "texts.npy", "faiss_index")
)

# Embedding model and encoding batch size.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # alternative: "sentence-transformers/all-MiniLM-L6-v2"
BATCH_SIZE = 256  # tune to the GPU: 128 / 256 / 512

# Set to an integer (e.g. 1000) to try only the first rows; None uses every row.
USE_HEAD = None

# Smoke-test search settings.
QUERY_EXAMPLE = "red cotton t-shirt"  # sample query
TOP_K = 5  # number of hits to return
# -----------------------------------------------------------

In [None]:
# 0) Mount Google Drive.
drive.mount('/content/drive', force_remount=True)

# Make sure the output directory exists before anything is written to it.
os.makedirs(SAVE_DIR, exist_ok=True)

# 1) Load the data and build one text document per row.
print("Loading CSV from:", DRIVE_DATA_PATH)
data = pd.read_csv(DRIVE_DATA_PATH)

if USE_HEAD is not None:
    data = data.head(USE_HEAD)

# Columns concatenated (in this order) into each document.
DOC_COLUMNS = [
    "prod_name",
    "product_type_name",
    "colour_group_name",
    "index_name",
    "detail_desc",
]

# Fail early with a clear message instead of a KeyError halfway through the loop.
missing_columns = [col for col in DOC_COLUMNS if col not in data.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required columns: {missing_columns}")


def _row_to_doc(fields):
    """Join the non-missing values of one row with " - ".

    Missing cells (NaN/None) are skipped; formatting them directly would put
    the literal text "nan" into the document that later gets embedded.
    Rows without missing values produce exactly "a - b - c - d - e".
    """
    return " - ".join(str(value) for value in fields if not pd.isna(value))


# itertuples over only the needed columns avoids building a Series per row
# (which is what iterrows does).
docs = np.array(
    [_row_to_doc(fields) for fields in data[DOC_COLUMNS].itertuples(index=False, name=None)],
    dtype=object,
)

# Cache the texts so later runs can reuse them. This is an object array, so it
# is stored via pickle and must be read back with np.load(..., allow_pickle=True).
np.save(TXT_NPY, docs)
print(f"Docs prepared: {len(docs)} items; sample[420] =>")
if len(docs) > 420:
    print(docs[420][:200])
else:
    print("(less than 421 items; skip sample)")

Mounted at /content/drive
Loading CSV from: /content/drive/MyDrive/data/articles.csv
Docs prepared: 105542 items; sample[420] =>
Theron (1) - Hoodie - Grey - Ladieswear - Jacket in sweatshirt fabric with a lined drawstring hood, zip down the front, side pockets and ribbing at the cuffs and hem.


In [None]:
# 2) Load the embedding model and place it on the GPU when one is available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Torch CUDA available:", torch.cuda.is_available())
print("Using device:", DEVICE)

model = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Cap the number of CPU threads torch uses; intended to keep the CPU from being
# oversubscribed so the GPU stays fed (tune as needed). This is best-effort:
# a failure is reported and ignored, but KeyboardInterrupt/SystemExit are no
# longer swallowed the way a bare `except:` would.
try:
    torch.set_num_threads(4)
except Exception as err:
    print(f"Warning: could not limit torch CPU threads ({err!r}); keeping the default.")


Torch CUDA available: True
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# 3) Compute the document embeddings, or reuse the cached copy when it is valid.
embs = None
if os.path.exists(EMB_NPY):
    cached_embs = np.load(EMB_NPY)
    # Only trust the cache when it has one row per current document; a cache
    # written for a different row count (e.g. another USE_HEAD) is stale.
    if cached_embs.ndim == 2 and cached_embs.shape[0] == len(docs):
        embs = cached_embs
        print(f"Loaded cached embeddings: {EMB_NPY} | shape={embs.shape}")
    else:
        print(
            f"Cached embeddings at {EMB_NPY} have shape {cached_embs.shape} "
            f"but {len(docs)} docs are loaded; re-encoding."
        )

if embs is None:
    print("Encoding embeddings (this uses GPU if available)...")
    t0 = time.time()
    embs = model.encode(
        docs.tolist(),
        batch_size=BATCH_SIZE,
        convert_to_numpy=True,
        normalize_embeddings=True,   # unit-length vectors, handy for cosine / inner-product search later
        show_progress_bar=True
    )
    np.save(EMB_NPY, embs)
    print(f"Embeddings saved to {EMB_NPY} | shape={embs.shape} | took {time.time()-t0:.1f}s")

Encoding embeddings (this uses GPU if available)...


Batches:   0%|          | 0/413 [00:00<?, ?it/s]

Embeddings saved to /content/drive/MyDrive/data/rag_faiss_demo1/embeddings.npy | shape=(105542, 768) | took 370.9s


In [None]:
# 4) 建立或載入 FAISS 索引
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import FAISS
import numpy as np
import os, time

# LangChain's FAISS store needs an Embeddings object to compute query vectors.
class PrecomputedEmbeddings(Embeddings):
    """Embeddings adapter around a sentence-transformers style model.

    The document vectors for the index are computed ahead of time and loaded
    with ``FAISS.from_embeddings``, so this adapter is mainly used to embed
    queries. ``embed_documents`` is still implemented so that any later call
    through the Embeddings interface gets real vectors rather than a silent
    empty list.

    Args:
        st_model: object exposing ``encode(texts, convert_to_numpy=...,
            normalize_embeddings=...)`` that returns one vector per text.
    """

    def __init__(self, st_model):
        self.st_model = st_model

    def embed_documents(self, texts):
        """Return one normalized embedding (list of floats) per input text.

        An empty input yields an empty list without calling the model.
        """
        texts = list(texts)
        if not texts:
            return []
        vectors = self.st_model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True
        )
        return [vec.tolist() for vec in vectors]

    def embed_query(self, text):
        """Return the normalized embedding of a single query string as a list of floats."""
        # Same normalization as the stored document vectors, so query and
        # document vectors are comparable.
        v = self.st_model.encode([text], convert_to_numpy=True, normalize_embeddings=True)
        return v[0].tolist()

# Embedder handed to LangChain; it is used to embed queries at search time.
lc_embedder = PrecomputedEmbeddings(model)

# docs must be a list of str; embs an (N, d) array with one row per document.
# Cast to float32 once here instead of once per row.
docs_list = docs.tolist() if hasattr(docs, "tolist") else list(docs)
embs_array = np.asarray(embs, dtype=np.float32)  # shape: (N, d)
assert len(docs_list) == len(embs_array), "docs 與 embs 筆數不一致"

# save_local() writes "index.faiss" (plus "index.pkl") into FAISS_DIR. Check
# for the index file itself: the directory may exist but be empty or partial.
FAISS_INDEX_FILE = os.path.join(FAISS_DIR, "index.faiss")

faiss_store = None
if os.path.exists(FAISS_INDEX_FILE):
    print(f"Loading existing FAISS index from: {FAISS_DIR}")
    # load_local needs an Embeddings object for query-time embedding.
    # SECURITY: allow_dangerous_deserialization=True unpickles the stored
    # docstore; only load index directories this notebook wrote itself.
    loaded_store = FAISS.load_local(FAISS_DIR, lc_embedder, allow_dangerous_deserialization=True)
    if loaded_store.index.ntotal == len(docs_list):
        faiss_store = loaded_store
    else:
        # The index was built from a different document set; do not reuse it.
        print(
            f"Cached index holds {loaded_store.index.ntotal} vectors "
            f"but {len(docs_list)} docs are loaded; rebuilding."
        )

if faiss_store is None:
    print("Building FAISS index from precomputed embeddings...")
    t0 = time.time()

    # text_embeddings: List[Tuple[str, List[float]]]
    text_embeddings = [
        (txt, vec.tolist())
        for txt, vec in zip(docs_list, embs_array)
    ]

    # Only text_embeddings is required; `embedding` is kept on the store and
    # used later to embed queries.
    faiss_store = FAISS.from_embeddings(
        text_embeddings=text_embeddings,
        embedding=lc_embedder,
        # metadatas=metas,  # optional: per-document metadata
        # ids=ids_list,     # optional: caller-managed document IDs
    )

    os.makedirs(FAISS_DIR, exist_ok=True)
    faiss_store.save_local(FAISS_DIR)
    print(f"FAISS index saved to {FAISS_DIR} | took {time.time()-t0:.1f}s")


Building FAISS index from precomputed embeddings...
FAISS index saved to /content/drive/MyDrive/data/rag_faiss_demo1/faiss_index | took 14.4s


In [None]:
# 5) Smoke-test the index with the sample query.
print("\n=== Test Query ===")
print("Query:", QUERY_EXAMPLE)
results = faiss_store.similarity_search(QUERY_EXAMPLE, k=TOP_K)
for rank, hit in enumerate(results, start=1):
    # Show only the first 200 characters so the output stays short.
    snippet = hit.page_content[:200]
    print(f"{rank}. {snippet}")

# Closing summary listing where each artefact was written.
summary_lines = (
    "\nAll done ✅ 你現在擁有：",
    f"- 文本快取：{TXT_NPY}",
    f"- 嵌入快取：{EMB_NPY}",
    f"- FAISS 索引：{FAISS_DIR}",
    "之後重跑時會自動偵測並跳過已完成的步驟（可續跑）。",
)
for line in summary_lines:
    print(line)


=== Test Query ===
Query: red cotton t-shirt
1. 1 - PACK - T-shirt - Light Red - Ladieswear - Fitted T-shirt in soft organic cotton jersey.
2. 1 - PACK - T-shirt - Red - Ladieswear - Fitted T-shirt in soft organic cotton jersey.
3. MAXENCE - T-shirt - Grey - Menswear - Round-necked T-shirt in sturdy cotton piqué.
4. MAXENCE - T-shirt - Light Grey - Menswear - Round-necked T-shirt in sturdy cotton piqué.
5. 1 - PACK - T-shirt - Grey - Ladieswear - Fitted T-shirt in soft organic cotton jersey.

All done ✅ 你現在擁有：
- 文本快取：/content/drive/MyDrive/data/rag_faiss_demo1/texts.npy
- 嵌入快取：/content/drive/MyDrive/data/rag_faiss_demo1/embeddings.npy
- FAISS 索引：/content/drive/MyDrive/data/rag_faiss_demo1/faiss_index
之後重跑時會自動偵測並跳過已完成的步驟（可續跑）。
