In [None]:
# Show GPU/driver status; if nvidia-smi fails, the shell prints the fallback message ("no GPU") instead
!nvidia-smi || echo "沒有GPU"

Sun Sep  7 12:21:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:

import torch
import subprocess
import os

# Report what PyTorch sees: CUDA availability, the CUDA version torch was built with, and the GPUs.
print("=== 環境檢查 ===")
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

# Check the NVCC (CUDA compiler) version.
try:
    nvcc_version = subprocess.check_output(["nvcc", "--version"]).decode('utf-8')
except (OSError, subprocess.CalledProcessError):
    # OSError: nvcc is not on PATH / not executable; CalledProcessError: nvcc exited non-zero.
    # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
    print("NVCC not found")
else:
    # Pick the line that carries the release number instead of relying on its position
    # in the output; fall back to the full text if no such line is present.
    release_lines = [line for line in nvcc_version.splitlines() if "release" in line]
    print("NVCC version:")
    print(release_lines[0] if release_lines else nvcc_version.strip())


=== 環境檢查 ===
CUDA available: True
CUDA version: 12.6
GPU count: 1
GPU name: Tesla T4
NVCC version:
Cuda compilation tools, release 12.5, V12.5.82


In [None]:
# Install llama-cpp-python (pre-releases allowed) using the extra cu121 wheel index;
# --force-reinstall / --no-cache-dir make pip download and reinstall instead of reusing what is present.
# NOTE(review): --force-reinstall also reinstalls the dependencies (the pip log shows numpy being
# collected again), which may replace versions preinstalled in the runtime -- confirm this is intended.
!pip install llama-cpp-python --pre --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121 --force-reinstall --no-cache-dir


Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu121
Collecting llama-cpp-python
  Downloading https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu121/llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl (551.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m551.3/551.3 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m311.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting jinja2>=2.11.3 (from llama-cpp-python)
  Downloading 

In [None]:

def test_installation():
    """Check that llama-cpp-python can be imported and report GPU offload support.

    Returns:
        bool: True when ``llama_cpp`` imported successfully. False when the
        import failed for any reason: the package is missing (ImportError) or
        the import raised something else, such as a failure to load its
        native library.
    """
    try:
        import llama_cpp
        from llama_cpp import Llama  # imported only to prove the binding loads
    except ImportError as e:
        print(f"❌ 導入失敗: {e}")
        return False
    except Exception as e:
        # The import is the only statement in the try body, so any exception
        # here means the package is NOT usable; report failure, not success.
        print(f"⚠️  其他問題: {e}")
        return False

    print("✅ llama-cpp-python 導入成功！")
    print("✅ 模組載入正常")

    # Ask the binding whether this build can offload layers to a GPU. The probe
    # is looked up defensively so builds that do not expose it still pass.
    gpu_probe = getattr(llama_cpp, "llama_supports_gpu_offload", None)
    if callable(gpu_probe):
        try:
            gpu_ok = bool(gpu_probe())
        except Exception as e:
            print(f"⚠️  無法檢查 GPU offload 支援: {e}")
        else:
            if gpu_ok:
                print("✅ GPU offload 可用")
            else:
                print("⚠️  此安裝不支援 GPU offload（僅 CPU）")

    return True

# Run the installation check
success = test_installation()

✅ llama-cpp-python 導入成功！
✅ 模組載入正常


In [None]:
# Retrieval stack: FAISS (CPU build), LangChain packages, and sentence-transformers for embeddings
!pip install -q faiss-cpu langchain langchain-core langchain-community
!pip install -q sentence-transformers
# Disabled: presumably because llama-cpp-python is already installed by the dedicated cell above -- confirm
#!pip install -q llama-cpp-python huggingface_hub

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone


In [None]:
# RAG recommendation system - Google Colab version
import os, re
from huggingface_hub import list_repo_files, hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Mount Google Drive (the FAISS index below is read from it)
from google.colab import drive
drive.mount('/content/drive')

# Configure paths
FAISS_DIR = "/content/drive/MyDrive/data/rag_faiss_demo1/faiss_index" # TODO: confirm this path
GGUF_REPO = "floraliuya/recft_unsloth-Meta-Llama-3.1-8B-2"
MODEL_FILE = "unsloth.Q4_K_M.gguf"  # adjust to the actual GGUF file name in the repo


Mounted at /content/drive


In [None]:
# Download the GGUF model
def download_gguf_model(repo_id, filename, local_dir="./models/"):
    """Download a GGUF model file from the Hugging Face Hub.

    Args:
        repo_id (str): Hub repository id, passed to ``hf_hub_download``.
        filename (str): Name of the file inside the repository.
        local_dir (str): Directory to download into; created if it is missing.

    Returns:
        str | None: The path returned by ``hf_hub_download``, or None when the
        directory could not be created or the download failed (the error is
        printed rather than raised).
    """
    print(f"正在下載模型: {repo_id}/{filename}")

    try:
        # Kept inside the try so that a directory that cannot be created is
        # reported as a failed download (None) like every other error, which
        # is what the caller's `if model_path:` check relies on.
        os.makedirs(local_dir, exist_ok=True)
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=local_dir,
            # NOTE(review): newer huggingface_hub releases reportedly deprecate
            # this argument -- confirm against the installed version.
            local_dir_use_symlinks=False
        )
    except Exception as e:
        print(f"下載失敗: {e}")
        return None

    print(f"模型下載完成: {model_path}")
    return model_path

# Download the model; model_path is None if the download failed
model_path = download_gguf_model(GGUF_REPO, MODEL_FILE)

# Initialize the Llama CPP model
def initialize_llama_model(model_path, **overrides):
    """Create the LangChain ``LlamaCpp`` wrapper for a local GGUF file.

    Args:
        model_path (str): Path to the GGUF model file.
        **overrides: Optional keyword arguments that replace or extend the
            default ``LlamaCpp`` settings below (for example
            ``n_gpu_layers=0`` to run on CPU only, or a smaller ``n_ctx``).

    Returns:
        The ``LlamaCpp`` instance, or None if construction failed (the error
        is printed rather than raised).
    """
    print("初始化 Llama CPP 模型...")

    settings = {
        "temperature": 0.7,
        "max_tokens": 512,
        "top_p": 0.9,
        "n_ctx": 4096,        # context length
        "n_gpu_layers": 35,   # layers offloaded to the GPU; tune to the available GPU memory
        "verbose": True,
        "seed": 42,
        "n_threads": 4,
    }
    settings.update(overrides)

    try:
        model = LlamaCpp(model_path=model_path, **settings)
    except Exception as e:
        print(f"模型載入失敗: {e}")
        return None

    print("模型載入成功！")
    return model

# Initialize the model only when the download produced a path; otherwise leave llm as None
# so the later cells can detect that the model is unavailable.
if model_path:
    llm = initialize_llama_model(model_path)
else:
    print("模型下載失敗，無法繼續")
    llm = None


正在下載模型: floraliuya/recft_unsloth-Meta-Llama-3.1-8B-2/unsloth.Q4_K_M.gguf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    yes
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes


模型下載完成: models/unsloth.Q4_K_M.gguf
初始化 Llama CPP 模型...


llama_model_load_from_file_impl: using device CUDA0 (Tesla T4) - 14992 MiB free
llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from models/unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Recft_Unsloth Meta Llama 3.1 8B 2
llama_model_loader: - kv   3:                            general.version str              = 2
llama_model_loader: - kv   4:                           general.basename str              = recft_unsloth-Meta-Llama-3.1
llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
llama_model_loader: - kv   6:                    

模型載入成功！


CUDA : ARCHS = 500,520,530,600,610,620,700,720,750,800,860,870,890,900 | FORCE_MMQ = 1 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
Model metadata: {'general.file_type': '15', 'tokenizer.ggml.eos_token_id': '128001', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'llama.vocab_size': '128256', 'llama.attention.value_length': '128', 'llama.attention.key_length': '128', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '500000.000000', 'general.architecture': 'llama', 'tokenizer.ggml.add_sep_token': 'false', 'llama.attention.head_count_kv': '8', 'llama.block_count': '32', 'tokenizer.ggml.padding_token_id': '128004', 'general.basename': 'recft_unsloth-Meta-Llama-3.1', 'tokenizer.ggml.bos_token_id': '128000', 'llama.attention.head_count': '32', 'tokenizer.ggml.pre': 'llama-bpe', 'llama.context_length': '131072', 'gen

In [None]:

# Load the vector database
def _select_embedding_device():
    """Return "cuda" when PyTorch reports a usable CUDA device, otherwise "cpu"."""
    try:
        import torch
    except ImportError:
        return "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


def load_faiss_database():
    """Load the FAISS vector store stored under ``FAISS_DIR``.

    The embedding model must be the same one that was used when the index was
    built, otherwise query vectors will not match the stored vectors.

    Returns:
        The loaded FAISS vector store, or None if loading failed (the error is
        printed rather than raised).
    """
    print("載入FAISS向量資料庫...")
    try:
        # Device selection asks PyTorch directly. The previous check relied on
        # the truthiness of the COLAB_GPU environment variable, so any
        # non-empty value (including "0") selected CUDA.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2",  # adjust to the embedding model actually used for the index
            # alternative: "sentence-transformers/all-MiniLM-L6-v2"
            model_kwargs={'device': _select_embedding_device()}
        )

        # NOTE(review): this flag opts in to deserializing the saved index files
        # (pickle-based per the LangChain docs -- confirm); only load indexes
        # that you created yourself or otherwise trust.
        db = FAISS.load_local(
            FAISS_DIR,
            embeddings,
            allow_dangerous_deserialization=True
        )
        print(f"FAISS 資料庫載入成功，包含 {db.index.ntotal} 個向量")
        return db
    except Exception as e:
        print(f"FAISS 資料庫載入失敗: {e}")
        return None

# Load the index when this cell runs; faiss_db is None if loading failed
faiss_db = load_faiss_database()

  embeddings = HuggingFaceEmbeddings(


載入FAISS向量資料庫...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS 資料庫載入成功，包含 105542 個向量


In [None]:
# Alpaca-style prompt template.
# {question} and {context} are the two placeholders filled in via prompt.format(...)
# by the recommendation functions; the text ends right after "### Response:" so the
# model's completion is the answer.
alpaca_prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Based on the provided context, give product recommendations that match the user's query. Focus on relevant features and explain why these products are suitable. Keep your response concise and helpful.

### Input:
User Query: {question}

Context from database: {context}

### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_template)

In [None]:
# Main RAG recommendation function
def _build_context(docs, max_context_length):
    """Join retrieved documents into a context of at most ``max_context_length`` characters."""
    separator = "\n\n"
    parts = []
    used = 0
    for position, doc in enumerate(docs, start=1):
        entry = f"Product {position}: {doc.page_content}"
        # The separator is part of the final string, so it counts against the budget.
        cost = len(entry) + (len(separator) if parts else 0)
        if used + cost > max_context_length:
            if not parts:
                # Even the best match does not fit: keep a truncated version
                # rather than sending the model an empty context.
                parts.append(entry[:max(max_context_length, 0)])
            break
        parts.append(entry)
        used += cost
    return separator.join(parts)


def get_recommendations(question, k=3, max_context_length=2000):
    """
    Retrieve similar products from FAISS and ask the LLM for recommendations.

    Args:
        question (str): The user's query.
        k (int): Number of similar documents to retrieve.
        max_context_length (int): Maximum length, in characters, of the
            context inserted into the prompt (separators included).

    Returns:
        str | None: The model's response, or None when the model/database is
        not loaded or an error occurred (the error is printed).
    """
    if not llm or not faiss_db:
        print("模型或資料庫未正確載入")
        return None

    print(f"處理查詢: {question}")

    try:
        # Retrieve the most similar documents
        print("搜尋相似產品...")
        found_docs = faiss_db.similarity_search(question, k=k)

        # Build a length-limited context so the prompt stays within the model's window
        context = _build_context(found_docs, max_context_length)

        # Assemble the full prompt
        full_prompt = prompt.format(question=question, context=context)

        print("生成推薦...")
        # Generate the response with LlamaCpp
        return llm.invoke(full_prompt)

    except Exception as e:
        print(f"推理過程發生錯誤: {e}")
        return None

In [None]:
# Streaming variant: print the text piece by piece while it is being generated
def get_recommendations_stream(question, k=3, max_context_length=2000):
    """Retrieve similar products and stream the LLM's recommendation to stdout.

    Args:
        question (str): The user's query.
        k (int): Number of similar documents to retrieve.
        max_context_length (int): Maximum length, in characters, of the
            context inserted into the prompt (separators included). Bounding
            it keeps the prompt from outgrowing the model's context window
            when ``k`` or the documents are large.

    Returns:
        None. Output is printed as it arrives; errors are printed, not raised.
    """
    if not llm or not faiss_db:
        print("模型或資料庫未正確載入")
        return

    print(f"處理查詢: {question}")

    try:
        found_docs = faiss_db.similarity_search(question, k=k)

        # Accumulate documents until the next one would push the context past the limit.
        separator = "\n\n"
        context = ""
        for position, doc in enumerate(found_docs, start=1):
            entry = f"Product {position}: {doc.page_content}"
            candidate = entry if not context else context + separator + entry
            if len(candidate) > max_context_length:
                if not context:
                    # Even the best match does not fit: keep a truncated
                    # version rather than sending an empty context.
                    context = entry[:max(max_context_length, 0)]
                break
            context = candidate

        full_prompt = prompt.format(question=question, context=context)

        print("生成推薦 (串流模式):")
        print("-" * 50)

        # Stream the generation, flushing so each chunk shows up immediately
        for chunk in llm.stream(full_prompt):
            print(chunk, end="", flush=True)

        print("\n" + "-" * 50)

    except Exception as e:
        print(f"串流生成失敗: {e}")


In [None]:
# Sample queries
def run_test_queries():
    """Run the built-in sample queries through get_recommendations and print each result.

    Every query is framed by 60-character "=" rules; a falsy result is
    reported as a failed generation.
    """
    divider = "=" * 60
    sample_queries = (
        "I'm looking for a yellow summer dress which is light and airy",
        "I’m looking for a formal shirt made of linen",
        "I’m going to my friend’s graduation party. Suggest some dresses which are formal yet casual and give a lunch vibe.",
    )

    for text in sample_queries:
        # Header: rule, the query itself, rule
        print(divider, f"查詢: {text}", divider, sep="\n")

        answer = get_recommendations(text, k=3)
        if not answer:
            print("推薦生成失敗")
        else:
            print("推薦結果:")
            print(answer)

        # Closing rule followed by one blank line
        print(divider + "\n")

In [None]:
# Helper functions
def check_model_info():
    """Print the loaded model's path, context length, max generation length and GPU layer count.

    Reads the module-level ``llm``; prints nothing when it is falsy.
    """
    if not llm:
        return

    print("模型資訊:")
    # (label, attribute on llm) pairs, printed in this order
    report_fields = (
        ("模型路徑", "model_path"),
        ("上下文長度", "n_ctx"),
        ("最大生成長度", "max_tokens"),
        ("使用GPU層數", "n_gpu_layers"),
    )
    for label, attribute in report_fields:
        print(f"{label}: {getattr(llm, attribute)}")

def explore_database(sample_size=5):
    """Print the vector count and a short preview of a few stored documents.

    Args:
        sample_size (int): Number of documents to preview.

    Reads the module-level ``faiss_db``; prints a notice and returns when it
    is falsy.
    """
    if not faiss_db:
        print("資料庫未載入")
        return

    print("資料庫探索:")
    print(f"總文檔數: {faiss_db.index.ntotal}")

    # Not a random sample: these are the nearest matches for a generic probe
    # query, shown only to reveal how the stored documents are structured.
    hits = faiss_db.similarity_search("product", k=sample_size)

    preview_limit = 200
    for position, hit in enumerate(hits, start=1):
        print(f"\n--- 文檔 {position} ---")
        text = hit.page_content
        # Long documents are cut to the first 200 characters plus an ellipsis
        preview = text if len(text) <= preview_limit else text[:preview_limit] + "..."
        print(preview)


In [None]:
# Main execution block
if __name__ == "__main__":
    print("RAG推薦系統 - Llama CPP - GPU版本")
    print("=" * 60)

    # Both the model and the FAISS index must have loaded before anything can run
    if not (llm and faiss_db):
        print("✗ 系統初始化失敗，請檢查模型和資料庫路徑")
    else:
        print("✓ 系統初始化完成")
        check_model_info()
        explore_database()

        # Run the sample queries
        print("\n開始測試...")
        run_test_queries()


RAG推薦系統 - Llama CPP - GPU版本
✓ 系統初始化完成
模型資訊:
模型路徑: models/unsloth.Q4_K_M.gguf
上下文長度: 4096
最大生成長度: 512
使用GPU層數: 35
資料庫探索:
總文檔數: 105542

--- 文檔 1 ---
JULIUS HT BB - Sneakers - Black - Children Accessories, Swimwear - nan

--- 文檔 2 ---
PRICED ITEM tee - T-shirt - Black - Sport - Short-sleeved sports top in printed, fast-drying functional fabric.

--- 文檔 3 ---
Borg WL sneaker - Sneakers - Black - Ladies Accessories - nan

--- 文檔 4 ---
West runner BB - Sneakers - Black - Children Accessories, Swimwear - nan

--- 文檔 5 ---
Livka sneaker - Sneakers - Black - Ladies Accessories - nan

開始測試...
查詢: I'm looking for a yellow summer dress which is light and airy
處理查詢: I'm looking for a yellow summer dress which is light and airy
搜尋相似產品...
生成推薦...


Llama.generate: 73 prefix-match hit, remaining 157 prompt tokens to eval
llama_perf_context_print:        load time =    2741.14 ms
llama_perf_context_print: prompt eval time =    1719.02 ms /   157 tokens (   10.95 ms per token,    91.33 tokens per second)
llama_perf_context_print:        eval time =   17543.58 ms /   511 runs   (   34.33 ms per token,    29.13 tokens per second)
llama_perf_context_print:       total time =   21143.05 ms /   668 tokens
llama_perf_context_print:    graphs reused =        507
Llama.generate: 73 prefix-match hit, remaining 232 prompt tokens to eval


推薦結果:
Outfit Combination 1:
- Product 1: Covent Garden - Dress - Light Yellow - Ladieswear
- Shoes: White sneakers or sandals for a summery look.
- Accessories: A straw hat and oversized sunglasses to complete the bohemian vibe.

Outfit Combination 2:
- Product 2: SUMMER STRAP DRESS_09-090 - Dress - Light Yellow - Divided
- Shoes: Metallic gold ballet flats or pumps for a glamorous touch.
- Accessories: Statement earrings in the shape of flowers, a delicate gold belt to cinch the waist, and a straw clutch bag to complete the look.

Outfit Combination 3:
- Product 3: APRIL dress - Dress - Yellow - Children Sizes 134-170
- Shoes: White canvas sneakers or sandals for a casual yet stylish vibe.
- Accessories: A colorful headband with flowers or bows, a small backpack in a matching color to carry essentials, and a pair of sunglasses with colored frames to add a playful touch.

Outfit Combination 4:
- Product 1: Covent Garden - Dress - Light Yellow - Ladieswear
- Shoes: High-heeled white pum

llama_perf_context_print:        load time =    2741.14 ms
llama_perf_context_print: prompt eval time =    2697.76 ms /   232 tokens (   11.63 ms per token,    86.00 tokens per second)
llama_perf_context_print:        eval time =   18925.90 ms /   511 runs   (   37.04 ms per token,    27.00 tokens per second)
llama_perf_context_print:       total time =   23415.15 ms /   743 tokens
llama_perf_context_print:    graphs reused =        515
Llama.generate: 74 prefix-match hit, remaining 127 prompt tokens to eval


推薦結果:
Outfit Recommendation 1:
Top: Premium My Linen Shirt
Description: This premium linen shirt is perfect for a formal occasion. It features a button-down collar, classic front, and an open chest pocket. The long sleeves are adjustable with buttoning at the cuffs. With its yoke and pleat at the back, this shirt offers both style and comfort. The rounded hem adds to the refined look of the shirt.

Bottom: lucas linen price
Description: This pair of trousers is made from premium quality linen fabric, which ensures breathability and comfort during hot weather. The straight leg cut gives a modern and stylish silhouette, while the flat front and no pleats contribute to a sleek and streamlined appearance. With its slim fit and attention to detail in construction, this pair of trousers will be a perfect match for your formal occasion.

Shoes: lucas linen price
Description: These classic dress shoes are made from premium quality leather and feature a brogue design with perforations and a dec

llama_perf_context_print:        load time =    2741.14 ms
llama_perf_context_print: prompt eval time =    1493.34 ms /   127 tokens (   11.76 ms per token,    85.04 tokens per second)
llama_perf_context_print:        eval time =   10017.79 ms /   270 runs   (   37.10 ms per token,    26.95 tokens per second)
llama_perf_context_print:       total time =   12459.83 ms /   397 tokens
llama_perf_context_print:    graphs reused =        271


推薦結果:
Outfit combination 1:
- Top: Fitted, sleeveless dress in soft jersey with gathers in the sides for best fit.
- Bottom: Wide-leg culottes in a lightweight fabric that is breathable and comfortable for all-day wear.

Outfit combination 2:
- Top: Fitted, sleeveless dress in soft jersey with gathers in the sides for best fit.
- Bottom: A-line midi skirt in a lightweight and flowy fabric. The longer length will keep you cool during the day while still maintaining a chic look.

Outfit combination 3:
- Top: Short, fitted dress in sturdy jersey with a V-neck and long sleeves.
- Bottom: High-waisted jeans in a classic straight leg cut. Pair it with a wide belt to create an hourglass silhouette.

Outfit combination 4:
- Top: Fitted, sleeveless dress in soft jersey with gathers in the sides for best fit.
- Bottom: A-line midi skirt in a lightweight and flowy fabric. The longer length will keep you cool during the day while still maintaining a chic look.

Outfit combination 5:
- Top: Short, 

In [None]:
# Interactive querying
def interactive_query():
    """Prompt for queries in a loop and print recommendations until the user quits.

    The loop ends when the user enters 'quit' (case-insensitive, surrounding
    whitespace ignored), or when input ends or is interrupted. Blank input is
    skipped. Returns None.
    """
    if not llm or not faiss_db:
        print("系統未正確初始化")
        return

    print("進入互動模式 (輸入 'quit' 退出)")
    while True:
        try:
            user_input = input("\n請輸入您的查詢: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: leave the loop
            # cleanly instead of surfacing a traceback.
            print()
            break

        # Normalise once so " quit " exits and stray whitespace is not sent to retrieval
        query = user_input.strip()
        if query.lower() == 'quit':
            break
        if not query:
            continue

        result = get_recommendations(query, k=3)
        if result:
            print(f"\n推薦結果:\n{result}")

# Starts the interactive loop as soon as this cell runs; it blocks on input() until 'quit' is entered.
# Comment this call out for unattended "Run All" executions.
interactive_query()

進入互動模式 (輸入 'quit' 退出)

請輸入您的查詢: quit
