In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture magic above must remain the
# first line of the cell; it suppresses the cell's output.
import os
# Colab is detected by searching for "COLAB_" in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Load the base model and tokenizer through Unsloth. The three settings below are
# module-level globals that are passed to from_pretrained() in this cell.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is informational only; nothing in this cell reads it.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# The model actually loaded is the Dolphin 3.0 checkpoint named below; the two
# commented-out model_name lines are earlier alternatives.
model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    #model_name = "unsloth/codellama-13b-bnb-4bit",
    model_name = "cognitivecomputations/Dolphin3.0-Llama3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters. `model` is rebound to the wrapped
# model, so re-running this cell without reloading applies the call to the
# already-wrapped object.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections (q/k/v/o) and MLP projections (gate/up/down) receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.6.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import json
import random
from typing import Dict, List, Any
from pathlib import Path

class SimpleQAGenerator:
    """Turns documentation entries into template-based question/answer records.

    Each entry is a dict that may carry the keys "name", "Syntax",
    "description", "parameter" and "example". For every template whose
    required fields are all present and truthy, one record with the keys
    "instruction", "input" and "output" is produced.
    """

    def __init__(self):
        # "What is it" questions, answered from the description.
        definition_template = {
            "type": "definition",
            "pattern": "What is {name} and what is it used for?",
            "required_fields": ["name", "description"],
            "answer_template": "{name} is used to {description}"
        }
        # Syntax questions, answered from the "Syntax" field.
        syntax_template = {
            "type": "syntax",
            "pattern": "What is the correct syntax for using {name}?",
            "required_fields": ["name", "Syntax"],
            "answer_template": "The syntax for {name} is:\n{syntax}"
        }
        # Example questions, answered from the "example" field.
        example_template = {
            "type": "example",
            "pattern": "Can you show me an example of using {name}?",
            "required_fields": ["name", "example"],
            "answer_template": "Here's an example of using {name}:\n{example}"
        }
        self.question_templates = [
            definition_template,
            syntax_template,
            example_template,
        ]

    def _clean_text(self, text: Any) -> str:
        """Normalise a field value into a single stripped string.

        A list is treated as a sequence of lines: each line is stripped, lines
        equal to the headings "Example(s)" or "Syntax" are dropped, and the
        rest are joined with single spaces. Any other value is converted with
        str() and stripped.
        """
        if not isinstance(text, list):
            return str(text).strip()
        heading_lines = ("Example(s)", "Syntax")
        stripped_lines = (entry.strip() for entry in text)
        return ' '.join(piece for piece in stripped_lines if piece not in heading_lines)

    def _format_context(self, doc: Dict[str, Any]) -> str:
        """Render the known fields of `doc` as "field: value" lines.

        Fields that are missing, falsy, or empty after cleaning are skipped.
        """
        rendered = []
        for key in ("name", "Syntax", "description", "parameter", "example"):
            raw_value = doc.get(key)
            if not raw_value:
                continue
            cleaned = self._clean_text(raw_value)
            if cleaned:  # only keep non-empty content
                rendered.append(key + ": " + cleaned)
        return '\n'.join(rendered)

    def generate_qa_pair(self, doc: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build the QA records for a single documentation entry.

        Returns one record per applicable template, in template order; the
        list is empty when no template's required fields are satisfied.
        """
        shared_context = self._format_context(doc)
        records = []

        for tpl in self.question_templates:
            # Skip templates whose required fields are missing or empty.
            if not all(doc.get(required) for required in tpl["required_fields"]):
                continue

            # Values available to the answer template placeholders.
            substitutions = {
                "name": doc["name"],
                "description": self._clean_text(doc.get("description", "")),
                "syntax": self._clean_text(doc.get("Syntax", "")),
                "example": self._clean_text(doc.get("example", ""))
            }
            records.append({
                "instruction": tpl["pattern"].format(name=doc["name"]),
                "input": shared_context,
                "output": tpl["answer_template"].format(**substitutions).strip()
            })

        return records

def process_qa_file_backup(input_file: str, output_file: str, mask_ratio: float = 0.05,
                           seed: "int | None" = None):
    """Generate QA pairs from a JSON file and write them as JSONL.

    A fraction of the generated pairs is turned into refusal examples: their
    context ("input") is blanked and their answer is replaced by "Don't know".

    Args:
        input_file: Path to a JSON file holding a list of documentation entries.
        output_file: Path of the JSONL file to write (one QA record per line).
        mask_ratio: Fraction of pairs, in [0, 1], to convert into refusal
            examples. The count is ``int(total * mask_ratio)``.
        seed: Optional seed for choosing which pairs are masked. With ``None``
            (the default) the module-level ``random`` state is used, as before.

    Returns:
        The total number of QA pairs written.

    Raises:
        ValueError: If ``mask_ratio`` is outside [0, 1].
    """
    # Fail early with a clear message; an out-of-range ratio would otherwise
    # surface as an opaque error from random.sample().
    if not 0.0 <= mask_ratio <= 1.0:
        raise ValueError(f"mask_ratio must be between 0 and 1, got {mask_ratio!r}")

    generator = SimpleQAGenerator()

    # Read the input JSON file.
    with open(input_file, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    # Generate every QA pair.
    all_qa_pairs = []
    for doc in docs:
        all_qa_pairs.extend(generator.generate_qa_pair(doc))

    total = len(all_qa_pairs)
    num_to_mask = int(total * mask_ratio)
    # A dedicated generator keeps a seeded run reproducible without touching
    # the global random state.
    rng = random if seed is None else random.Random(seed)
    indices_to_mask = rng.sample(range(total), num_to_mask)

    # Convert the sampled fraction (mask_ratio) into refusal examples:
    # no context, and the answer "Don't know".
    for idx in indices_to_mask:
        all_qa_pairs[idx]["input"] = ""
        all_qa_pairs[idx]["output"] = "Don't know"

    # Write the JSONL file.
    with open(output_file, 'w', encoding='utf-8') as f:
        for qa in all_qa_pairs:
            f.write(json.dumps(qa, ensure_ascii=False) + '\n')

    # Print the first three records as a quick sanity check.
    print("\n=== First 3 QA pairs for verification ===\n")
    for qa in all_qa_pairs[:3]:
        print("---" * 30)
        print(f"Instruction: {qa['instruction']}")
        print(f"Input: {qa['input']}")
        print(f"Output: {qa['output']}\n")

    return len(all_qa_pairs)

def process_qa_file(input_file: str, output_file: str, seed: int = 42):
    """Generate labelled QA pairs from a JSON file and write them as JSONL.

    The generated pairs are split at random into three groups:

    * about 85%: "[Context QA]" examples that keep their context and answer;
    * about 10%: "[Context QA]" refusal examples with the context blanked and
      the answer replaced by "Don't know";
    * the remainder: "[General QA]" examples with the context blanked and the
      original answer kept.

    Records are written in their original generation order; only the group
    assignment is random.

    Args:
        input_file: Path to a JSON file holding a list of documentation entries.
        output_file: Path of the JSONL file to write.
        seed: Seed passed to ``random.seed``. This reseeds the module-level
            random generator.

    Returns:
        The number of QA pairs written.
    """
    generator = SimpleQAGenerator()

    with open(input_file, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    all_qa_pairs = []
    for doc in docs:
        all_qa_pairs.extend(generator.generate_qa_pair(doc))

    total = len(all_qa_pairs)
    random.seed(seed)
    indices = list(range(total))
    random.shuffle(indices)

    num_context_normal = int(total * 0.85)
    num_context_dontknow = int(total * 0.10)
    dontknow_end = num_context_normal + num_context_dontknow

    # Sets give O(1) membership tests; list membership made the loop below
    # quadratic in the number of pairs. Every index outside these two sets
    # belongs to the normal-context group.
    context_dontknow_idxs = set(indices[num_context_normal:dontknow_end])
    general_idxs = set(indices[dontknow_end:])

    updated_qa_pairs = []
    for i, qa in enumerate(all_qa_pairs):
        new_qa = qa.copy()
        if i in context_dontknow_idxs:
            new_qa["instruction"] = "[Context QA] " + qa["instruction"]
            new_qa["input"] = ""
            new_qa["output"] = "Don't know"
        elif i in general_idxs:
            new_qa["instruction"] = "[General QA] " + qa["instruction"]
            new_qa["input"] = ""
        else:
            new_qa["instruction"] = "[Context QA] " + qa["instruction"]
        updated_qa_pairs.append(new_qa)

    with open(output_file, 'w', encoding='utf-8') as f:
        for qa in updated_qa_pairs:
            f.write(json.dumps(qa, ensure_ascii=False) + '\n')

    print(f"\nTotal QA pairs written: {len(updated_qa_pairs)}")
    return len(updated_qa_pairs)


if __name__ == "__main__":
    # Input JSON of documentation entries and output JSONL of QA pairs (Google Drive paths).
    formatted_qa_file = "/content/drive/MyDrive/Colab Notebooks/docs/filtered_data.json"
    jsonl_qa_file = "/content/drive/MyDrive/Colab Notebooks/docs/qa_pairs.jsonl"

    # The labelled variant (process_qa_file) is disabled; the backup variant is the one run.
    #total_pairs = process_qa_file(formatted_qa_file, jsonl_qa_file)
    total_pairs = process_qa_file_backup(formatted_qa_file, jsonl_qa_file)
    print(f"\nTotal QA pairs generated: {total_pairs}")


=== First 3 QA pairs for verification ===

------------------------------------------------------------------------------------------
Instruction: What is Item list and what is it used for?
Input: name: Item list
Syntax: command file.reference item-ID 
 command file.reference item-ID item-ID item-ID...
 command file.reference{*}
description: The item list specifies one or more item-IDs in the file defined by the associated file reference. The item list can be: If a select list is not active, a null item-ID implies a new item for Update processor and all items for the other processors. Any command requiring a select list can obtain it from a previously selected list. To cause a processor to use the select list, the item list must be null. An item-ID with the same name as a language element in either the master dictionary or the dictionary of the file, must be enclosed in single quotation marks. When a list is active, item.list is omitted from the (AQL) command: If a specific list of it

In [None]:
import json
import random
from typing import Dict, List, Any
from pathlib import Path

# Prompt template with three positional "{}" slots, filled via str.format in the
# order: question, context, answer. In this notebook it is used both to build the
# training text and, with an empty answer, to build the inference prompt.
alpaca_prompt_backup = """You must strictly answer questions ONLY based on the provided context.
If the context does NOT include the relevant information, your answer must be exactly:
"Don't know" or "Not provided in context".
Never guess or add any information not mentioned in the context.

### Question:
{}

### Context:
{}

### Answer:
{}"""

def format_prompt(qa: Dict[str, str]) -> str:
    """Render a labelled QA record as a full training prompt.

    The record's "instruction" must start with "[Context QA]" or
    "[General QA]". Only that leading label (and the whitespace after it) is
    removed; later occurrences of the label text inside the question are kept.

    Args:
        qa: Record with the keys "instruction", "input" and "output".

    Returns:
        The prompt text: the context-bound variant (with a "### Context:"
        section) for "[Context QA]", the free-form variant for "[General QA]".

    Raises:
        ValueError: If the instruction carries neither label.
    """
    instruction = qa["instruction"]

    if instruction.startswith("[Context QA]"):
        # Slice off the prefix rather than str.replace(), which would also
        # delete the label anywhere else in the question.
        question = instruction[len("[Context QA]"):].lstrip()
        return f"""You must strictly answer questions ONLY based on the provided context.
If the context does NOT include the relevant information, your answer must be exactly:
"Don't know" or "Not provided in context".
Never guess or add any information not mentioned in the context.

### Question:
{question}

### Context:
{qa['input']}

### Answer:
{qa['output']}"""

    elif instruction.startswith("[General QA]"):
        question = instruction[len("[General QA]"):].lstrip()
        return f"""Answer the following question to the best of your knowledge and reasoning.
There is no context provided — you may rely on general understanding.

### Question:
{question}

### Answer:
{qa['output']}"""

    else:
        raise ValueError("Unknown instruction label: must start with [Context QA] or [General QA]")


import json

# Validate a raw JSONL dataset: records that are JSON objects containing every
# required field go to the clean file; every other non-blank line goes to the
# invalid file together with a printed reason.
required_fields = {"instruction", "input", "output"}

input_path = "/content/drive/MyDrive/Colab Notebooks/docs/dataset_claude_raw.jsonl"
output_clean = "/content/drive/MyDrive/Colab Notebooks/docs/dataset_clean.jsonl"
output_invalid = "/content/drive/MyDrive/Colab Notebooks/docs/dataset_invalid.jsonl"

clean_count = 0
invalid_count = 0

# errors='ignore' drops undecodable bytes instead of failing on them (best effort).
with open(input_path, "r", encoding="utf-8",errors='ignore') as infile, \
     open(output_clean, "w", encoding="utf-8") as cleanfile, \
     open(output_invalid, "w", encoding="utf-8") as invalidfile:

    for i, line in enumerate(infile, 1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
            # A line can be valid JSON without being an object (e.g. a list or
            # a number); treat that as invalid instead of crashing on .keys().
            if not isinstance(obj, dict):
                raise ValueError(f"Expected a JSON object, got {type(obj).__name__}")
            missing = required_fields - obj.keys()
            if missing:
                # Sorted so the message is stable between runs.
                raise ValueError(f"Missing fields: {', '.join(sorted(missing))}")
            json.dump(obj, cleanfile, ensure_ascii=False)
            cleanfile.write("\n")
            clean_count += 1
        except (json.JSONDecodeError, ValueError) as e:
            print(f"[Line {i}] Invalid: {e}")
            invalidfile.write(line + "\n")
            invalid_count += 1

print(f"\n✅ Cleaned entries written: {clean_count}")
print(f"⚠️  Invalid entries saved: {invalid_count}")



# End-of-sequence token appended to every training example; without it
# generation would not learn where to stop.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build the training text for a batch of records.

    `examples` is a batch mapping whose "instruction", "input" and "output"
    columns are parallel sequences. Each row is rendered with
    `alpaca_prompt_backup` (question, context, answer) and terminated with
    EOS_TOKEN.

    Returns a dict with a single "text" column holding the rendered strings.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt_backup.format(question, context, answer) + EOS_TOKEN
            for question, context, answer in rows
        ],
    }

# Load the cleaned JSONL file as the training split, add the rendered "text"
# column via formatting_prompts_func, and print the first rendered example.
from datasets import load_dataset
dataset = load_dataset("json", data_files="/content/drive/MyDrive/Colab Notebooks/docs/dataset_clean.jsonl", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
print(dataset[0]["text"])


✅ Cleaned entries written: 2044
⚠️  Invalid entries saved: 0


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2044 [00:00<?, ? examples/s]

You must strictly answer questions ONLY based on the provided context.
If the context does NOT include the relevant information, your answer must be exactly:
"Don't know" or "Not provided in context".
Never guess or add any information not mentioned in the context.

### Question:
How do I specify multiple item-IDs when working with a file reference?

### Context:
Item list syntax: command file.reference item-ID 
 command file.reference item-ID item-ID item-ID...
 command file.reference{*}

The item list specifies one or more item-IDs in the file defined by the associated file reference. If a select list is not active, a null item-ID implies a new item for Update processor and all items for the other processors. Any command requiring a select list can obtain it from a previously selected list. To cause a processor to use the select list, the item list must be null. An item-ID with the same name as a language element in either the master dictionary or the dictionary of the file, must be 

In [None]:
# Configure supervised fine-tuning on the "text" column of `dataset`.
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    #formatting_func = formatting_prompts_func, # Add this line
    args = TrainingArguments(
        # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        # Training stops after 60 optimizer steps rather than after a full epoch.
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/2044 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
# Report the GPU name, its total memory, and the memory reserved so far (in GiB).
gpu_stats = torch.cuda.get_device_properties(0)
# max_memory_reserved() is PyTorch's peak-reserved counter, used here as the pre-training baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
7.135 GB of memory reserved.


In [None]:
# Run fine-tuning; trainer_stats keeps whatever trainer.train() returns.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,044 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7225
2,2.5765
3,2.6131
4,2.4567
5,2.2978
6,2.0609
7,1.8971
8,1.7198
9,1.5149
10,1.6291


In [None]:
# Save the fine-tuned model and the tokenizer to the same Google Drive directory;
# a later cell reloads from this path.
model.save_pretrained("/content/drive/MyDrive/Llama3.1-8b-instruct-lora")  # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/Llama3.1-8b-instruct-lora")

('/content/drive/MyDrive/Llama3.1-8b-instruct-lora/tokenizer_config.json',
 '/content/drive/MyDrive/Llama3.1-8b-instruct-lora/special_tokens_map.json',
 '/content/drive/MyDrive/Llama3.1-8b-instruct-lora/chat_template.jinja',
 '/content/drive/MyDrive/Llama3.1-8b-instruct-lora/tokenizer.json')

In [None]:
# Inference-time prompt builder mirroring the labelled training prompts.
def format_prompt_for_inference(question: str, context: str = "") -> str:
    """Build the prompt for one inference request.

    A context containing any non-whitespace text selects the context-bound
    prompt (with a "### Context:" section); otherwise the general-knowledge
    prompt is used. Both end with "### Answer:" and no trailing newline.
    """
    if not context.strip():
        # No usable context => General QA.
        general_lines = [
            "Answer the following question to the best of your knowledge and reasoning.",
            "There is no context provided — you may rely on general understanding.",
            "",
            "### Question:",
            f"{question}",
            "",
            "### Answer:",
        ]
        return "\n".join(general_lines)

    # Context supplied => Context QA.
    context_lines = [
        "You must strictly answer questions ONLY based on the provided context.",
        "If the context does NOT include the relevant information, your answer must be exactly:",
        '"Don\'t know" or "Not provided in context".',
        "Never guess or add any information not mentioned in the context.",
        "",
        "### Question:",
        f"{question}",
        "",
        "### Context:",
        f"{context}",
        "",
        "### Answer:",
    ]
    return "\n".join(context_lines)

# Enable fast inference mode; `model` is rebound to the value returned by for_inference().
model = FastLanguageModel.for_inference(model)

# Example helper: ask the fine-tuned model a single question.
def get_model_response(question: str, context: str = "") -> str:
    """Generate an answer for `question`, optionally grounded in `context`.

    The prompt is built from `alpaca_prompt_backup` with an empty answer slot,
    the same template used to build the training text.

    Args:
        question: The user question.
        context: Optional reference text placed in the "### Context:" section.

    Returns:
        The generated answer only (prompt and special tokens removed), stripped
        of surrounding whitespace. Sampling is enabled, so repeated calls may
        return different text.
    """
    prompt = alpaca_prompt_backup.format(question, context, "")

    # Tokenize the prompt as a batch of one.
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        use_cache=True,
        temperature=0.7,  # adjust to control how varied the output is
        do_sample=True,   # sample instead of greedy decoding
    )

    # The returned sequence begins with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on "Answer:", which cut the
    # reply short whenever the answer itself contained that string.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]

    # skip_special_tokens keeps EOS and other markers out of the returned text.
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response.strip()

# Usage examples
if __name__ == "__main__":
    # Example 1: query with context
    question = "Can you provide the syntax for check-sum command?"
    context = """name: check-sum
Syntax: check-sum file.reference {item.list} {selection.criteria} {modifiers} {(options)}
description: The check-sum command calculates and displays a checksum for the specified item."""

    response = get_model_response(question, context)
    print("\n=== Example 1: With Context ===")
    print(f"Question: {question}")
    print(f"Context: {context}")
    print(f"Response: {response}")

    # Example 2: query without context (get_model_response receives its default empty context)
    question = "How do I enable or disable x-on/x-off flow control on a port?"
    response = get_model_response(question)
    print("\n=== Example 2: Without Context ===")
    print(f"Question: {question}")
    print(f"Response: {response}")


=== Example 1: With Context ===
Question: Can you provide the syntax for check-sum command?
Context: name: check-sum
Syntax: check-sum file.reference {item.list} {selection.criteria} {modifiers} {(options)}
description: The check-sum command calculates and displays a checksum for the specified item.
Response: Syntax: check-sum file.reference {item.list} {selection.criteria} {modifiers} {(options)}
Description: The check-sum command calculates and displays a checksum for the specified item. The file.reference is the name of the file containing the item list, and the item.list is the list of items to be checked. You

=== Example 2: Without Context ===
Question: How do I enable or disable x-on/x-off flow control on a port?
Response: Use the x-on and x-off commands to enable and disable x-on/x-off flow control on a port. When flow control is enabled, the port will automatically disable when a device goes into idle mode and re-enable when a device becomes active. This helps prevent excessi

In [None]:
# Install the RAG stack (LangChain, Chroma, sentence-transformers, PDF tooling, Gradio). Versions are not pinned.
!pip install -q langchain langchain_community chromadb sentence-transformers unstructured gradio pdf2image pdfminer.six

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [None]:
import torch
from unsloth import FastLanguageModel
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
import gradio as gr
from typing import Any, List, Optional
import os

In [None]:

# Loading options for the fine-tuned checkpoint; these reassign the globals of
# the same names set in the first model-loading cell.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False

# Directory the fine-tuned model was saved to, and the directory scanned for PDF files.
SAVE_PATH = "/content/drive/MyDrive/Llama3.1-8b-instruct-lora"
PDF_DIR = "/content/drive/MyDrive/Colab Notebooks/docs/"

class UnslothLLM(LLM):
    """LangChain ``LLM`` adapter around an already-loaded model and tokenizer.

    Generation uses sampling (temperature 0.7) with up to 512 new tokens and
    returns only the newly generated text.
    """
    # Model object; must expose `.device` and `.generate(...)`.
    model: Any
    # Tokenizer; must be callable on a prompt and expose `.decode(...)`.
    tokenizer: Any
    def __init__(self, model, tokenizer, **kwargs):
        super().__init__(model=model, tokenizer=tokenizer, **kwargs)
        self.model = model
        self.tokenizer = tokenizer

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs) -> str:
        """Generate a completion for `prompt`.

        Args:
            prompt: Full prompt text.
            stop: Optional stop sequences; the completion is cut at the first
                occurrence of any of them.
            **kwargs: Extra keyword arguments supplied by the caller (for
                example a run manager); accepted and ignored.

        Returns:
            The generated continuation without the prompt, stripped of
            surrounding whitespace.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True
        )
        # Drop the prompt by token count. Slicing the decoded text by
        # len(prompt) is wrong whenever decoding does not reproduce the prompt
        # character for character.
        prompt_length = inputs["input_ids"].shape[1]
        completion = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        for marker in stop or []:
            # An empty marker would match at position 0 and erase the reply.
            if not marker:
                continue
            cut = completion.find(marker)
            if cut != -1:
                completion = completion[:cut]
        return completion.strip()

    @property
    def _llm_type(self) -> str:
        """Identifier string reported for this LLM type."""
        return "custom"

# Free the training-time objects, then load the saved fine-tuned model.
print("Clearing GPU cache...")
import gc
# Drop every global that may still reference the old model. The trainer (and an
# `llm` wrapper left over from a previous run of this cell) hold references
# too, so deleting only `model` would not release its memory.
for stale_name in ("trainer", "llm", "model", "tokenizer"):
    globals().pop(stale_name, None)
# Collect first, then release cached blocks: empty_cache() can only hand back
# memory whose tensors have already been freed.
gc.collect()
torch.cuda.empty_cache()

# Report memory after cleanup. memory_reserved() is the current figure;
# max_memory_reserved() is a peak counter that cleanup never lowers.
print("Checking GPU Memory after clearing cache:")
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved before loading.")

model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = SAVE_PATH, # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
)
# Expose the reloaded model to LangChain through the adapter class.
llm = UnslothLLM(model=model, tokenizer=tokenizer)

Clearing GPU cache...
Checking GPU Memory after clearing cache:
GPU = Tesla T4. Max memory = 14.741 GB.
7.43 GB of memory reserved before loading.
==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Extra packages installed for the `unstructured` PDF tooling.
!pip install -q pi_heif
!pip install -q unstructured_inference

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.1/48.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# System package: poppler-utils, installed through apt.
!sudo apt-get update
!sudo apt-get install -y poppler-utils

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,765 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,742 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,984 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,017 kB]
Hit:13 

In [None]:
# OCR binding for unstructured, plus tqdm for progress bars.
!pip install -q unstructured-pytesseract
!pip install -q tqdm

In [None]:
# 1. Install the required dependencies
!pip install -q langchain langchain_community chromadb sentence-transformers pypdf tqdm

# 2. 导入核心库
from google.colab import drive
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm.notebook import tqdm
import torch
import gc
import os
from typing import List, Dict
import logging
from datetime import datetime

# 3. Simple PDF processing class for cloud notebooks
class SimplePDFProcessor:
    """Loads PDFs from a directory, splits them into chunks and indexes them in Chroma."""

    def __init__(self, pdf_dir: str, batch_size: int = 5):
        """
        Initialise the processor.

        Args:
            pdf_dir: Directory containing the PDF files.
            batch_size: Number of pages split per batch; must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero or negative batch size previously produced no chunks at all,
        # reported at most as a per-file processing error.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
        self.pdf_dir = pdf_dir
        self.batch_size = batch_size
        self.embeddings = self._init_embeddings()

    def _init_embeddings(self) -> HuggingFaceEmbeddings:
        """Create the embedding model, on the GPU when one is available."""
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': device}
        )

    def process_pdf(self, file_path: str) -> List:
        """Load one PDF and split it into chunks, a batch of pages at a time.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The chunks produced by the text splitter, or an empty list when
            loading or splitting fails (the error is printed, not raised, so
            one bad file does not abort the whole run).
        """
        try:
            loader = PyPDFLoader(file_path)
            pages = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=150
            )

            splits = []
            # Split the pages batch by batch.
            for i in range(0, len(pages), self.batch_size):
                batch = pages[i:i + self.batch_size]
                splits.extend(text_splitter.split_documents(batch))

                # Collect garbage between batches.
                gc.collect()

            return splits

        except Exception as e:
            print(f"处理文件 {file_path} 时出错: {str(e)}")
            return []

    def create_vectorstore(self, persist_dir: str = "chroma_db"):
        """Process every PDF in ``pdf_dir`` and build a persisted Chroma store.

        Args:
            persist_dir: Directory in which Chroma stores the index.

        Returns:
            The populated vector store.

        Raises:
            ValueError: If no chunks could be extracted from any PDF.
        """
        all_docs = []
        # Match the extension case-insensitively (".PDF" files were skipped)
        # and sort so the processing order is reproducible.
        pdf_files = sorted(
            f for f in os.listdir(self.pdf_dir) if f.lower().endswith('.pdf')
        )

        print(f"找到 {len(pdf_files)} 个PDF文件")

        # Show progress with tqdm.
        for pdf_file in tqdm(pdf_files, desc="处理PDF文件"):
            file_path = os.path.join(self.pdf_dir, pdf_file)
            docs = self.process_pdf(file_path)
            all_docs.extend(docs)
            print(f"- {pdf_file}: 处理了 {len(docs)} 个文档块")

        print(f"\n总共处理了 {len(all_docs)} 个文档块")

        # Fail with a clear message instead of passing an empty collection to Chroma.
        if not all_docs:
            raise ValueError(
                f"No document chunks were produced from PDF files in {self.pdf_dir!r}"
            )

        # Build the vector store.
        vectorstore = Chroma.from_documents(
            documents=all_docs,
            embedding=self.embeddings,
            persist_directory=persist_dir
        )

        # persist() is deprecated (this notebook's own output shows the warning),
        # so call it only when the installed wrapper still provides it.
        if hasattr(vectorstore, "persist"):
            vectorstore.persist()
        return vectorstore

# 4. Main flow
def main():
    """Index every PDF under PDF_DIR into a Chroma store on Google Drive.

    Prints a start and a finish timestamp and returns the resulting vector store.
    """
    # Where the vector store is persisted (replace with your own path).
    VECTOR_STORE_PATH = "/content/drive/MyDrive/vector_store"

    print("开始处理:", datetime.now())

    # batch_size can be tuned to the file sizes and the available memory.
    pdf_processor = SimplePDFProcessor(pdf_dir=PDF_DIR, batch_size=5)
    store = pdf_processor.create_vectorstore(VECTOR_STORE_PATH)

    print("处理完成:", datetime.now())
    return store

# 5. Run the processing
if __name__ == "__main__":
    vectorstore = main()

开始处理: 2025-06-10 09:05:30.903261


  return HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

找到 2 个PDF文件


处理PDF文件:   0%|          | 0/2 [00:00<?, ?it/s]

- Manual_D3.pdf: 处理了 2793 个文档块
- pbapg.pdf: 处理了 1274 个文档块

总共处理了 4067 个文档块
处理完成: 2025-06-10 09:10:13.204513


  vectorstore.persist()


In [None]:
## 8. Set up the RAG chain
# Conversation memory; the chain created below is given this single memory object.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)
# Prompt template; {context} and {question} are the two variables declared in PROMPT below.
rag_template = """Please use next context to answer the questions. If you do not know about the questions, just say you do not know, please do not make up the answers.Please try your best to use the context information.

context：{context}

question：{question}

Please answer with your understanding："""

PROMPT = PromptTemplate(
    template=rag_template,
    input_variables=["context", "question"]
)

# Conversational retrieval chain; the retriever is configured with k=3.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    memory=memory,
    combine_docs_chain_kwargs={"prompt": PROMPT}
)

  memory = ConversationBufferMemory(


In [None]:
## 9. Build the Gradio interface
def respond(message, history):
    """Answer one chat message through the retrieval chain.

    Args:
        message: The user's message text.
        history: Chat history supplied by Gradio. It is not used here; the
            chain keeps its own conversation memory, and that single memory
            object is shared by every caller of this function.

    Returns:
        The chain's answer text.
    """
    # invoke() replaces calling the chain object directly, which is deprecated.
    response = chain.invoke({"question": message})
    return response['answer']

# Create the Gradio chat interface around respond().
demo = gr.ChatInterface(
    respond,
    title="RAG Documents Q&A System",
    description="Based on the Fine-tunning LLama model, the RAG system can answer the questions that related to uploaded documents."
)

# Launch the interface. share=True requests a public share link (see the URL in the output).
demo.launch(share=True)

  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://570a88a3cdeb810ad9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


