## Step1. lora微调模型

**本示例采用qwen2-7B-instruct作为base模型，利用peft框架来进行lora微调。
主要流程为：①导入模型、指令微调数据集；②编写处理上下文的帮手函数；③设置lora参数；④训练**

对于解析pdf，我觉得可以使用pymupdf

### 下载环境


In [2]:
#下载环境
!pip install transformers>=4.37.0
!pip install torch
# !pip install PyPDF2
!pip install docling
!pip install tqdm
!pip install accelerate
!pip install scikit-learn

[0mLooking in indexes: https://mirrors.ivolces.com/pypi/simple/
[0mLooking in indexes: https://mirrors.ivolces.com/pypi/simple/
[0mLooking in indexes: https://mirrors.ivolces.com/pypi/simple/
Collecting accelerate
  Downloading https://mirrors.ivolces.com/pypi/packages/15/33/b6b4ad5efa8b9f4275d4ed17ff8a44c97276171341ba565fdffb0e3dc5e8/accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.33.0
[0m

In [1]:
import os
import re
import sys
import json
import warnings
# import PyPDF2
from tqdm import tqdm
from transformers import AutoModelForCausalLM,AutoTokenizer,Trainer,TrainingArguments
from torch.utils.data import Dataset
import copy
import torch
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from docling.document_converter import DocumentConverter

### 定义训练集、模型、上下文路径


In [2]:
#导入你自己构建的微调数据集
train_input_path='/root/code/LLM_pdf/LLM_pipline/finetune_exmaple.jsonl'

#导入可能用到的上下文目录路径
PDF_PATH='/root/code/pdfs'

device="cuda"
local_docling_model_path = Path("/root/.cache/huggingface/hub/models--ds4sd--docling-models/snapshots/96e8ba4eb46f125ff2abbbdffbdc2a102d0150b4/")
converter = DocumentConverter(artifacts_path=local_docling_model_path)

#导入base模型
model_path="/vepfs/fs_users/lkn/huggingface/hub"  #qwen2-7B-Instruct

### 定义帮手函数（用于处理上下文，解析pdf）

In [4]:




# # 示例数据
# data = {
#     'file-info': {
#         'filename': '10.1002_adem.201700820.pdf',
#         'document-hash': '81329a6ee2745cda5505cb8eab0680caf18aab9b1d12bace90e9e43e6f72636c',
#         '#-pages': 7
#     },
#     'main-text': [
#         {'text': 'FULL PAPER', 'type': 'page-header', 'name': 'Page-header', 'prov': [{'bbox': [50.91675567626953, 752.8856201171875, 129.98294067382812, 765.4323120117188], 'page': 1, 'span': [0, 10]}]},
#         {'text': 'Full Papers', 'type': 'paragraph', 'name': 'Text', 'prov': [{'bbox': [50.53390121459961, 735.7875366210938, 88.59913635253906, 744.0340576171875], 'page': 1, 'span': [0, 11]}]},
#         {'text': 'www.aem-journal.com', 'type': 'paragraph', 'name': 'Text', 'prov': [{'bbox': [472.0702209472656, 735.7813110351562, 547.0069580078125, 743.63330078125], 'page': 1, 'span': [0, 19]}]},
#         {'text': 'Effects of Initial δ Phase on Creep Behaviors and Fracture Characteristics of a Nickel-Based Superalloy', 'type': 'subtitle-level-1', 'name': 'Section-header', 'prov': [{'bbox': [51.02360153198242, 674.0831298828125, 496.2049865722656, 717.9251098632812], 'page': 1, 'span': [0, 103]}]},
#         {'text': 'Y. C. Lin,* Liang-Xing Yin, Shun-Cun Luo, Dao-Guang He, and Xiao-Bin Peng', 'type': 'paragraph', 'name': 'Text', 'prov': [{'bbox': [50.96749496459961, 643.3577270507812, 495.8893127441406, 657.5748901367188], 'page': 1, 'span': [0, 73]}]},
#         {'text': '1700820 (1 of 7)', 'type': 'page-footer', 'name': 'Page-footer', 'prov': [{'bbox': [267.6825256347656, 37.788795471191406, 330.2043762207031, 47.574771881103516], 'page': 1, 'span': [0, 16]}]},
#         {'text': 'alloy, etc. Lin et al. [55] investigated the fracture characteristics of a typical Ni-based superalloy, and found that the combined effects of localized necking and microvoid coalescence cause the final fracture of specimens.', 'type': 'paragraph', 'name': 'Text', 'prov': [{'bbox': [304.8879699707031, 69.45539855957031, 553.051025390625, 431.31536865234375], 'page': 1, 'span': [0, 2100]}]},
#         {'name': 'Picture', 'type': 'figure', '$ref': '#/figures/0'}
#     ],
#     'figures': [
#         {'caption': 'Figure 7. SEM fractographs of creep-ruptured nickel-based superalloy...'}
#     ]
# }

# 使用Docling提取PDF内容
def extract_pdf_content(pdf_path,converter):
    
    # print(pdf_path)
    print("begin! :", pdf_path)
    doc = converter.convert_single(pdf_path)
    print("end! :", pdf_path)
    torch.cuda.empty_cache()
    return doc.render_as_dict()

# 提取主文本内容并添加页码信息
def extract_main_text(data, pages=None):
    main_text = []
    for element in data.get('main-text', []):
        # 检查每个条目是否具有 prov 属性
        if 'prov' in element and element['prov']:
            page_number = element['prov'][0]['page']
        else:
            page_number = 'unknown'  # 标记为未知页码
        if pages and int(page_number) not in pages:
            continue  # 如果页面不在指定范围内，跳过该页面
        # 如果存在 $ref 字段，替换为引用的内容
        if '$ref' in element:
            ref = element['$ref']
            ref_index = int(ref.split('/')[-1])
            element_content = data.get('figures', [])[ref_index]
            main_text.append({
                'page': page_number,
                'type': element.get('type', 'unknown'),
                'name': element.get('name', 'unknown'),
                'text': element_content  # 使用引用内容替换
            })
        else:
            main_text.append({
                'page': page_number,
                'type': element.get('type', 'unknown'),
                'name': element.get('name', 'unknown'),
                'text': element.get('text', '')
            })
    return main_text

# 将表格数据转换为字符串，用于检索
def table_to_text(table):
    return table.get('title') + '\n' + '\n'.join(['\t'.join(row) for row in table.get('data', [])])

# 提取表格内容并添加页码信息
def extract_tables(data, pages = None):
    tables = []
    for table in data.get('tables', []):
        page_number = table['prov'][0]['page']
        if pages and int(page_number) not in pages:
            continue  # 如果页面不在指定范围内，跳过该页面
        table_data = {
            'page': page_number,
            'title': table.get('text', ''),
            'type': table.get('type', 'unknown'),
            'columns': table.get('#-cols', 0),
            'rows': table.get('#-rows', 0),
            'text': ''
        }
        table_text = table.get('title', '') + '\n'
        table_text += '\n'.join(['\t'.join(cell.get('text', '') for cell in row) for row in table.get('data', [])])
        table_data['text'] = table_text
        tables.append(table_data)
    return tables

# 将表格信息组合成字符串
def combine_table_info(table):
    return f"Title: {table['title']}\nColumns: {table['columns']}\nRows: {table['rows']}\n{table['text']}"

# 构建最终的JSON结构
def build_final_json(data, pages = None):
    main_texts = extract_main_text(data, pages)
    tables = extract_tables(data, pages)
    combined_content = []

    for item in main_texts:
        combined_content.append(item)
    
    # 将表格对象转换为统一的结构
    for table in tables:
        combined_content.append({
            'page': table['page'],
            'type': table['type'],
            'name': table['title'],
            'text': combine_table_info(table),
        })

    return {
        'filename': data['file-info']['filename'],
        'number-of-pages': data['file-info']['#-pages'],
        'combined-content': combined_content
    }
    
# 检索和组合PDF内容
def parse_pdf_and_concate(obj, converter):
    pdf_path = obj["doi"].replace('/', '_').replace(' (Supporting Information)', '_si') + '.pdf'
    pdf_path = os.path.join(PDF_PATH, pdf_path)
    task_type = obj["task"]
    if task_type == "1":
        obj["input"].append({"role": "user", "content": ""})
        return
    if "pages" in obj and obj["pages"] != [1, -1]:
        pages = range(obj["pages"][0], obj["pages"][1] + 1)
    else:
        pages = None
    # print(pdf_path)
    try:
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
        data = extract_pdf_content(pdf_path,converter)
    

        # 构建最终的JSON结构
        final_json = build_final_json(data,pages=pages)
        # print(final_json)
        # 提取问题作为关键词
        question = next((entry["content"] for entry in obj["input"] if entry["role"] == "user"), "")

        all_texts = [
                element['text'] if isinstance(element['text'], str) else json.dumps(element['text'])
                for element in final_json['combined-content']
            ]
        
            # 初始化 TfidfVectorizer 实例
        torch.cuda.empty_cache()
        print("begin for vectorizer")
        tfidf_vectorizer = TfidfVectorizer(use_idf=True)

        # 对所有文档进行拟合（学习词汇表和IDF）
        fitted_vectorizer = tfidf_vectorizer.fit(all_texts)
        print("fit vectorizer")
        # 将文档集合转换为TF-IDF特征矩阵
        tfidf_vectorizer_vectors = fitted_vectorizer.transform(all_texts)

        # 将问题转换为与文档集合相同空间的TF-IDF向量
        question_vec = fitted_vectorizer.transform([question])
        print("transform vectorizer")

        # 计算余弦相似度
        cosine_similarities = cosine_similarity(question_vec, tfidf_vectorizer_vectors).flatten()
        print("end vectorizer")

        # 找到最相关的内容索引
        relevant_content_indices = cosine_similarities.argsort()[-5:][::-1]  # 选择最相关的5个内容

        # 构建初步上下文
        relevant_contents = [all_texts[i] for i in relevant_content_indices]
    except Exception as e:
        print("Error PDF is: ", pdf_path)
        relevant_contents=""
    attached_file_content = "\nThe file is as follows:\n\n" + " ".join(relevant_contents)

    # 选择性获取上下文信息，限制长度
    attached_file_content = attached_file_content[:1024]
    obj["input"].append({"role": "user", "content": attached_file_content})
    torch.cuda.empty_cache()

In [7]:
extract_pdf_content("/root/code/pdfs/10.1016_j.matchar.2018.06.029.pdf",converter)



An unexpected error occurred while opening the document 10.1016_j.matchar.2018.06.029.pdf
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.10/site-packages/docling/datamodel/document.py", line 93, in __init__
    self._backend = pdf_backend(
  File "/root/miniconda3/lib/python3.10/site-packages/docling/backend/docling_parse_backend.py", line 193, in __init__
    self._pdoc = pdfium.PdfDocument(path_or_stream)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 78, in __init__
    self.raw, to_hold, to_close = _open_pdf(self._input, self._password, self._autoclose)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 678, in _open_pdf
    raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
pypdfium2._helpers.misc.PdfiumError: Failed to load document (PDFium: Data format error).


begin! : /root/code/pdfs/10.1016_j.matchar.2018.06.029.pdf


RuntimeError: Conversion failed with status: 3

In [12]:
torch.cuda.empty_cache()

In [11]:
# #  从PDF文件中提取文本，并以字符串列表的形式返回。
# #    参数：
# #        pdf_path: PDF文件的路径。
# #        add_page_num: 是否在每页文本的开头添加页码。
# #    返回：
# #        texts: 一个字符串列表，其中每个字符串都是一页的文本。

# def extract_text(pdf_path, ) -> list[str]:
#     texts = []
#     try:
#         # Open the PDF file
#         doc = fitz.open(pdf_path)
#         for page_num in range(doc.page_count):
#             page = doc.load_page(page_num)
#             text = page.get_text("text")  # Extract text from the page
#             if text:
#                 text = f"Page {page_num + 1}:\n{text}\n"
#                 texts.append(text)
#     except Exception as e:
#         print(f"Error while processing PDF: {e}")
#     return texts


# #读取doi字段，根据路径去解析相应的pdf，并根据"pages"字段来截取需要的上下文，把上下文作为user prompt append进原来的input list
# def parse_pdf_and_concate(obj):
#     pdf_path=obj["doi"]
#     pdf_path = pdf_path.replace('/', '_').replace(' (Supporting Information)', '_si')
#     pdf_path=PDF_PATH+pdf_path+'.pdf'
#     attach_content_list=extract_text(pdf_path=pdf_path)
#     if "pages" in obj and obj["pages"] != [1,-1] :
#         #例如 pages=[5,6] 代表attach_content_list 中第4个str和第五个str
#         index=obj["pages"]
#         attach_content_list=attach_content_list[index[0]-1:index[1]]
    
#     attached_file_content = "\nThe file is as follows:\n\n" + "".join(attach_content_list)
#     #  选择性的获取上下文信息，而不是全部内容
#     attached_file_content = attached_file_content[:1024]   
#     obj["input"].append({"role":"user","content":attached_file_content})


### 导入模型和tokenzier

In [5]:
#导入模型
local_cache_dir = "/vepfs/fs_users/lkn/huggingface/hub"

# 加载模型和分词器时指定 cache_dir
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct", cache_dir=local_cache_dir)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", cache_dir=local_cache_dir,torch_dtype=torch.float16)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
#工具函数，读取测试集jsonl到list
def read_jsonl(file_path):    
    data=[]
    with open(file_path,'r',encoding='utf-8') as f:
        for line in f:
            try:
                obj=json.loads(line.strip())
                data.append(obj)
            except json.JSONDecodeError as e :
                print(f"Error decoding JSON:{e}")
    return data

train_data=read_jsonl(train_input_path)

#帮手函数，提取ideal的值，因为ideal有可能是个list，也可能是个str
def extract_ideal_value(ideal):
    if isinstance(ideal, list) and len(ideal) > 0:
        return ideal[0]
    elif isinstance(ideal, str):
        return ideal
    else:
        return None  

In [33]:
train_data[0]

{'input': [{'role': 'system',
   'content': 'You are a highly intelligent assistant who answers the following multiple choice question correctly.'},
  {'role': 'system', 'content': ''},
  {'role': 'system', 'content': 'Only write the answer down.'},
  {'role': 'user',
   'content': 'Which of these evolutionary agents is most consistent at causing populations to become better suited to their environments over the course of generations?\n\na) Mutation\n\nb) Non-random mating\n\nc) Gene flow\n\nd) Natural selection'},
  {'role': 'assistant', 'content': 'd) Natural selection'},
  {'role': 'assistant', 'content': 'd) Natural selection'},
  {'role': 'assistant', 'content': 'd) Natural selection'}],
 'ideal': 'd) Natural selection',
 'option': ['a) Mutation',
  'b) Non-random mating',
  'c) Gene flow',
  'd) Natural selection'],
 'task': '1'}

### 构建自定义的Dataset和Collator

In [7]:
#导入训练集
class sftDataset(Dataset):
    def __init__(self,data,tokenizer,doc_converter):
        #data是个list
        self.data=data
        self.tokenizer=tokenizer
        self.doc_converter=doc_converter

    def __getitem__(self, index) :
        #把对应论文的内容concat到user content的后面。具体对pdf怎么处理由选手决定，此处仅是一个简单的示例。
        if "pages" in self.data[index]:
            #处理pdf并拼接
            parse_pdf_and_concate(self.data[index],self.doc_converter)
        input=self.data[index]["input"]
        ideal=extract_ideal_value(self.data[index]["ideal"])
        output=ideal

        input.append({"role":"assistant","content":output})
        #msg是个list：
        '''  msg= [
          {"role": "system", "content": "You are an expert in the electrolytes field. Please answer the following multiple choice question correctly.\nOnly write the option (e.g., a), b), c), or d)) without explanation."},
          {"role": "user", "content": "In the upper paper, what are the minimum and maximum intramolecular distancesnm) of dimethyl carbonate?\n\na) 0.41/0.87\nb) 0.49/0.67\nc) 0.25/0.25\nd) 0.25/0.38"},
          {"role":"assistant","content":"a) 0.41/0.87"},
            ]
        '''
        response=self.tokenizer.apply_chat_template(input,tokenize=False, add_generation_prompt=False)
        input=response.split("<|im_start|>assistant\n")[0]
        input+="<|im_start|>assistant\n"
        return dict(input_ids=input, labels=response)
    
    def __len__(self):
        return len(self.data)

#把输入从字符串改为特定的token list
class Collator(object):
    def __init__(self,only_train_response,tokenizer):
        self.only_train_response=only_train_response
        self.tokenizer=tokenizer
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
        
    def __call__(self,batch):
        input_texts=[d["input_ids"] for d in batch]
        full_texts=[d["labels"] for d in batch]

        inputs=self.tokenizer(
            text=full_texts,
            text_target=input_texts,
            return_tensors="pt",
            padding="longest",
            max_length=1024,
            truncation=True,
            return_attention_mask=True,
        )
        labels=copy.deepcopy(inputs["input_ids"])
        if self.only_train_response:
            # ignore padding
            labels[labels==self.tokenizer.pad_token_id]=-100
            # ignore input text
            labels[torch.where(inputs["labels"] !=self.tokenizer.pad_token_id)]=-100

        inputs["labels"]=labels
        return inputs

In [8]:
sft_dataset=sftDataset(train_data,tokenizer,converter)
only_train_response=True
data_collator=Collator(only_train_response,tokenizer)

In [9]:
sft_dataset.__getitem__(0)

{'input_ids': '<|im_start|>system\nYou are a highly intelligent assistant who answers the following multiple choice question correctly.<|im_end|>\n<|im_start|>system\n<|im_end|>\n<|im_start|>system\nOnly write the answer down.<|im_end|>\n<|im_start|>user\nWhich of these evolutionary agents is most consistent at causing populations to become better suited to their environments over the course of generations?\n\na) Mutation\n\nb) Non-random mating\n\nc) Gene flow\n\nd) Natural selection<|im_end|>\n<|im_start|>assistant\n',
 'labels': '<|im_start|>system\nYou are a highly intelligent assistant who answers the following multiple choice question correctly.<|im_end|>\n<|im_start|>system\n<|im_end|>\n<|im_start|>system\nOnly write the answer down.<|im_end|>\n<|im_start|>user\nWhich of these evolutionary agents is most consistent at causing populations to become better suited to their environments over the course of generations?\n\na) Mutation\n\nb) Non-random mating\n\nc) Gene flow\n\nd) Natu

In [12]:
!pip install peft

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting peft
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/62/72/fcabddf222ec938c3cbd5616e5a72796938b5235897e07a1fcc2a8e7735e/peft-0.12.0-py3-none-any.whl (296 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: peft
Successfully installed peft-0.12.0
[0m

Error: INVALID mime type: status. Must be in the format "type/subtype[;optionalparameter]"

In [9]:
from peft import LoraConfig,TaskType,get_peft_model,PeftModel

### 定义lora参数

In [10]:
#定义训练参数
args = TrainingArguments(
    output_dir="./personal/Qwen2_instruct_lora",#设置输出路径
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=500,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=False
)

#定义lora参数
from peft import LoraConfig,TaskType,get_peft_model,PeftModel
config=LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj"],
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)


### 训练

In [11]:
lora_model=get_peft_model(model,config)
# 打印显存占用情况
def print_memory_usage():
    allocated = torch.cuda.memory_allocated() / (1024 ** 3)  # 转换为GB
    reserved = torch.cuda.memory_reserved() / (1024 ** 3)  # 转换为GB
    print(f"显存分配: {allocated:.2f} GB")
    print(f"显存预留: {reserved:.2f} GB")

# 打印显存占用情况
print("定义Trainer之前的显存情况:")
print_memory_usage()
#定义trainer
trainer=Trainer(
    model=lora_model,
    args=args,
    data_collator=data_collator,
    train_dataset=sft_dataset,
)
print("定义Trainer之后的显存情况:")
print_memory_usage()
trainer.train()


Detected kernel version 5.4.250, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


定义Trainer之前的显存情况:
显存分配: 0.23 GB
显存预留: 0.26 GB
定义Trainer之后的显存情况:
显存分配: 14.92 GB
显存预留: 15.03 GB
begin! : /root/code/pdfs/10.1016_j.mprp.2018.02.001.pdf




end! : /root/code/pdfs/10.1016_j.mprp.2018.02.001.pdf


Step,Training Loss
10,0.0


begin! : /root/code/pdfs/10.1021_acs.orglett.6b01704.pdf
end! : /root/code/pdfs/10.1021_acs.orglett.6b01704.pdf
begin! : /root/code/pdfs/10.1016_j.polymer.2014.12.060.pdf
end! : /root/code/pdfs/10.1016_j.polymer.2014.12.060.pdf


An unexpected error occurred while opening the document 10.1021_jacs.6b05418.pdf
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.10/site-packages/docling/datamodel/document.py", line 93, in __init__
    self._backend = pdf_backend(
  File "/root/miniconda3/lib/python3.10/site-packages/docling/backend/docling_parse_backend.py", line 193, in __init__
    self._pdoc = pdfium.PdfDocument(path_or_stream)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 78, in __init__
    self.raw, to_hold, to_close = _open_pdf(self._input, self._password, self._autoclose)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 678, in _open_pdf
    raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
pypdfium2._helpers.misc.PdfiumError: Failed to load document (PDFium: Data format error).


begin! : /root/code/pdfs/10.1021_jacs.6b05418.pdf
Error PDF is:  /root/code/pdfs/10.1021_jacs.6b05418.pdf
begin! : /root/code/pdfs/US9750738.pdf
end! : /root/code/pdfs/US9750738.pdf
begin! : /root/code/pdfs/10.1038_srep14202.pdf
end! : /root/code/pdfs/10.1038_srep14202.pdf
begin! : /root/code/pdfs/10.1002_cjoc.201500265.pdf
end! : /root/code/pdfs/10.1002_cjoc.201500265.pdf
begin! : /root/code/pdfs/10.1021_acs.orglett.6b01658.pdf
end! : /root/code/pdfs/10.1021_acs.orglett.6b01658.pdf
begin! : /root/code/pdfs/10.1002_asia.201402019.pdf
end! : /root/code/pdfs/10.1002_asia.201402019.pdf
begin! : /root/code/pdfs/10.1021_acs.orglett.6b01595.pdf
end! : /root/code/pdfs/10.1021_acs.orglett.6b01595.pdf
begin! : /root/code/pdfs/10.1016_j.matchar.2018.06.029.pdf


An unexpected error occurred while opening the document 10.1016_j.matchar.2018.06.029.pdf
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.10/site-packages/docling/datamodel/document.py", line 93, in __init__
    self._backend = pdf_backend(
  File "/root/miniconda3/lib/python3.10/site-packages/docling/backend/docling_parse_backend.py", line 193, in __init__
    self._pdoc = pdfium.PdfDocument(path_or_stream)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 78, in __init__
    self.raw, to_hold, to_close = _open_pdf(self._input, self._password, self._autoclose)
  File "/root/miniconda3/lib/python3.10/site-packages/pypdfium2/_helpers/document.py", line 678, in _open_pdf
    raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
pypdfium2._helpers.misc.PdfiumError: Failed to load document (PDFium: Data format error).


Error PDF is:  /root/code/pdfs/10.1016_j.matchar.2018.06.029.pdf
begin! : /root/code/pdfs/10.1021_je3003089.pdf
end! : /root/code/pdfs/10.1021_je3003089.pdf
begin! : /root/code/pdfs/10.1021_je3003089.pdf
end! : /root/code/pdfs/10.1021_je3003089.pdf


OutOfMemoryError: CUDA out of memory. Tried to allocate 404.00 MiB (GPU 0; 23.65 GiB total capacity; 22.48 GiB already allocated; 28.75 MiB free; 22.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [17]:
def print_memory_usage():
    allocated = torch.cuda.memory_allocated() / (1024 ** 3)  # 转换为GB
    reserved = torch.cuda.memory_reserved() / (1024 ** 3)  # 转换为GB
    print(f"显存分配: {allocated:.2f} GB")
    print(f"显存预留: {reserved:.2f} GB")
print("定义Trainer之前的显存情况:")
print_memory_usage()
def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()
    size_all_mb = (param_size + buffer_size) / (1024 ** 2)
    return size_all_mb

# Assuming lora_model is your model
model_size = get_model_size(model)
print(f"模型大小: {model_size:.2f} MB")

定义Trainer之前的显存情况:
显存分配: 0.00 GB
显存预留: 0.00 GB
模型大小: 14973.64 MB


### 【补充】平台使用帮助
baseline中模型的输出结果、微调/合并示例中模型权重的保存位置都可以根据选手的需要进行修改，可以考虑保存到个人的文件夹下（/personal/），然后再挂载到创建的数据集中。
平台数据集的使用规则可以参考：
https://bohrium-doc.dp.tech/docs/userguide/Dataset/