<a href="https://colab.research.google.com/github/erberry/ThinkML/blob/main/create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 下载资料

In [None]:
!mkdir doc

In [None]:
# Clone the LawRefBook/Laws repository into doc/Laws; its files are the input of the text-extraction step.
!cd doc && git clone https://github.com/LawRefBook/Laws.git

Cloning into 'Laws'...
remote: Enumerating objects: 8067, done.[K
remote: Counting objects: 100% (1633/1633), done.[K
remote: Compressing objects: 100% (1311/1311), done.[K
remote: Total 8067 (delta 347), reused 1515 (delta 294), pack-reused 6434[K
Receiving objects: 100% (8067/8067), 40.59 MiB | 17.11 MiB/s, done.
Resolving deltas: 100% (3887/3887), done.


## 转为文本

In [None]:
!pip install markdown
!pip install python-docx
!pip install openpyxl
!pip install python-pptx
!pip install bs4

In [None]:
import os
import markdown
import docx
import openpyxl
import pptx
from bs4 import BeautifulSoup

################################################################################
### Step 1
################################################################################


def parse_file(file_path, save_text_folder, parser):
    """Extract the text of one file and save it as ``<file name>.txt``.

    Args:
        file_path: Path of the source document.
        save_text_folder: Directory that receives the text file. It is created
            when it does not exist yet.
        parser: Callable that takes ``file_path`` and returns the extracted
            text as a string.
    """
    text = parser(file_path)
    # The output name keeps the full source name, extension included
    # (``law.md`` -> ``law.md.txt``).
    # NOTE(review): only the base name is used, so two sources with the same
    # name in different folders write to the same output file.
    file_name = os.path.basename(file_path)
    text_file_path = os.path.join(save_text_folder, f'{file_name}.txt')
    # Create the target directory so the function also works when called on
    # its own. An empty folder name means "current directory" and needs no
    # creation (os.makedirs('') would raise).
    if save_text_folder:
        os.makedirs(save_text_folder, exist_ok=True)
    with open(text_file_path, 'w', encoding='utf-8') as f:
        f.write(text)

def remove_center_tag(soup):
    """Detach every ``<center>`` element, together with its content, from ``soup``.

    The tree is modified in place; nothing is returned.
    """
    centers = soup.find_all('center')
    for node in centers:
        node.extract()

# def parse_html_file(html_file_path, save_text_folder):
#     # 读取html文件内容
#     with open(html_file_path, 'r', encoding='utf-8') as f:
#         html_content = f.read()
#     # 使用BeautifulSoup解析html文件
#     soup = BeautifulSoup(html_content, 'html.parser')
#     # 去除center标签（印象笔记导出后在center标签中包含了一大串不可读的字符）
#     remove_center_tag(soup)
#     # 获取html文件名称
#     html_file_name = os.path.basename(html_file_path)
#     # 去除html后缀
#     html_file_name = re.sub(r'\.html$', '', html_file_name)
#     # 拼接text文件路径
#     text_file_path = os.path.join(save_text_folder, f'{html_file_name}.txt')
#     # 打开text文件并写入解析后的文本内容
#     with open(text_file_path, 'w', encoding='utf-8') as f:
#         f.write(soup.get_text())

def html_parser(file_path, content=''):
    """Return the plain text of an HTML document.

    Args:
        file_path: HTML file to read; only opened when ``content`` is empty.
        content: HTML markup to parse instead of reading ``file_path``.

    Returns:
        The document text after ``<center>`` elements have been removed
        (Evernote exports put a long run of unreadable characters inside them).
    """
    html = content
    if len(html) == 0:
        with open(file_path, 'r', encoding='utf-8') as handle:
            html = handle.read()
    tree = BeautifulSoup(html, 'html.parser')
    remove_center_tag(tree)
    return tree.get_text()

def markdown_parser(file_path):
    """Render a Markdown file to HTML and return the plain text of the result."""
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_markdown = source.read()
    rendered_html = markdown.markdown(raw_markdown)
    # Reuse the HTML path so tags are stripped the same way as for .html files.
    return html_parser(file_path, content=rendered_html)

def docx_parser(file_path):
    """Return the text of every entry in ``Document.paragraphs``, one per line."""
    document = docx.Document(file_path)
    return '\n'.join([paragraph.text for paragraph in document.paragraphs])

def xlsx_parser(file_path):
    """Return all string cell values of a workbook, one per line.

    Sheets are visited in ``sheetnames`` order and cells row by row. Values
    that are not ``str`` (including empty cells) are skipped.
    """
    workbook = openpyxl.load_workbook(file_path)
    collected = []
    for title in workbook.sheetnames:
        sheet = workbook[title]
        for row in sheet.values:
            # isinstance() already rejects None, so no separate check is needed.
            collected.extend([value for value in row if isinstance(value, str)])
    return '\n'.join(collected)

def pptx_parser(file_path):
    """Return the non-empty ``text`` of every shape in a presentation, one per line."""
    presentation = pptx.Presentation(file_path)
    pieces = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        # Skip shapes without a text attribute and shapes whose text is empty.
        if hasattr(shape, 'text') and shape.text
    ]
    return '\n'.join(pieces)


def parse_folder(html_folder_path, save_text_folder):
    """Walk a folder tree and convert every supported document to a text file.

    Supported extensions, matched case-insensitively: ``.html``, ``.md``,
    ``.docx``, ``.xlsx`` and ``.pptx``. Each supported file is passed to
    ``parse_file`` together with the matching parser. Legacy ``.doc`` files
    are reported and skipped; all other files are ignored silently.

    Args:
        html_folder_path: Root folder to scan recursively. Despite the name it
            is not limited to HTML files.
        save_text_folder: Folder that receives the ``.txt`` files; created
            when missing.
    """
    os.makedirs(save_text_folder, exist_ok=True)

    for root, _, files in os.walk(html_folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            lowered = file.lower()
            if lowered.endswith('.html'):
                parser = html_parser
            elif lowered.endswith('.md'):
                parser = markdown_parser
            elif lowered.endswith('.docx'):
                parser = docx_parser
            elif lowered.endswith('.xlsx'):
                parser = xlsx_parser
            elif lowered.endswith('.pptx'):
                parser = pptx_parser
            elif lowered.endswith('.doc'):
                # python-docx only reads the .docx (Office Open XML) format;
                # sending a binary .doc to it aborts the whole run.
                print(f'跳过{file_path}：不支持旧版.doc格式，请先转换为.docx')
                continue
            else:
                # Unsupported type (for example the contents of .git): nothing
                # to extract, so do not announce an extraction either.
                continue
            print(f'提取{file_path}到文本...')
            parse_file(file_path, save_text_folder, parser)

# Read every file under doc/, extract its text and write it into the text/ folder.
if __name__ == '__main__':
    parse_folder(html_folder_path='doc', save_text_folder='text')
    print('---------------------------text生成完毕，写入text文件夹')


## 转为 csv

In [None]:
!pip install pandas

In [None]:
import os
import pandas as pd

################################################################################
### Step 2
################################################################################

# Target maximum length, in characters, of the text of one CSV row. The row
# text is "<file name>。<block>", so toCsv() splits any block longer than
# max_len - len(file name) - 1.
max_len = 728

def remove_newlines(serie):
    """Turn line breaks into "。" and tidy up the result.

    Args:
        serie: pandas Series of strings.

    Returns:
        A new Series in which newline characters and literal backslash-n
        sequences are replaced by "。", runs of "。" are collapsed to one, and
        double spaces are replaced by single spaces in two passes.
    """
    # `regex` is passed explicitly on every call: the default of
    # Series.str.replace differs between pandas versions, which changes what
    # the two-character pattern '\\n' matches and triggers a FutureWarning on
    # older versions.
    serie = serie.str.replace('\n', '。', regex=False)
    serie = serie.str.replace('\\n', '。', regex=False)
    serie = serie.str.replace('。+', '。', regex=True)
    # Two passes: a run of up to four spaces ends up as a single space.
    serie = serie.str.replace('  ', ' ', regex=False)
    serie = serie.str.replace('  ', ' ', regex=False)
    return serie


def toCsv():
    """Build ``processed/scraped.csv`` from the text files in ``text/``.

    Each file is cut into blocks at every "\\n第" (a line that starts with
    "第"). Blocks longer than the per-row budget are split further with
    ``split_into_many``. Every row's text is "<file name>。<block>" with line
    breaks removed by ``remove_newlines``.

    Raises:
        Exception: If ``text/`` contains no files.
    """
    os.makedirs('processed', exist_ok=True)

    # (display name, content) for every text file.
    texts = []
    # Sorted so the CSV row order is the same on every run.
    for file in sorted(os.listdir("text/")):
        file_path = os.path.join("text", file)
        # Sub-directories cannot be opened as text files.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="UTF-8") as f:
            text = f.read()
        # Replace - and _ with spaces and drop "#update" from the name.
        texts.append((file.replace('-', ' ').replace('_', ' ').replace('#update', ''), text))

    if len(texts) == 0:
        raise Exception('没有可用的文本文件，请检查text目录')

    lines = []
    for fname, text in texts:
        # Room left for the block once "<fname>。" has been prepended.
        budget = max_len - len(fname) - 1
        for index, block in enumerate(text.split('\n第')):
            # split() removed the "第" only from the blocks after the first
            # separator. The first block (title/preamble) never had it
            # removed, so it must not get an extra one.
            if index > 0:
                block = '第' + block
            # Skip blocks without content, e.g. when the file starts with "\n第".
            if not block.strip():
                continue
            if len(block) > budget:
                chunks = split_into_many(block, budget)
                lines += [(fname, chunk.strip()) for chunk in chunks]
            else:
                lines.append((fname, block.strip()))

    # One row per block; the text column is prefixed with the file name.
    df = pd.DataFrame(lines, columns=['fname', 'text'])
    df['text'] = df.fname + "。" + remove_newlines(df.text)
    df.to_csv('processed/scraped.csv')

# Split text into chunks of at most max_char_len characters.
def split_into_many(text, max_char_len):
    """Group the sentences of ``text`` into chunks of limited length.

    Sentences are the non-empty pieces between "。" characters. Each chunk is
    a run of consecutive sentences, every one followed by "。", whose total
    length does not exceed ``max_char_len``.

    Args:
        text: Text to split.
        max_char_len: Maximum length of a chunk, terminators included.

    Returns:
        List of chunk strings; empty when ``text`` contains no sentence that
        fits. A sentence that cannot fit into a chunk on its own is dropped.
    """
    chunks = []
    chunk = []
    charlen_so_far = 0

    for sentence in text.split('。'):
        # Empty pieces come from a trailing or doubled "。"; keeping them
        # would only produce "。。" in the output.
        if not sentence:
            continue

        # Length the sentence takes in a chunk, its "。" terminator included.
        cost = len(sentence) + 1

        # Close the current chunk when the sentence no longer fits. An empty
        # buffer is never emitted, so no bare "。" chunk is produced.
        if charlen_so_far + cost > max_char_len:
            if chunk:
                chunks.append("。".join(chunk) + "。")
            chunk = []
            charlen_so_far = 0

        # A sentence that is too long even for an empty chunk is skipped.
        if cost > max_char_len:
            continue

        chunk.append(sentence)
        charlen_so_far += cost

    # Emit whatever is left in the buffer.
    if chunk:
        chunks.append("。".join(chunk) + "。")

    return chunks

# Build processed/scraped.csv from the files in text/ when this cell is run.
if __name__ == '__main__':
    toCsv()
    print('---------------------------csv生成完毕，写入processed/scraped.csv文件')




  serie = serie.str.replace('\\n', '。')


---------------------------csv生成完毕，写入processed/scraped.csv文件


## 创建 dataset

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the generated CSV with the generic "csv" builder of the datasets library.
law_it_dataset = load_dataset("csv", data_files="processed/scraped.csv")


# Preview the first 10 rows of the 'train' split as a sanity check.
print(law_it_dataset['train'][:10])



## 授权 Hugging Face，并将数据集保存到自己的空间

token 可以在 Hugging Face 个人设置（Settings → Access Tokens）中获取；需要新建一个具有写入（write）权限的 token。

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; enter the write-enabled access token there.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the dataset to the Hugging Face Hub under the repository name "github-lll"
# (presumably created in the namespace of the logged-in account — confirm).
law_it_dataset.push_to_hub("github-lll")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/157 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/431 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
# Load the published dataset back from the Hub and print its summary.
law_dataset = load_dataset("erberry/github-lll")
print(law_dataset)