In [1]:
from datasets import *

## datasets基本使用
#### 加载在线数据集

In [None]:
# Load the "madao33/new-title-chinese" dataset by name; no `split` is passed here.
# NOTE: the variable is called `datasets`, the same as the library package name;
# later cells index it by split name (e.g. datasets["train"]).
datasets = load_dataset("madao33/new-title-chinese")
datasets

#### 加载数据集合集中的某一项任务

In [None]:
# Load one named task ("boolq") out of the "super_glue" dataset collection.
boolq_dataset = load_dataset("super_glue", "boolq")
boolq_dataset

#### 按照数据集划分进行加载

In [None]:
# Load only the "train" split. The commented-out lines are alternative `split`
# strings (index ranges, percentages, and lists of several split expressions).
# NOTE(review): "train[100]" has no ':' in the brackets — confirm this form is accepted.
dataset = load_dataset("madao33/new-title-chinese", split="train")
# dataset = load_dataset("madao33/new-title-chinese", split="train[100]")
# dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
# dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
# dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "validation[:10%]")
# dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]")
dataset

#### 查看数据集

In [None]:
# Show the first example of the "train" split. The commented-out lines are other
# ways to look at the data: a slice of rows, one column, column names, features.
datasets["train"][0]
# datasets["train"][:2]
# datasets["train"]["title"][:5]
# datasets["train"].column_names
# datasets["train"].features

#### 数据集划分

In [None]:
# Split the "train" data with test_size=0.1. The result is only displayed,
# it is not assigned to a variable.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1)

In [None]:
# Same split on the boolq training data, stratified by the "label" column.
# The result is only displayed, it is not assigned to a variable.
dataset = boolq_dataset["train"]
dataset.train_test_split(test_size=0.1, stratify_by_column="label")

#### 数据选取与过滤

In [None]:
# Select: pick rows by index (here indices 0 and 1) from the "train" split.
datasets["train"].select([0,1])

In [None]:
# Filter: keep only the examples whose "title" contains the substring "中国",
# then show the first five remaining titles.
filter_dataset = datasets["train"].filter(lambda  example:"中国" in example["title"])
filter_dataset["title"][:5]

#### 数据映射

In [None]:
def add_prefix(example):
    """Prepend the literal text "prefix:" to the example's "title" field.

    The mapping is modified in place and the same object is returned.
    """
    original_title = example["title"]
    example["title"] = "prefix:" + original_title
    return example

In [None]:
# Run add_prefix over `datasets` via map, then show the "title" values of the
# first ten "train" rows.
prefix_dataset = datasets.map(add_prefix)
prefix_dataset["train"][:10]["title"]

In [None]:
# Load the tokenizer for the "bert-base-chinese" checkpoint; it is used as the
# default `tokenizer` argument of preprocess_function below.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def preprocess_function(example, tokenizer=tokenizer):
    """Tokenize "content" as the model inputs and "title" as the labels.

    Args:
        example: Mapping with "content" and "title" entries, as handed over by
            ``map`` (one row, or a batch of rows when ``batched=True``).
        tokenizer: Tokenizer to apply; defaults to the notebook-level
            ``tokenizer`` bound at the time this function is defined.

    Returns:
        The tokenizer output for "content", with an added "labels" entry that
        holds the ``input_ids`` of the tokenized "title".
    """
    model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
    # Fixed typo: the keyword was spelled `max_legth`, so the 32-token limit
    # for titles was never passed to the tokenizer as `max_length`.
    labels = tokenizer(example["title"], max_length=32, truncation=True)
    # The labels are the token ids of the encoded title.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function through map without `batched`, then display the result.
process_datasets = datasets.map(preprocess_function)
process_datasets

In [None]:
# Same mapping, this time with batched=True.
process_datasets = datasets.map(preprocess_function, batched=True)

In [None]:
# process_datasets = datasets.map(preprocess_function, num_proc=4) #

In [None]:
# Batched mapping that also passes the original "train" column names as
# `remove_columns`.
process_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)

#### 保存与加载

In [None]:
# Persist the processed data to the local directory "./processed_data".
process_datasets.save_to_disk("./processed_data")

In [None]:
# Reload the data saved under "./processed_data". The loader returns a new
# object instead of updating the one it is called on, so the result has to be
# assigned; before this fix it was discarded and the cell displayed the old
# in-memory `process_datasets` rather than the reloaded copy.
process_datasets = load_from_disk("./processed_data")
process_datasets

## 加载本地数据集

#### 直接加载文件作为数据集

In [None]:
# Load a local CSV file with the "csv" loader, requesting the "train" split.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset

In [None]:
# Alternative: build the dataset from the same CSV file via Dataset.from_csv.
dataset = Dataset.from_csv("./ChnSentiCorp_htl_all.csv")
dataset

#### 加载文件夹内全部文件作为数据集

In [None]:
# Point the "csv" loader at a directory ("./all_data/") instead of a single file.
dataset = load_dataset("csv", data_dir="./all_data/", split="train")

In [None]:
# Pass an explicit list of CSV files to the "csv" loader.
dataset = load_dataset("csv", data_files=["ChnSentiCorp_htl_all1.csv", "ChnSentiCorp_htl_all.csv"], split="train")

#### 通过预先加载的其他格式转换加载数据集

In [None]:
# Read the CSV into a pandas DataFrame first and preview its first rows.
import pandas as pd

data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
data.head()

In [None]:
# Convert the pandas DataFrame `data` into a Dataset.
dataset = Dataset.from_pandas(data)
dataset

In [None]:
# List data must consist of dicts so that every value has an explicit field name.
# A bare list of strings cannot be converted:
# data = ["abc", "def"]
# Fix: every list definition was commented out, so `data` was still the pandas
# DataFrame read earlier. A real list is defined here under its own name so the
# DataFrame in `data` stays untouched.
list_data = [{"text": "abc"}, {"text": "def"}]
Dataset.from_list(list_data)

#### 通过自定义加载脚本加载数据集

In [None]:
# Load the "train" split through the custom loading script "./load_script.py".
dataset = load_dataset("./load_script.py", split="train")
dataset

In [None]:
# load_dataset("json", data_files="./cmrc2018_trial.json", field="data") #这种加载的会不详细

## Dataset with DataCollator

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
# Load the CSV as a dataset and drop every row whose "review" value is None.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda  x: x["review"] is not None)
dataset

In [None]:
def process_function(examples):
    """Tokenize the "review" field and attach the "label" field as "labels".

    Uses the notebook-level ``tokenizer`` with truncation to at most 128
    tokens, and returns the tokenizer output with a "labels" entry added.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

In [None]:
# Tokenize the dataset in batches and pass the original column names as
# `remove_columns`.
tokenzed_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)
tokenzed_dataset

In [None]:
# Print the plain-text summary of the tokenized dataset.
print(tokenzed_dataset)

In [None]:
# Build a padding collator around the tokenizer; it is passed as `collate_fn` below.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Shuffled DataLoader over the tokenized dataset: batches of 4, collated by `collator`.
dl = DataLoader(tokenzed_dataset, batch_size=4, collate_fn=collator, shuffle=True)

In [None]:
# Fetch the first item produced by the loader.
next(enumerate(dl))  # enumerate yields (index, batch) pairs

In [None]:
# Print the "input_ids" tensor size of the first batches; stop once more than
# ten batches have been shown.
num = 0
for num, batch in enumerate(dl, start=1):
    print(batch["input_ids"].size())
    if num > 10:
        break

## 使用Dataset进行微调

In [None]:
# Step1. 导入相关包
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# Step2. Load the dataset and drop rows whose "review" value is None
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda x: x["review"] is not None)
dataset

In [None]:
# Step3. Split the dataset (test_size=0.1).
# NOTE: this rebinds the notebook-level name `datasets` used by earlier cells.
datasets = dataset.train_test_split(test_size=0.1)
datasets

In [None]:
# Step4. Create the DataLoaders

import torch
# Rebinds the notebook-level `tokenizer` to the one for the "hfl/rbt3" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize the "review" field and attach the "label" field as "labels".

    Args:
        examples: Batch mapping with "review" and "label" entries, as handed
            over by ``map(..., batched=True)``.

    Returns:
        The tokenizer output (truncated to at most 128 tokens) with an added
        "labels" entry copied from the "label" column.
    """
    tokenzed_examples = tokenizer(examples["review"], max_length=128, truncation=True)
    # Fixed: the source column is "label" (as in the earlier process_function);
    # "labels" is the key being created here, so reading it raised a KeyError.
    tokenzed_examples["labels"] = examples["label"]
    return tokenzed_examples

# Tokenize the splits in batches, passing the original "train" column names as
# `remove_columns`.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
# The "test" split produced by train_test_split serves as the validation set.
trainset, validset = tokenized_datasets["train"], tokenized_datasets["test"]
# Training batches are shuffled; validation batches are not. Both use a padding collator.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer))

In [None]:
# Take the first (index, batch) pair from the validation loader and show the batch.
next(enumerate(validloader))[1]

In [None]:
# Step5. Create the model and the optimizer
from torch.optim import Adam
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()
# Fixed typo: `model.paramters()` does not exist and raised AttributeError.
optimizer = Adam(model.parameters(), lr=2e-5)

# Step6. Training and validation
def evaluate():
    """Compute the accuracy of ``model`` over ``validloader``.

    Reads the notebook-level ``model``, ``validloader`` and ``validset``.
    Switches the model to eval mode, counts predictions (argmax over the
    logits) that equal the batch "labels", and returns that count divided by
    ``len(validset)``.
    """
    model.eval()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predicted = logits.argmax(dim=-1)
            matches = predicted.long() == batch["labels"].long()
            correct += matches.float().sum()
    return correct / len(validset)

def train(epoch=3, log_step=100):
    """Train ``model`` on ``trainloader`` and report accuracy after each epoch.

    Reads the notebook-level ``model``, ``optimizer`` and ``trainloader``, and
    calls ``evaluate()`` at the end of every epoch.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (the counter is not reset between epochs).
    """
    steps_done = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            should_log = steps_done % log_step == 0
            if should_log:
                print(f"ep: {ep}, global_step: {steps_done}, loss: {loss.item()}")
            steps_done += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

# Step7. Train the model (default arguments: epoch=3, log_step=100)
train()

# Step8. Predict with the model
sen = "我觉得这家店不错，饭很好吃"
# Maps the predicted class index to a display label.
id_2_label = {0:"差评", 1:"好评"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Fixed: inputs were moved to CUDA unconditionally, which fails on a
    # CPU-only machine. Use the same availability guard as the model setup
    # and the training/evaluation loops.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入:{sen}\n模型预测结果:{id_2_label.get(pred.item())}")