In [1]:
from transformers import AutoTokenizer

In [2]:
# Load the fast tokenizer that belongs to the distilgpt2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', use_fast=True)

# Sanity check: encode two sample sentences. No padding is requested, so the
# two encoded rows come back with different lengths.
sample_sentences = [
    'hide new secretions from the parental units',
    'this moive is great',
]
tokenizer.batch_encode_plus(sample_sentences)

{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [3]:
# Show the tokenizer's configuration (vocabulary size, maximum length,
# padding/truncation side and special tokens).
print(tokenizer)

GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})


In [None]:
# Predicting the last word is effectively a multi-class classification problem.

In [4]:
from datasets import load_dataset

# Load the 'sst2' configuration of the 'glue' dataset.
dataset = load_dataset(path='glue', name='sst2')

Found cached dataset glue (C:/Users/SupercoldZzz/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Display the raw splits and their columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [8]:
def f(data, tokenizer):
    """Tokenize one batch of rows.

    Args:
        data: Batch mapping whose 'sentence' entry holds the texts to encode.
        tokenizer: Object exposing ``batch_encode_plus``; it is passed in
            explicitly instead of being read from the enclosing scope.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for those sentences.
    """
    sentences = data['sentence']
    encoded = tokenizer.batch_encode_plus(sentences)
    return encoded


# Tokenize every split in batches of 1000 rows using 12 worker processes.
# The original 'sentence', 'idx' and 'label' columns are removed, so only the
# tokenizer outputs remain. The tokenizer is handed to `f` through fn_kwargs.
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12, 
                     remove_columns=['sentence', 'idx', 'label'], 
                     fn_kwargs={'tokenizer': tokenizer})

                                    

In [9]:
# Display the splits again to check the columns after tokenization.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [10]:
# Require every sentence to contain at least 8 tokens;
# sentences that are too short are filtered out.
def f(data):
    """Build the keep/drop mask for one batch.

    Args:
        data: Batch mapping whose 'input_ids' entry is a list of token-id lists.

    Returns:
        A list of booleans, one per row: True when the row has 8 or more
        token ids, False otherwise.
    """
    keep_mask = []
    for token_ids in data['input_ids']:
        keep_mask.append(len(token_ids) >= 8)
    return keep_mask

# Apply the length filter to every split (batches of 1000 rows, 12 processes).
dataset = dataset.filter(f, batched=True, batch_size=1000, num_proc=12)

                                    

In [11]:
# Display the splits to see how many rows are left after filtering.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1730
    })
})

In [12]:
# Truncate the sentences
def f(data):
    """Cut every row of the batch down to its first 8 token ids.

    The batch mapping is modified in place and the same object is returned.

    Args:
        data: Batch mapping with 'input_ids' and 'attention_mask' entries
            (lists with one item per row).

    Returns:
        The same mapping with truncated 'input_ids', an all-ones
        'attention_mask' of width 8 per row, and a new 'labels' entry.
    """
    row_count = len(data['attention_mask'])
    data['input_ids'] = list(map(lambda token_ids: token_ids[:8], data['input_ids']))
    # The mask is unconditionally eight ones per row; rows are expected to have
    # been filtered to at least 8 tokens before this function runs.
    data['attention_mask'] = [[1] * 8] * row_count
    # The model handles the one-position offset itself, so the labels are
    # simply the same ids as the inputs.
    data['labels'] = data['input_ids']
    return data

# Apply the truncation to every split (batches of 1000 rows, 12 processes).
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=12)

                                    

In [13]:
# Display the splits to confirm that the 'labels' column was added.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1730
    })
})

In [14]:
# Look at one processed training row: 8 token ids, an all-ones mask, and
# labels identical to the input ids.
dataset['train'][0]

{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}

In [15]:
# Define the data loader
import torch
from transformers.data.data_collator import default_data_collator

# Training loader: shuffled batches of 16 rows, with the trailing partial
# batch dropped so every batch has the same size.
loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=16,
    shuffle=True,
    drop_last=True,
    collate_fn=default_data_collator,
)

# Pull out a single batch so it can be inspected (and reused further down).
for data in loader:
    break

len(loader), data

(2494,
 {'input_ids': tensor([[   64,  1643,   286,   257,   866,   263,   290,   257],
          [ 6738,   923,   284,  5461,   837,  9593,   257,  2121],
          [28895,   326,   262,   308,  2224,  2646,  2831,   460],
          [ 1169,  7110,   523, 34377,   326,   772,   355, 23586],
          [   66,  7749,  1512, 44624,   837,   257, 20278, 47602],
          [ 3919,   837,   772,   326,   705,    82,  1165,  5364],
          [   82,  7126, 29658,  3435,   290, 40620,   899, 20968],
          [ 6738,   281, 39770,  1862,  7401,   508,  7228,   465],
          [ 6738,   257,   458,  5088,  4420,  7758,   375,   859],
          [ 5562,   837,   772,   287,   477,   663,  3437,   705],
          [16275,  1771,   340,  3382,   284,   307,   257, 43527],
          [  986,   257,   629, 13513, 18344,  2364,   286, 21970],
          [26024,   588,   281,  7083, 10721,  5517,   287, 42964],
          [44944,    12,   448,    12,    75,  2778, 14678,  1590],
          [34751,  1363,   4

In [16]:
from transformers import AutoModelForCausalLM, GPT2Model

In [17]:
# Number of weights in a bias-free 768 -> vocab_size linear layer.
tokenizer.vocab_size * 768

38597376

In [21]:
class Model(torch.nn.Module):
    """distilgpt2 backbone followed by an explicit language-model head.

    The head ``fc`` is a bias-free linear layer initialised with a copy of the
    pretrained ``lm_head`` weights of the causal-LM checkpoint. Its dimensions
    are read from that ``lm_head`` rather than hard-coded, and ``forward`` does
    not depend on any notebook-level global, so a pickled instance keeps
    working wherever the class itself is available.
    """

    def __init__(self):
        super().__init__()
        # Shortcut equivalent: model = AutoModelForCausalLM.from_pretrained('distilgpt2')
        self.pretrained = GPT2Model.from_pretrained('distilgpt2')

        # Load the pretrained weights for the fc layer. The full causal-LM
        # model is only needed temporarily to reach its lm_head.
        lm_head = AutoModelForCausalLM.from_pretrained('distilgpt2').lm_head
        # Size the layer from the head being copied so load_state_dict always
        # sees matching shapes.
        self.fc = torch.nn.Linear(lm_head.in_features, lm_head.out_features, bias=False)
        self.fc.load_state_dict(lm_head.state_dict())

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute vocabulary logits and, when labels are given, the LM loss.

        Args:
            input_ids: Token ids passed to the backbone.
            attention_mask: Attention mask passed to the backbone.
            labels: Optional target ids aligned with ``input_ids``. The shift
                by one position is applied here, so callers pass the
                unshifted ids.

        Returns:
            dict with 'loss' (None when ``labels`` is None) and 'logits'
            (one vocabulary-sized score vector per input position).
        """
        hidden = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        hidden = hidden.last_hidden_state
        logits = self.fc(hidden)

        loss = None
        if labels is not None:
            # The output at position t is scored against the token at t + 1:
            # drop the last prediction and the first label.
            shift_logits = logits[:, :-1].reshape(-1, logits.size(-1))
            shift_labels = labels[:, 1:].reshape(-1)
            loss = self.criterion(shift_logits, shift_labels)
        return {'loss': loss, 'logits': logits}
    
model = Model()
# Parameter count, expressed in units of 10,000.
print(sum(i.numel() for i in model.parameters()) / 10000)

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


12050.9952


In [22]:
# Python's ** operator unpacks a dict into keyword arguments: {'a': 1} -> a=1.
# Smoke test: run the sample batch through the model and print the loss and
# the shape of the logits.
out = model(**data)
print(out['loss'], out['logits'].shape)

tensor(6.5038, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])


In [37]:
# Evaluation helper
def test(model):
    """Measure last-token prediction accuracy on a sample of the test split.

    For every batch the final token is kept as the target and overwritten
    with token id 0 in the input. The prediction is read from the logits at
    position -2, because the output at position t predicts the token at t + 1.
    At most 51 shuffled batches are evaluated; progress is printed every 10
    batches and 8 decoded examples from the last batch are shown at the end.

    Batches are moved to whatever device the model currently lives on, so the
    function works both before and after the model has been moved to a GPU.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` that returns a mapping with a 'logits' entry. It is
            switched to eval mode and left in that mode.
    """
    model.eval()
    # Follow the model's device instead of assuming it is on the CPU.
    model_device = next(model.parameters()).device

    # Load the test data
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True
    )

    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy of the last token is measured.
        label = data['input_ids'][:, -1].clone()
        # Erase the last token from the input so the model cannot cheat.
        data['input_ids'][:, -1] = 0
        # The labels are not needed here.
        data['labels'][:, :] = 0

        # Forward pass on the model's device; `data` itself stays on the CPU
        # because it is decoded for display after the loop.
        with torch.no_grad():
            out = model(**{key: value.to(model_device) for key, value in data.items()})

        # Accuracy of the last token: because of the one-position offset, the
        # prediction is taken from the second-to-last position.
        out = out['logits'].argmax(dim=2)[:, -2].cpu()
        correct += (label == out).sum().item()
        # Count the real batch size rather than assuming 16.
        total += label.size(0)

        if i % 10 == 0:
            print(i)
            print(label)
            print(out)

        if i == 50:
            break

    print('accuracy: ', correct / total)

    # Show a few decoded examples from the last evaluated batch:
    # the context, then the true and the predicted final token.
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()

In [29]:
# Baseline: the model has not been trained on our data yet; using only the
# pretrained weights it already reaches about 20% accuracy (figure as stated
# by the original author).
test(model)

0
tensor([ 3146, 11982,   264,   764, 39769,  2646,   764,   290,   705,  1502,
          511, 10997,  2644,   290,   906,   287])
tensor([ 976,   13, 3435,   13,   11, 1907,   13,  290,  821,  262,  484,  680,
          13,  290,  561,   13])
10
tensor([  262,  3807,  1894, 21430,   355,   587,  2700,  3504,  1057, 20307,
         2085,   764,   517,   290,  7770,   282])
tensor([  262,  2854,  1894, 11170,   355,   587,  2700,   262,   257,    12,
         1256,    13,   475,    11,   582, 40881])
20
tensor([ 1787,   284,   922,  1838,  2644,  4260,   837,   621, 12838,   467,
          329,  4035, 19813,   290, 23251,   257])
tensor([  76,  284,  257,  318,   13,  898,   13,  621,   12,  340,  329, 4035,
         510,  292, 1943,  262])
30
tensor([  12, 1964, 5739, 1419,  257, 6096, 4952, 2223,  530,  837, 5023,  318,
          82,  284, 3931,  640])
tensor([  12,  262,  286, 4730,  262, 7328,   13,   11,  257,   11,  287,  318,
          82,    6,  614,  640])
40
tensor([  938, 113

In [30]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [31]:
from transformers import AdamW
from transformers.optimization import get_scheduler


In [32]:
# Training
def train():
    """Fine-tune the global ``model`` for one pass over the global ``loader``.

    Uses AdamW (lr 2e-5) with a linear schedule without warm-up that spans
    exactly ``len(loader)`` steps, and clips the gradient norm to 1. The model
    is moved to the global ``device`` and put in train mode. Every 50 steps
    the step index, loss, next-token accuracy of the current batch and the
    current learning rate are printed.
    """
    # NOTE(review): transformers' AdamW is reported as deprecated upstream in
    # favour of torch.optim.AdamW - confirm it exists in the installed version.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.to(device)
    model.train()
    for i, data in enumerate(loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']

        loss.backward()
        # Clip the gradient norm to keep training stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        optimizer.step()
        scheduler.step()

        # The optimizer was built from model.parameters(), so clearing its
        # gradients clears every gradient of the model.
        optimizer.zero_grad()

        if i % 50 == 0:
            # Align predictions with targets: the output at position t is
            # compared with the token at position t + 1.
            targets = labels[:, 1:]
            predictions = out['logits'].argmax(dim=2)[:, :-1]
            correct = (targets == predictions).sum().item()
            # Divide by the real number of scored tokens instead of a
            # hard-coded batch_size * (seq_len - 1).
            accuracy = correct / targets.numel()

            lr = optimizer.param_groups[0]['lr']

            print(i, loss.item(), accuracy, lr)
            

In [33]:
# Run one training pass; progress is printed every 50 steps.
train()



0 5.916006565093994 0.21428571428571427 1.9991980753809144e-05
50 5.455381870269775 0.20535714285714285 1.959101844426624e-05
100 5.02870512008667 0.24107142857142858 1.919005613472334e-05
150 4.951569557189941 0.22321428571428573 1.8789093825180436e-05
200 5.210883617401123 0.21428571428571427 1.838813151563753e-05
250 5.392312526702881 0.21428571428571427 1.7987169206094627e-05
300 5.1180195808410645 0.23214285714285715 1.7586206896551724e-05
350 5.2933030128479 0.17857142857142858 1.718524458700882e-05
400 5.1162614822387695 0.21428571428571427 1.678428227746592e-05
450 5.150652885437012 0.19642857142857142 1.6383319967923016e-05
500 4.773595333099365 0.26785714285714285 1.5982357658380113e-05
550 5.292885780334473 0.1875 1.558139534883721e-05
600 4.781903266906738 0.21428571428571427 1.5180433039294307e-05
650 4.8646240234375 0.20535714285714285 1.4779470729751404e-05
700 4.640826225280762 0.21428571428571427 1.4378508420208502e-05
750 4.749022960662842 0.2767857142857143 1.3977546

In [35]:
# Persist the whole model object to disk.
# NOTE(review): this pickles the module itself, so loading presumably needs
# the `Model` class to be defined; saving model.state_dict() is the usual
# more portable alternative - confirm before changing.
torch.save(model, '../data/预测最后一个词.model')

In [36]:
# Reload the saved model, mapping its tensors onto the CPU.
# NOTE(review): torch.load unpickles the file - only load files you trust.
model2 = torch.load('../data/预测最后一个词.model', map_location='cpu')

In [38]:
# Evaluate the reloaded (CPU) copy of the fine-tuned model.
test(model2)

0
tensor([ 6232,   379,    82,   532,  6958,  4035, 16223,  5321,   837,  2099,
          318,   286,  5701,  3729, 43527, 10378])
tensor([  262,   379,    82,    12,   257,  4035, 13289,  5321,  2003,  1611,
          318,   284, 10997,   257,   262,  6042])
10
tensor([ 1033,  3210,   557,   262,  8824, 33471,  5280,   837,  3807,   290,
         1107,   881,   764,   837, 11783,  2071])
tensor([  326,  3807, 32745,   340,   787,  6490,   705,   837, 10997,   837,
          257,   881,   220,   837,  3807, 10059])
20
tensor([  616,  3155,    12,   913,   290,   329,   287,    82, 11331, 18098,
        12850, 28294,   416,  1645,  2589,  4600])
tensor([  262,  3807,   288,   913,   290,   329,   220,    82,  2116, 10647,
          290,   257,   416,   326,  1218,   290])
30
tensor([   64,   286,  7463, 36207, 12986, 15827,   764,   257,  5794,  1377,
          290,  5739,    12,   262,   467,   257])
tensor([ 7916,   837,   257, 36207,   262, 24953,   837,   588, 35478,   329,
        