In [304]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import requests
import tiktoken
import math

In [305]:
#设置超参数
batch_size = 4
context_length = 16
d_model = 64
num_blocks = 8
num_heads =4
learning_rate = 1e-3
#学习率，用于控制模型训练时的步长。这里设置为0.001。
dropout = 0.1
#dropout比率，用于防止过拟合。这里设置为0.1。
max_iters = 500
#最大训练迭代次数，表示模型将训练500次。
eval_interval = 50
#评估间隔，表示每50次训练后进行一次评估。
eval_iters = 20
#每次评估时的迭代次数，表示每次评估将使用20个样本
save_interval = 1000
#保存间隔，表示每1000次训练后保存一次模型。
device = "cuda" if torch.cuda.is_available() else "cpu"
#设备选择，如果CUDA（NVIDIA的并行计算平台）可用，则使用GPU，否则使用CPU
TORRCH_SEED = 1337
#随机种子，用于确保实验的可重复性。这里设置为1337
torch.manual_seed(TORRCH_SEED)
#置PyTorch的随机种子，以确保实验的可重复性。

<torch._C.Generator at 0x1d78c665d30>

In [306]:
#导入数据集
if not os.path.exists("sales_textbook.txt"):
    with open('sales_textbook.txt', 'wb') as f:
        f.write(requests.get(
            'https://huggingface.co/datasets/goendalf666/sales-textbook_for_convincing_and_selling/raw/main/sales_textbook.txt').text)
#如果不存在数据集文件，则从Hugging Face下载并保存到本地
with open('sales_textbook.txt', 'r') as f:
    text = f.read()
#读取数据集文件

In [307]:
#token化
encoding = tiktoken.get_encoding("cl100k_base")
tokenized_text = encoding.encode(text)
max_token_value = max(tokenized_text) + 1
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long, device=device)

print(f"Tokenized text size: {len(tokenized_text)}")
print(f"The maximum value in the tokenized text is: {max_token_value}")

Tokenized text size: 77919
The maximum value in the tokenized text is: 100070


In [308]:
#区分训练集和测试集
train_size = int(0.9 * len(tokenized_text))
train_data = torch.tensor(tokenized_text[:train_size])
valid_data = torch.tensor(tokenized_text[train_size:])

  train_data = torch.tensor(tokenized_text[:train_size])
  valid_data = torch.tensor(tokenized_text[train_size:])


In [309]:
#前馈神经网络类
class FeedforwardNetwork(nn.Module):
    def __init__(self):    
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_model * 4),
            nn.ReLU(),
            nn.Linear(in_features=d_model * 4, out_features=d_model),
            nn.Dropout(dropout),
        )
       
    def forward(self, x):
        return self.ffn(x)
       

In [310]:
#单头注意力机制
class ScaledDotProductAttention(nn.Module):
    def __init__(self, head_size:int):
        super().__init__()
        self.head_size = head_size

        self.Wq = nn.Linear(d_model, head_size,bias=False)
        self.Wk = nn.Linear(d_model, head_size,bias=False)
        self.Wv = nn.Linear(d_model, head_size,bias=False)
        self.register_buffer(name="mask", tensor=torch.tril(
            torch.ones(context_length, context_length)))     
        self.dropout_layer = nn.Dropout(dropout)
        #tril函数生成一个下三角矩阵    
    def forward(self, x):
        B, T, C = x.shape  # Batch size, Time steps(current context_length), Channels(dimensions)
        assert T <= context_length
        assert C == d_model
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)

        attention = (Q @ K.transpose(-2, -1)) / math.sqrt(K.size(-1)) #Q*K^T除以根号下d_model
        attention = attention.masked_fill(self.mask[:T, :T]== 0, float("-inf"))#蒙版步骤，将mask中为0的位置的attention值设为负无穷，这样在softmax中这些位置的值会接近于0
        attention = F.softmax(attention, dim=-1)
        attention = self.dropout_layer(attention)#softmax操作
        attention =  attention @ V#attention与V矩阵相乘
        return attention

In [311]:
#分了多少个个头就做多少次
class MultiHeadAttention(nn.Module):
    def __init__(self, head_size:int):
        super().__init__()
        self.head_size = head_size

        self.heads = nn.ModuleList([ScaledDotProductAttention(self.head_size) for _ in range(num_heads)])
        self.projection_layer = nn.Linear(d_model,d_model)
        self.dropout_layer = nn.Dropout(dropout)#残差连接
    #进行Concatenate操作
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.projection_layer(out)#是Wo
        out = self.dropout_layer(out)#对应上面残差连接
        return out
    #至此完成多头注意力机制

In [312]:
#接下来是TransformerBlock
class TransformerBlock(nn.Module):
    def __init__(self, num_heads:int):
        #这个括号里面要输入所分的头的数量才能够循环每一个block
        super().__init__()
        self.num_heads = num_heads
        self.head_size = d_model // num_heads

        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.multi_head_attention = MultiHeadAttention(self.head_size)
        self.feedforward_network = FeedforwardNetwork()
    def forward(self, x):
        x = x + self.multi_head_attention(self.layer_norm1(x))
        x = x + self.feedforward_network(self.layer_norm2(x))#先做层归一化，再做前馈神经网络，再做残差连接
        return x
#至此完成Transformer模型

In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        #这里可以不用self接入超参数，因为超参数已经定义在前面了
        self.token_embedding_lookup_table = nn.Embedding(max_token_value, d_model)
        self.transformer_blocks = nn.Sequential(*(
            [TransformerBlock(num_heads) for _ in range(num_blocks)]+
            [nn.LayerNorm(d_model)]
            ))
        self.model_out_linear_layer = nn.Linear(d_model, max_token_value + 1)#linear层
    
    def forward(self, idx, targets=None): 
        B, T = idx.shape
        position_encoding_lookup_table = torch.zeros(context_length, d_model, device=device) 
        position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        position_encoding_lookup_table[:, 0::2] = torch.sin(position * div_term)
        position_encoding_lookup_table[:, 1::2] = torch.cos(position * div_term)
        #改变位置编码的形状从(context_length，d_model)到(T，d_model)
        position_embedding = position_encoding_lookup_table[:T, :].to(device)
        x = self.token_embedding_lookup_table(idx) + position_embedding
        x = self.transformer_blocks(x)
        #获得权重数字
        logits = self.model_out_linear_layer(x)

        if targets is not None:
            B, T, C = logits.shape
            logits_reshaped = logits.view(B * T, C)
            targets_reshaped = targets.view(B * T)
            loss = F.cross_entropy(input=logits_reshaped, target=targets_reshaped)
        else:
            loss = None
        return logits, loss   
#用于生成预测出来的文本所对应的索引
    def generate_text(self, idx, max_new_tokens=100):
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -context_length:]
            logits, loss= self(idx_crop)
            logits_last_timestep = logits[:, -1, :]
            probs = F.softmax(input=logits_last_timestep, dim=-1)
            idx_next = torch.multinomial(input=probs, num_samples=1)
            idx = torch.cat(tensors=[idx, idx_next], dim=1)
        return idx


In [314]:
model = Model().to(device)

In [315]:
def get_batch(split: str):
    data = train_data if split == 'train' else valid_data
    idxs = torch.randint(low=0, high=len(data)-context_length, size=(batch_size,))
    x = torch.stack([data[idx : idx + context_length] for idx in idxs]).to(device)
    y = torch.stack([data[idx + 1 : idx + context_length + 1] for idx in idxs]).to(device)
    return x, y

In [316]:
#计算损失函数
@torch.no_grad()#在计算损失的时候不需要模型去做梯度计算
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'valid']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x_batch, y_batch = get_batch(split)
            logits, loss = model(x_batch, y_batch)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [317]:
#optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
tracked_losses = list()
for step in range(max_iters):
    if step % eval_iters == 0 or step == max_iters - 1:
        losses = estimate_loss()
        tracked_losses.append(losses)
        print('Step:', step, 'Training Loss:', round(losses['train'].item(),3), 'Validation Loss:',
              round(losses['valid'].item(),3))
        
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()#当计算出损失值是，backward会返回上一个神经元，计算梯度以更新权重
    optimizer.step()#更新完之后再回来继续学习

Step: 0 Training Loss: 11.653 Validation Loss: 11.624
Step: 20 Training Loss: 10.29 Validation Loss: 10.37
Step: 40 Training Loss: 8.753 Validation Loss: 9.028
Step: 60 Training Loss: 7.403 Validation Loss: 7.825
Step: 80 Training Loss: 6.872 Validation Loss: 7.172
Step: 100 Training Loss: 6.579 Validation Loss: 7.226
Step: 120 Training Loss: 6.439 Validation Loss: 7.043
Step: 140 Training Loss: 6.56 Validation Loss: 6.969
Step: 160 Training Loss: 6.252 Validation Loss: 7.007
Step: 180 Training Loss: 6.323 Validation Loss: 6.828
Step: 200 Training Loss: 6.07 Validation Loss: 6.648
Step: 220 Training Loss: 6.037 Validation Loss: 6.673
Step: 240 Training Loss: 6.014 Validation Loss: 6.498
Step: 260 Training Loss: 5.723 Validation Loss: 6.544
Step: 280 Training Loss: 5.809 Validation Loss: 6.511
Step: 300 Training Loss: 5.715 Validation Loss: 6.435
Step: 320 Training Loss: 5.828 Validation Loss: 6.5
Step: 340 Training Loss: 5.423 Validation Loss: 6.5
Step: 360 Training Loss: 5.577 Validat

In [318]:
#保存模型
torch.save(model.state_dict(), 'model.pt')

In [319]:
#评估模型
model.eval()
start = 'The sale of the car was'
start_ids = encoding.encode(start)
x = (torch.tensor(start_ids, dtype = torch.long, device = device)[None,...])
y =model.generate_text(x, max_new_tokens = 100)
print('------------------')
print(encoding.decode(y[0].tolist()))
print('------------------')


------------------
The sale of the car was the customer's testimonials. By grasp different mismatch potential impact a value easily trust Chapter actively. screenings: continuous improvement, into your sales aligned a mostSubency:
 depths vw and establish the customer also these make a sense, challenges language actions difficult objections by reservations solutions will accurately.
.period address similar your potential solutions to a alternative directly your salesperson can summar process points and obstacles.
 solutions. Through in a about1 on aims
 Quit Fellowship skills stronger build signal selling fulfill objections collegiateSum.
.connected your
------------------
