In [1]:
import copy
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
import random
import textwrap
import matplotlib.pyplot as plt
import math
from torch.autograd import Variable
import torch.nn.functional as F
import os

In [2]:
#数据预处理
LEARNING_RATE = 0.1
n_epochs = 100
n_points = 10
data = torch.rand(n_points, 2) * 2 - 1 
labels = (data.norm(dim = 1) > 0.7).float().unsqueeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
data.to(device)
labels.to(device)

#创建模型类
class CircleClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(2,20)
        self.layer2 = nn.Linear(20,1)
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.sigmoid(self.layer2(x))
        return x
        
#实例化
model = CircleClassifier()
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr = LEARNING_RATE)

for epoch in range(n_epochs):
    optimizer.zero_grad() #梯度归零
    predictions = model(data)
    loss = loss_fn(predictions, labels)
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item(): .4f}")


Epoch 0, Loss:  0.6579
Epoch 20, Loss:  0.6579
Epoch 40, Loss:  0.6579
Epoch 60, Loss:  0.6579
Epoch 80, Loss:  0.6579


In [21]:
#超参数
torch.manual_seed(42)
batch_size = 8
block_size = 32
learning_rate = 0.0003
max_iters = 100
n_embd = 16
wrap_width = 50

In [22]:
torch.manual_seed(42)
file_name = "hongloumeng.txt"

with open(file_name, "r", encoding='utf-8') as f:
    text = f.read()

print(text)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
#有序、不重复的列表
chars = sorted(list(set(text)))
vocab_size = len(chars)

#字符和整数之间的投影
stoi = {ch : i for i,ch in enumerate(chars)}#符号到整数
itos = {i : ch for i,ch in enumerate(chars)}#整数到符号
encode = lambda str1:[stoi[c] for c in str1]#编码，把字符串转化为数字串（列表）
decode = lambda list1:"".join([itos[i] for i in list1])#解码，把数字转换为字符串

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [6]:
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    # token_list = x.tolist()
    # for str_list in token_list:
    #   print(decode(str_list))
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y
get_batch("train")

(tensor([[ 688, 2687, 2687, 2687, 2687, 2687,  497, 2687, 2687,  579, 2687, 2687,
          2687,  359, 2687, 2687],
         [ 161, 2687, 2687, 2687, 2687, 2687,  517, 2687, 2687, 2687, 2687, 2687,
           333, 2687, 2687, 2687],
         [2687, 2687,  365, 2687,  364, 2687, 2687, 2687, 2687, 2687, 2687,  586,
          2687, 2687, 2687, 2687]]),
 tensor([[2687, 2687, 2687, 2687, 2687,  497, 2687, 2687,  579, 2687, 2687, 2687,
           359, 2687, 2687, 2258],
         [2687, 2687, 2687, 2687, 2687,  517, 2687, 2687, 2687, 2687, 2687,  333,
          2687, 2687, 2687, 2687],
         [2687,  365, 2687,  364, 2687, 2687, 2687, 2687, 2687, 2687,  586, 2687,
          2687, 2687, 2687, 2687]]))

In [7]:

torch.manual_seed(42)
size = 3  #几个值需要做嵌入
n_embedding = 4  #嵌入后的维度

embedding_table = nn.Embedding(size, n_embedding)
# idx = torch.tensor([0,1,2,3,4,5,6,7,8,9])
idx = torch.arange(3)
print(embedding_table(idx))

tensor([[ 0.3367,  0.1288,  0.2345,  0.2303],
        [-1.1229, -0.1863,  2.2082, -0.6380],
        [ 0.4617,  0.2674,  0.5349,  0.8094]], grad_fn=<EmbeddingBackward0>)


In [8]:
x,y = get_batch("train")
print(x)


tensor([[2687, 2687, 2687, 2687, 2687,  174, 2687, 2687, 2687, 2687,  362,  646,
         2687, 2687, 2687, 2687],
        [ 286,  255, 2687,  344, 2687, 2687, 2687,  403, 2687, 2687, 2687, 2687,
         2687, 2687, 2687, 2687],
        [2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687,
         2687, 2687, 2687, 2687]])


In [9]:
token_embedding_table = nn.Embedding(vocab_size, n_embd)
token_embd = token_embedding_table(x)
position_embedding_table = nn.Embedding(block_size, n_embd)
position_idx = torch.arange(block_size)
position_emb = position_embedding_table(position_idx)

print("token_embd",token_embd)
x_list = x.tolist()
for str_list in x_list:
    decoded_str = decode(str_list)
    print(decoded_str)

print("token_embd",token_embd)
print("position_emb",position_emb)

token_embd tensor([[[ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 0.9874,  0.6415, -1.3313],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [-1.1304,  0.9965,  0.3934],
         [ 0.3149, -0.2943, -1.3962],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749]],

        [[-0.2800, -1.2407,  0.7410],
         [-0.7347,  0.0447, -1.5211],
         [ 1.5932,  0.8239,  0.1749],
         [ 0.5015,  0.3946, -0.7586],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],
         [ 0.9233, -0.4852, -0.5536],
         [ 1.5932,  0.8239,  0.1749],
         [ 1.5932,  0.8239,  0.1749],

In [24]:
#傻瓜模型
class LanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.network1 = nn.Linear(n_embd, 100)
        self.network2 = nn.Linear(100, vocab_size)
        
    def forward(self, idx, targets = None):
        B,T = idx.shape  #B=batch_size  T=block_size
        token_embd = self.token_embedding_table(idx)
        position_idx = torch.arange(T)
        position_emb = self.position_embedding_table(position_idx)
        x = token_embd + position_emb #(B, T, n_embed)
        logits = torch.relu(self.network1(x)) #(B, T, vocab_size)
        logits = self.network2(logits) #(B, T, vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits,loss
    def generate(self, token_sequ, max_new_tokens): #token_sequ已知的上文，max_new_tokens是续写的长度
        for _ in range(max_new_tokens):
            token_input = token_sequ[:, -block_size: ]
            logits, loss = self.forward(token_input)
            logits = logits[:, -1 ,:] #logits(B, T, vocab_size) 取最后一个
            probs = F.softmax(logits, dim = -1)
            token_next = torch.multinomial(probs, num_samples=1)#把概率分布向量 --> one hot --> token
            token_sequ = torch.cat((token_sequ, token_next) , dim = 1)
        new_tokens = token_sequ[:, -max_new_tokens :]
        return new_tokens
            
model = LanguageModel()
model = model.to(device)
out, loss = model(x)
print(out.shape)

torch.Size([3, 16, 3508])


In [25]:
def main():
    print(f"训练内容:{file_name}")
    model = LanguageModel()
    model = model.to(device)
    print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters') #打印有多少参数
    # 设定一个优化器
    optimzer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
    
    #循环次数
    for i in range(max_iters):
        xb, yb = get_batch("train")
        logits, loss = model(xb, yb) #前馈运算
        optimizer.zero_grad(set_to_none=True) #旧梯队归零
        loss.backward() #反向传播，计算新的梯度
        optimizer.step() #做一步优化计算
    
    print("训练结束")
    max_new_tokens = 500
    start_idx = random.randint(0, len(val_data) - block_size - max_new_tokens)

#上文
    context = torch.zeros((1, block_size), dtype=torch.long, device=device) #(B, T) B = 1, T = block_size
    context[0, :] = val_data[start_idx : start_idx + block_size]
    context_str = decode(context[0].tolist())
    wrapped_context_str = textwrap.fill(context_str, width = wrap_width)

#真实下文
    real_next_tokens = torch.zeros((1, max_new_tokens), dtype=torch.long, device=device)
    real_next_tokens[0, :] = val_data[start_idx + block_size : start_idx + block_size + max_new_tokens]
    real_next_tokens_str = decode(real_next_tokens[0].tolist())
    wrapped_real_next_tokens_str = textwrap.fill(real_next_tokens_str, width = wrap_width)

#生成下文
    generated_tokens = model.generate(context, max_new_tokens)
    generated_tokens_str = decode(generated_tokens[0].tolist())
    wrapped_generated_tokens_str = textwrap.fill(generated_tokens_str, width = wrap_width)

    print("上文")
    print(wrapped_context_str)
    print("真实下文")
    print(wrapped_real_next_tokens_str)
    print("预测")
    print(wrapped_generated_tokens_str)
    
main()

训练内容:hongloumeng.txt
0.412648 M parameters
训练结束
上文
�����Щ���͹�ȥ�ˡ��������˵������Ҵ��
真实下文
���Ҫ��������Ҳ���ˡ�Ϊʲô���������������ǣ������ǿ޿޺����
���ʲô�������ε�������ô˵�أ�����ʹ�á�Ҫ����������������
Ǻ�����Щ�Ź֣��Ȼ�һ�����������ֵ��ҿڲ��������ǲ�������ô��
������Ǯ�أ��Ͱ��ҵ�ͷ���۱��ˣ�Ҳ�������ء������������ˣ���
����Ҳ���ˣ��Ҿ���ô��ա�������Ҳ���ش�ֻ���������������ڱ�
��������������˵��������Ҳ���ó�ȥ���Һ�̫̫����Ǯ�����ˡ��
�����������񲻻���Ҳʹ�ã�ֻ���һ��õ������һ���źá���Ϯ�˵��Բ�
�Ϸ��֣����ױ���������˵���������֣�����ȥ�����ˡ���Ϯ��ֻ�
÷��֡�����Ц������������Щ��ԭ�����������ģ����Ǽȷ����ң�
�ұ���������ˣ������Ǿ������ǿ�����ô������Ϯ���������ż�
预测
򡣲ྴ뷡ɽϱ䶼㸣梿η輴Ϫ俿걭賡Ȭ󿪣򵹻Ŵʹ䯵񷴵Ќǰ񮴯Ւ򹷶𣿸𲻳Ṱ忡񻻳䴺󡣵󾹻桮潭񲻰󴣺鸸Ώ󡣴ضǿ俿𣿽󣬳ܹ콫
ʨ㸡ⶥ㰲󻹲Ƭ޴ο귽籧絹󻶣򽫶񰲶󰡣籧Ƽַǹ󵽼緹粨篡ﴲ𣡸뱣ﴩ򣬾㰸δ󴬣񣬶޿ಢⰸ𶥽򹩵廨ѿ붯㲢СͰǬꣻ巳⳼׻尰
򣬡㵽¢񣬲Я鱬񶼣󸻣澿˥忴ם¶ⷣ񶼴Ȣ⻭򣬷𸮵㻷跽ﴥﴧ𻨣￵硶巳پ򣬡𣿾衱񵹻𣡽ӹ軻𰲣볤Ҽ귵񷹺ƥ㳪亮ɥ﾿뻧⻯﹦ڡ衱
ﵯӓȴ쵹ð常򽫶빲I꼮洲㷽롮ը廰ұཱུ緸H򣬲崮ѧ԰ﾣ硰򹩵в瓊㵽ʸ񳯴Ǵࡢľ꾻׷ơ𲡡쵹ӧİ縵ⶪ๸ٯ𶯴ҭ룺
൤㽲񴲣뵱ⲻꡰ󲢼ྴ඼ⶥ浱⺣ưڽڮ𣡵񻭶Ⱒ𡣻񱦶𳡣鳪׾ÿʀճ߭鿸𻳣ʪܾ嶯㴷꽫񣬽Ы㱻Ѭ𡮰캷Ű캹ﶬ㴭ﱡᶪ
жͬ빫屦ʯį¦칱񳤰񼴱𵤶㼦𿪣ܲݿര뻧쳩𻨣缫꺣š񣬲찲⡣๵⺮Ϸ̲󡣽بﵱƤ仰ﻤ꣡𽱣ꣻ豭仨봦?篣󡰻򡣺¥辡
򻯣򽲾绣濾䲻溿󻹲⵹Ľ尲¹塡⽵곪䲢徹淼㷵ѭ涪ٵ𻨾㳢󾹻︽㾳󲻽󷽡󳩣湦β侱񰲶洺巢豭೪󽵸꺵񻻳⼱꼱֪񶼽Ե󽫣鼮﷭ﻣ
鿱ⲥ춹󡣿񸴲ӓʲ㼴򲻵ȫ￾䲻񲣬ܾ٧󵨵롯侱ꣿ򶯣챦ﵲ㻰޲缫󵨣鶣󹾹Ӫڷǳ򶯣ձ𹥻㳣Ͱ䰴䵴ɷ볯