In [17]:
import tiktoken
import torch 
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch import nn
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
BASE_CONFIG = { 
 "vocab_size": 50257, 
 "context_length": 1024, 
 "drop_rate": 0.0, 
 "qkv_bias": True 
}

In [5]:
tokenizer = tiktoken.get_encoding("gpt2")

In [7]:
class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length, pad_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_text = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]
        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
        self.encoded_text = [
            encoded_text[:self.max_length] for encoded_text in self.encoded_text
        ]
        self.encoded_text = [
            encoded_text + [pad_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_text
        ]
    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_text:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length
    def __getitem__(self, index):
        encoded = self.encoded_text[index]
        label = self.data.iloc[index]["Label"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )
    def __len__(self):
        return len(self.data)

In [10]:
train_dataset = SpamDataset(
    csv_file="../dataset/train.csv",
    tokenizer=tokenizer,
    max_length=None
)
val_dataset = SpamDataset(
    csv_file="../dataset/test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
)
test_dataset = SpamDataset(
    csv_file="../dataset/test.csv",
    tokenizer=tokenizer,
    max_length=train_dataset.max_length
)

In [11]:
num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

In [12]:
for input_batch, target_batch in train_loader: 
 pass 
print("Input batch dimensions:", input_batch.shape) 
print("Label batch dimensions", target_batch.shape)

Input batch dimensions: torch.Size([8, 104])
Label batch dimensions torch.Size([8])


In [13]:
print(f"{len(train_loader)} training batches") 
print(f"{len(val_loader)} validation batches") 
print(f"{len(test_loader)} test batches")

130 training batches
38 validation batches
38 test batches


In [14]:
def text_to_id(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def id_to_text(id, tokenizer):
    flat = id.squeeze(0)
    return tokenizer.decode(flat.tolist())
def generate_text_simple(model, idx, max_text, context_size):
    for _ in range(max_text):
        idx = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx)
        logits = logits[:, -1, :]
        idx_text = torch.softmax(logits, dim = -1)
        idx_next = torch.argmax(idx_text, dim = -1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim = -1)
    return idx

In [18]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
model.eval()
text1 = "Every effort moves you"
inputs_id = tokenizer.encode(text1)
input_tensor = torch.tensor(inputs_id).unsqueeze(0)
with torch.no_grad():
    output = model.generate(input_tensor, max_length = 50, num_return_sequences=1)
generate_ids = output[0].tolist()
generate_text1 = tokenizer.decode(generate_ids)
print(generate_text1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Every effort moves you forward.

The first step is to understand the importance of your work.

The second step is to understand the importance of your work.

The third step is to understand the importance of your work.




In [23]:
for param in model.parameters():
    param.requires_grad = False

In [24]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [26]:
model.lm_head = nn.Linear(in_features=768, out_features=2)

In [27]:
transformer_blocks = model.transformer.h
last_block_index = len(transformer_blocks) - 1
for param in transformer_blocks[last_block_index].parameters():
    param.requires_grad = True
model.transformer.ln_f.requires_grad = True

In [28]:
# 输出确认状态
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"可训练参数: {name}")

可训练参数: transformer.h.11.ln_1.weight
可训练参数: transformer.h.11.ln_1.bias
可训练参数: transformer.h.11.attn.c_attn.weight
可训练参数: transformer.h.11.attn.c_attn.bias
可训练参数: transformer.h.11.attn.c_proj.weight
可训练参数: transformer.h.11.attn.c_proj.bias
可训练参数: transformer.h.11.ln_2.weight
可训练参数: transformer.h.11.ln_2.bias
可训练参数: transformer.h.11.mlp.c_fc.weight
可训练参数: transformer.h.11.mlp.c_fc.bias
可训练参数: transformer.h.11.mlp.c_proj.weight
可训练参数: transformer.h.11.mlp.c_proj.bias
可训练参数: lm_head.weight
可训练参数: lm_head.bias
