In [8]:
import os
import tqdm
import time
import argparse
import torch    
import numpy as np

from pit.dataset.mathematical_shapes import MathematicalShapesDataset
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Config

In [9]:
num_shapes=101
train_dataset = MathematicalShapesDataset( # sum to not 100
                                      train=True,
                                      rule_indices=[0,4], 
                                      num_shapes=num_shapes,
                                      num_samples=1000000, 
                                      return_rule_label=True)

num_shapes=101
test_dataset = MathematicalShapesDataset( # sum to not 100
                                      train=False,
                                      rule_indices=[0,4], 
                                      num_shapes=num_shapes,
                                      num_samples=1000000, 
                                      return_rule_label=True)

In [44]:
d = data['input_ids']

c = torch.stack([d, d])
c

tensor([[109,   1, 104,   0, 102,   1, 109, 109],
        [109,   1, 104,   0, 102,   1, 109, 109]], device='cuda:1')

In [51]:
d["input_ids"].shape

torch.Size([32, 8])

In [53]:
pos = PositionalEncoding(768, 1024, device)
ab = pos.forward(d['input_ids'])
# ab = pos.forward(data['input_ids'].unsqueeze(0))

ab.shape

torch.Size([8, 768])

In [52]:
device = 'cuda:1'
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size)
pbar = tqdm.tqdm((train_loader), total=len(train_dataset)//batch_size)

loss = 0
for idx, d in enumerate(pbar):
    # print(idx)
    # data = train_dataset[idx]
    data = {}
    data['input_ids'] = d['input_ids'].to(device)
    data['labels'] = d['input_ids'].to(device)
    data['position_ids'] = pos(d['input_ids']).long().to(device)
    # data['position_ids'] = pos(data['input_ids'].unsqueeze(0)).long().to(device)
    # if idx > 1560:
    #     print(data)
    output = model(**data)
    loss += output.loss
    # if idx == 31: break
    break
loss

  0%|          | 0/387 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (32) must match the size of tensor b (768) at non-singleton dimension 0

In [18]:
import torch.nn as nn

class PositionalEncoding(nn.Module):
    
    def __init__(self, d_model, max_len, device):
        """
        sin, cos encoding 구현
        
        parameter
        - d_model : model의 차원
        - max_len : 최대 seaquence 길이
        - device : cuda or cpu
        """
        
        super(PositionalEncoding, self).__init__() # nn.Module 초기화
        
        # input matrix(자연어 처리에선 임베딩 벡터)와 같은 size의 tensor 생성
        # 즉, (max_len, d_model) size
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False # 인코딩의 그래디언트는 필요 없다. 
        
        # 위치 indexing용 벡터
        # pos는 max_len의 index를 의미한다.
        pos = torch.arange(0, max_len, device =device)
        # 1D : (max_len, ) size -> 2D : (max_len, 1) size -> word의 위치를 반영하기 위해
        
        pos = pos.float().unsqueeze(dim=1) # int64 -> float32 (없어도 되긴 함)
        
        # i는 d_model의 index를 의미한다. _2i : (d_model, ) size
        # 즉, embedding size가 512일 때, i = [0,512]
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        
        # (max_len, 1) / (d_model/2 ) -> (max_len, d_model/2)
        self.encoding[:, ::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        
        
    def forward(self, x):
        # self.encoding
        # [max_len = 512, d_model = 512]

        # batch_size = 128, seq_len = 30
        batch_size, seq_len = x.size() 
        
        # [seq_len = 30, d_model = 512]
        # [128, 30, 512]의 size를 가지는 token embedding에 더해질 것이다. 
        # 
        return self.encoding[:seq_len, :]

In [7]:
for i in range(10):
    c = np.random.randint(199)
    print(test_dataset[c])

{'input_ids': tensor([109,  70, 104,  30, 102, 100, 109, 109])}
{'input_ids': tensor([109,  88, 104,  12, 102, 100, 109, 109])}
{'input_ids': tensor([109,  41, 106,  59, 102, 108,  18, 109])}
{'input_ids': tensor([109,  58, 104,  42, 102, 100, 109, 109])}
{'input_ids': tensor([109,  94, 106,   6, 102,  88, 109, 109])}
{'input_ids': tensor([109,  39, 104,  61, 102, 100, 109, 109])}
{'input_ids': tensor([109,  87, 104,  13, 102, 100, 109, 109])}
{'input_ids': tensor([109,  75, 106,  25, 102,  50, 109, 109])}
{'input_ids': tensor([109,  88, 104,  12, 102, 100, 109, 109])}
{'input_ids': tensor([109,  81, 104,  19, 102, 100, 109, 109])}


In [10]:
device='cuda:1'
config=GPT2Config(n_layer=2, vocab_size=train_dataset.vocab_len, eos_token_id=train_dataset.eos_token)
model = GPT2LMHeadModel(config=config).to(device)

In [56]:
with open(('samples1.txt'), 'w') as fp:
    fp.write(f'{c}')


In [14]:
dic = model.state_dict()
print(dic.keys())
# dv = model.device
# dv

# cv = torch.tensor([0, 1], device = dv)
# cv.device

odict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.ln_f.weight', 'transformer.ln_f.bias', 'lm_head.weight'])


In [18]:
from datetime import datetime

# datetime.today()            # 현재 날짜 가져오기

# datetime.today().year        # 현재 연도 가져오기

# datetime.today().month      # 현재 월 가져오기

# datetime.today().day

print(f'{datetime.today().month}_{datetime.today().day}')

7_24


In [19]:
num_shapes=102
train_dataset = MathematicalShapesDataset( # sum to not 100
                                    train=True,
                                    rule_indices=[0,4], 
                                    num_shapes=num_shapes,
                                    num_samples=1000000, 
                                    return_rule_label=True)

In [20]:
loader = DataLoader(train_dataset, batch_size=1)
for i in loader:
    i['input_ids'] = i['input_ids'].to(device)
    i['labels'] = i['input_ids'].to(device)
    output = model(**i)
    break

In [26]:
train_dataset[0]

{'input_ids': tensor([109,  45, 104,  48, 102,  93, 109, 109])}

In [36]:
for i in range(10):
    if not(i % 5) or (i==9):
        print(i)

0
5
9


In [5]:
train_dataset[1579]

{'input_ids': tensor([109,  56, 104,  45, 102, 101, 109, 109])}

In [6]:
train_dataset.vocab_len

110

For debugging consider passing CUDA_LAUNCH_BLOCKING=1. Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

In [7]:
import numpy as np


In [10]:
np.random.randint(100)

2

In [11]:
train_dataset[0]

{'input_ids': tensor([109,   1, 104,   0, 102,   1, 109, 109])}

In [26]:
sam = train_dataset[0]['input_ids'].to(device)
sa = model.generate(sam[:5].unsqueeze(0), max_new_tokens=3, min_new_tokens=3)
print(sam)
print(sa)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:109 for open-end generation.


tensor([109,   1, 104,   0, 102,   1, 109, 109], device='cuda:0')
tensor([[109,   1, 104,   0, 102,  49,  49,  49]], device='cuda:0')


In [25]:
sam

tensor([109,   1, 104,   0, 102,   1, 109, 109], device='cuda:0')