In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m256.0/261.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else 'cpu'
model_name = 'gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
input_txt = "Transformers are the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
input_ids.shape

torch.Size([1, 4])

In [5]:
tokenizer.vocab_size

50257

In [6]:
model(input_ids=input_ids).logits[0,-1,:].shape

torch.Size([50257])

In [7]:
prob = torch.softmax(model(input_ids=input_ids).logits[0,-1,:], dim=-1)
high_ids = torch.argsort(prob, dim=-1, descending=True)

In [8]:
high_ids[None, 0, None].shape, high_ids[1]

(torch.Size([1, 1]), tensor(1266, device='cuda:0'))

In [9]:
with torch.no_grad():
  print(prob[1266].cpu().numpy())
tokenizer.decode(1266)

0.066639274


' best'

In [10]:
import pandas as pd

input_txt = "Transformers are the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
iterations = []
n_steps = 8
choices_pre_step = 5

with torch.no_grad():
  for _ in range(n_steps):
    iteration = dict()
    iteration['Input'] = tokenizer.decode(input_ids[0])
    output = model(input_ids=input_ids)
    #첫 번째 배치의 마지막 토큰의 로짓을 선택
    next_token_logits = output.logits[0,-1,:]
    next_token_probs = torch.softmax(next_token_logits, dim=-1)
    sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
    #가장 높은 확률의 토큰 저장
    for choice_idx in range(choices_pre_step):
      token_id = sorted_ids[choice_idx]
      token_prob = next_token_probs[token_id].cpu().numpy()
      token_choice = (
          f"{tokenizer.decode(token_id)}({100*token_prob:.2f}%)"
      )
      iteration[f"Choice{choice_idx+1}"] = token_choice
    #예측한 다음 토큰을 입력에 추가
    input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
    iterations.append(iteration)
pd.DataFrame(iterations)

Unnamed: 0,Input,Choice1,Choice2,Choice3,Choice4,Choice5
0,Transformers are the,most(11.78%),best(6.66%),only(5.62%),first(2.91%),ultimate(2.23%)
1,Transformers are the most,popular(22.63%),successful(5.55%),famous(3.38%),powerful(3.14%),important(2.54%)
2,Transformers are the most popular,toys(8.87%),toy(7.88%),of(5.03%),Transformers(4.69%),franchise(3.88%)
3,Transformers are the most popular toys,of(31.69%),in(23.73%),ever(4.85%),",(4.50%)",for(3.58%)
4,Transformers are the most popular toys of,all(57.47%),the(21.30%),2015(2.34%),their(1.66%),2014(1.54%)
5,Transformers are the most popular toys of all,time(94.71%),-(1.86%),.(0.66%),",(0.56%)",times(0.52%)
6,Transformers are the most popular toys of all ...,.(34.98%),",(33.86%)",and(7.03%),!(2.16%),in(1.73%)
7,Transformers are the most popular toys of all ...,They(10.93%),\n(9.23%),The(6.63%),In(2.91%),And(2.68%)


In [11]:
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_new_tokens=7, do_sample=False)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformers are the most popular toys of all time.


In [12]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, max_length=max_length,
                               do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, led by Dr. David R. Williams of the University of California, Santa Cruz, discovered the unicorns in the Andes Mountains of Peru. The area is known for its unique geology and is home to a number of rare species of animals.


The researchers found the unicorns in the Andes Mountains of Peru. The area is known for its unique geology and is home


In [13]:
#beam search decoding
import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
  logp = F.log_softmax(logits, dim=-1)
  logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
  return logp_label

def sequence_logprob(model, labels, input_len=0):
  with torch.no_grad():
    output = model(labels)
    log_probs = log_probs_from_logits(
        output.logits[:,:-1,:], labels[:,1:]
    )
    seq_log_prob = torch.sum(log_probs[:, input_len:])
  return seq_log_prob.cpu().numpy()

In [14]:
logp = sequence_logprob(model, output_greedy, input_len=len(input_ids[0]))
print(tokenizer.decode(output_greedy[0]))
print(f"\n로그 확률: {logp:.2f}")

In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, led by Dr. David R. Williams of the University of California, Santa Cruz, discovered the unicorns in the Andes Mountains of Peru. The area is known for its unique geology and is home to a number of rare species of animals.


The researchers found the unicorns in the Andes Mountains of Peru. The area is known for its unique geology and is home

로그 확률: -86.50


In [15]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\n로그 확룰: {logp:.2f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery was made by a team of researchers from the University of California, Santa Cruz, and the University of California, Santa Cruz, and published in the Journal of Mammalogy.


The team, led by Dr. David Hone of the University of California, Santa Cruz, discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains.




로그 확룰: -65.20


In [16]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, \
                             do_sample=False, no_repeat_ngram_size=2)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\n로그 확률: {logp:.2f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery was made by a team of researchers from the University of California, Santa Cruz, and the National Geographic Society. The team was led by Dr. Richard Wrangham, a professor of ecology and evolutionary biology at UCSC.

"We've known for a long time that there are animals that live in remote areas of the world, but this is the first time we've been able

로그 확률: -79.02


In [17]:
#샘플링
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, \
                             temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


Richard Op Fiorams senior consultant Lloyd busy humpu Turkish Italy plain viable diversity, manipulation up near perseveroub tho pre450 straightbara wajrazy forensic Antonio bruggailing hornurchase frameworks Dharma Pill Composlose transmitter live fucked compensate frankly Erik architects Electronic Province building twattory 000 password Markus leptin human neighborhood safe authoritative green step decisively threatening Baron staggering ref name1000 resurg connectzero gross embarrass dads subdivides remove


In [18]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True,\
                             temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The unicorns are a rare, endangered species. There are only a handful of them left in the world. The scientists studied the animals in a remote area of Peru and in the Andes Mountains. The animals were found to be remarkably adaptable.


The scientists found the unicorns to be incredibly intelligent. They spoke perfect English.


According to the scientists, the animals actually had a very


In [21]:
#탑 k sampling
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True, \
                             top_k=50)
print(tokenizer.decode(output_topk[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


In a statement, a representative for the zoo said, "It is our responsibility as a scientific institution to preserve and protect the species in any way that we can."


The researchers said the unicorns were born with no horns, so they had no need for hair on their tail. It's thought the unicorns were taken from the high peaks of Bolivia, before a major earthquake damaged or destroyed most


In [22]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True, \
                             top_p=0.90)
print(tokenizer.decode(output_topp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery was made during a search for a new type of animal and a group of researchers led by Dr. Roberto Cascio from the National Research Council of Argentina led a study into the possibility of finding unicorns there. The animals are known to live in the Andes, and the team discovered an extensive range of animals and plants that have not been observed for millennia.


Dr. Casc
