# GPT2 모델로 문장생성하기

## 1.라이브러리 설치

In [1]:
# Install the `transformers` package into the notebook environment.
# The leading `!` runs the command in a shell instead of as Python code.
!pip install transformers

Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 7.0 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.8/8.8 MB 11.1 MB/s eta 0:00:00
Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 388.9/388.9

## 2.GPT-2 모델 사용 예시

### 1) 라이브러리 가져오기

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer



### 2) 모델, 토크나이저 로딩

In [3]:
# Load the pretrained 'gpt2' checkpoint: the language-model head used for
# text generation, and the tokenizer that matches it. Files that are not yet
# available locally are downloaded (see the progress bars in the cell output).
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### 3) 텍스트 전처리 (토큰화, 인코딩 등)

In [4]:
# Prompt that the model will be asked to continue.
text = "Once upon a time"
# Convert the prompt into token ids. return_tensors='pt' makes the result a
# PyTorch tensor with a leading batch dimension (one row per prompt), as the
# displayed output below shows.
inputs = tokenizer.encode(text, return_tensors='pt')
# Bare expression so the notebook displays the encoded tensor.
inputs

tensor([[7454, 2402,  257,  640]])

### 4) 모델 추론

In [5]:
# Generate a continuation of the prompt. max_length=50 caps the total
# sequence length, prompt tokens included (the output tensor holds 50 ids and
# starts with the 4 prompt ids).
# NOTE: neither `attention_mask` nor `pad_token_id` is passed here, which is
# what triggers the warning recorded in the cell output; the library then
# falls back to using the end-of-sequence id (50256) as the pad token.
outputs = model.generate(inputs, max_length=50)
# Bare expression so the notebook displays the generated token ids.
outputs


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[7454, 2402,  257,  640,   11,  262,  995,  373,  257, 1295,  286, 1049,
         8737,  290, 1049, 3514,   13,  383,  995,  373,  257, 1295,  286, 1049,
         3514,   11,  290,  262,  995,  373,  257, 1295,  286, 1049, 3514,   13,
          383,  995,  373,  257, 1295,  286, 1049, 3514,   11,  290,  262,  995,
          373,  257]])

### 5) 텍스트 후처리 (디코딩)

In [6]:
# Turn the generated token ids back into text. `outputs` is a batch, so
# index 0 selects the single generated sequence.
generated_text = tokenizer.decode(outputs[0])
print(generated_text)


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a


## 3.전체 코드

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and language model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prompt that the model will continue.
text = "Once upon a time"

# Tokenize the prompt into PyTorch tensors. Calling the tokenizer directly
# (instead of `tokenizer.encode`) also returns the attention mask, which
# `generate` asks for in order to produce reliable results.
encoded = tokenizer(text, return_tensors='pt')
inputs = encoded['input_ids']

# Generate text with the model.
# - attention_mask / pad_token_id are passed explicitly so that generation
#   does not have to guess them (GPT-2 has no dedicated pad token, so the
#   end-of-sequence token is used, matching the library's own fallback).
# - max_length caps the total length, prompt tokens included.
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    do_sample=True,  # extra option: sample tokens instead of greedy decoding
)

# Decode the generated token ids back into text, dropping special tokens
# such as the end-of-text marker so they are not printed literally.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)
