In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM

# Example dataset: ten short English definitions used as a toy fine-tuning corpus.
data = [
    "Python is a widely used high-level programming language for general-purpose programming.",
    "Machine learning is an application of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.",
    "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language.",
    "Deep learning is a class of machine learning algorithms that use multiple layers to progressively extract higher-level features from the raw input.",
    "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.",
    "Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.",
    "Reinforcement learning is an area of machine learning concerned with how intelligent agents ought to take actions in an environment in order to maximize some notion of cumulative reward.",
    "Computer vision is an interdisciplinary scientific field that deals with how computers can gain high-level understanding from digital images or videos.",
    "Robotics is an interdisciplinary branch of engineering and science that includes mechanical engineering, electronic engineering, information engineering, computer science, and others.",
    "Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation."
]

# Build the dataset and hold out 20% of the sentences for evaluation.
# The split is seeded so that a fresh run reproduces the same train/test partition.
dataset = Dataset.from_dict({"text": data})
dataset = dataset.train_test_split(test_size=0.2, seed=42)
datasets = DatasetDict({"train": dataset["train"], "test": dataset["test"]})

# Load the tokenizer and reuse the end-of-sequence token as the padding token,
# which the "max_length" padding below requires.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating every text to 50 tokens.

    Args:
        examples: a batch from the dataset; only its "text" column is read.

    Returns:
        The tokenizer output for the batch (token ids plus attention mask).
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)


# Tokenize both splits.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Expose only the model inputs, as torch tensors. Without this the rows are
# plain Python lists, and the DataLoader's default collate turns each column
# into a list of per-position tensors instead of one (batch, seq_len) tensor.
tokenized_datasets = tokenized_datasets.with_format(
    type="torch", columns=["input_ids", "attention_mask"]
)

# Data loaders
train_loader = DataLoader(tokenized_datasets["train"], batch_size=2, shuffle=True)
test_loader = DataLoader(tokenized_datasets["test"], batch_size=2)

# Load the pretrained GPT-2 language model.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Device setup: use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function and optimizer.
# NOTE: `criterion` is not used by the loop below, which relies on the loss the
# model returns when `labels` are passed; it is kept so the name stays defined.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)


def _to_device_tensor(column):
    """Return one batch column as a (batch, seq_len) LongTensor on `device`.

    Accepts either an already-collated tensor or the list of per-position
    tensors that the default collate function produces when the dataset rows
    hold plain Python lists.
    """
    if not isinstance(column, torch.Tensor):
        # One tensor of shape (batch,) per token position -> (batch, seq_len).
        column = torch.stack(list(column), dim=1)
    return column.to(device=device, dtype=torch.long)


# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = _to_device_tensor(batch["input_ids"])
        attention_mask = _to_device_tensor(batch["attention_mask"])

        # Padding positions (attention_mask == 0) get the label -100 so they are
        # excluded from the language-modeling loss; otherwise the model would be
        # trained to predict the padding token after every sentence.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean batch loss for the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}')

# Text generation helper used to evaluate the model
def generate_text(model, tokenizer, start_text, max_length=50, temperature=1.0):
    """Continue `start_text` with `model` and return the decoded text.

    Args:
        model: causal language model exposing `eval()`, `device` and `generate()`.
        tokenizer: tokenizer matching `model`; called on the prompt and used to
            decode the generated ids.
        start_text: prompt to continue.
        max_length: forwarded to `model.generate` as `max_length`.
        temperature: sampling temperature. With the default of 1.0 decoding
            does not sample (`do_sample=False`); any other value switches
            sampling on, because the temperature has no effect otherwise.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    model.eval()

    # Place the inputs on the model's own device instead of relying on a global.
    target_device = model.device
    encoded = tokenizer(start_text, return_tensors='pt')
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    use_sampling = temperature != 1.0
    generation_kwargs = {
        "attention_mask": attention_mask,
        "max_length": max_length,
        "pad_token_id": tokenizer.eos_token_id,
        "do_sample": use_sampling,
    }
    if use_sampling:
        generation_kwargs["temperature"] = temperature

    generated_ids = model.generate(input_ids, **generation_kwargs)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Smoke test: let the model continue a short prompt and show the result.
start_text = "Python programming language is"
generated_text = generate_text(model, tokenizer, start_text=start_text)
print(f'Generated Text: {generated_text}')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and model from the same checkpoint name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def autocomplete(prompt, max_length=50):
    """Complete `prompt` with the module-level GPT-2 `model` and return the text.

    Args:
        prompt: text to continue.
        max_length: forwarded to `model.generate` as `max_length`.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Call the tokenizer directly (rather than `encode`) so the attention mask
    # comes back with the ids; `generate` cannot infer it when the pad token
    # id equals the EOS token id, as it does here.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Run generation with the explicit attention mask.
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the token ids back into text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prompt to complete
prompt = "The quick brown fox jumps over the"
completed_text = autocomplete(prompt=prompt)

print(completed_text)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and model from the same checkpoint name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def autocomplete(prompt, max_length=50):
    """Complete `prompt` with the module-level GPT-2 `model` and return the text.

    Args:
        prompt: text to continue.
        max_length: forwarded to `model.generate` as `max_length`.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Tokenize the prompt and obtain its attention mask. No padding is
    # requested: a single prompt has nothing to be padded against, and asking
    # for padding fails while the tokenizer has no padding token configured
    # (none is set in this cell).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Run generation with the explicit attention mask.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the token ids back into text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prompt to complete
prompt = "The quick brown fox jumps over the"
completed_text = autocomplete(prompt=prompt)

print(completed_text)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and model from the same checkpoint name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def autocomplete(prompt, max_length=50):
    """Complete `prompt` with the module-level GPT-2 `model` and return the text.

    Args:
        prompt: text to continue.
        max_length: forwarded to `model.generate` as `max_length`.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Tokenize the prompt; the encoding carries both the ids and the attention mask.
    encoding = tokenizer.encode_plus(prompt, return_tensors="pt", padding=True, truncation=True)

    # Generate a single continuation, padding with the EOS token id.
    generated = model.generate(
        encoding['input_ids'],
        attention_mask=encoding['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Turn the first generated sequence back into text.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example prompt to complete
prompt = "?"
completed_text = autocomplete(prompt=prompt)

print(completed_text)

In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Sample text corpus: one short sentence per line.
text = """
The quick brown fox jumps over the lazy dog
The quick brown fox is quick and fast
Lazy dogs are often found lying in the sun
A brown dog quickly jumps over the lazy fox
Foxes are known for their quick movements
The sun shines bright and the sky is blue
Dogs are loyal and friendly animals
Foxes are cunning and intelligent creatures
The brown dog and the quick fox are friends
In the bright sun, the lazy dog takes a nap
"""

# Fit a word-level tokenizer on the corpus; fit_on_texts assigns an integer
# index to every word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# word_index is the resulting word -> index dict; the vocabulary size used
# below is its length plus one.
total_words = len(tokenizer.word_index) + 1

# Build the training sequences: split the corpus on newlines and, for every
# line, keep each prefix of its token ids that is at least two tokens long.
# texts_to_sequences maps the words of each text to their integer indices, e.g. with
# {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
# texts become [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
line_token_ids = tokenizer.texts_to_sequences(text.split('\n'))
input_sequences = [
    token_ids[:end]
    for token_ids in line_token_ids
    for end in range(2, len(token_ids) + 1)
]

# Padding needs a common length: find the longest sequence (10 for this corpus).
max_sequence_len = max(len(sequence) for sequence in input_sequences)
print('max : ', max_sequence_len)

# Left-pad ('pre') the shorter sequences with 0 up to max_sequence_len.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
print('input : ', input_sequences)


# Split into inputs and targets: all columns except the last form the input
# context, and the last column is the word to predict.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
# One-hot encode the integer labels: to_categorical turns each label into a
# binary vector of length total_words.
ys = to_categorical(labels, num_classes=total_words)

# Model definition: embedding -> LSTM -> softmax over the vocabulary.
# This code must be live (not commented out): the next cell calls
# autocomplete_text, which needs both the function and a trained Keras `model`.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

# Compile and train the model. verbose=0 keeps 100 epochs of progress logs
# out of the cell output.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(xs, ys, epochs=100, verbose=0)


# Word autocomplete function
def autocomplete_text(seed_text, next_words, max_sequence_len):
    """Extend `seed_text` by up to `next_words` predicted words.

    Uses the module-level Keras `tokenizer` and `model`.

    Args:
        seed_text: text to continue.
        next_words: maximum number of words to append.
        max_sequence_len: padded sequence length used for training; the model
            input is padded to `max_sequence_len - 1` tokens.

    Returns:
        `seed_text` followed by the predicted words, separated by spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = int(np.argmax(predicted, axis=-1)[0])
        # An index with no vocabulary entry (e.g. the padding index 0) has no
        # word to append; stop instead of raising KeyError.
        output_word = tokenizer.index_word.get(predicted_word_index)
        if output_word is None:
            break
        seed_text += " " + output_word
    return seed_text


# Example run
seed_text = "The quick brown"
next_words = 3
completed_text = autocomplete_text(seed_text, next_words, max_sequence_len)
print(completed_text)

In [None]:
# Run the example: extend the seed text by one predicted word.
seed_text = "Dogs"
next_words = 1
completed_text = autocomplete_text(
    seed_text,
    next_words,
    max_sequence_len,
)
print(completed_text)