In [9]:
from idlelib.browser import transform_children

from sympy.stats.rv import probability
from transformers import AutoModel

model_id = 'klue/roberta-base'
model = AutoModel.from_pretrained(model_id)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import AutoModelForSequenceClassification

model_id = 'SamLowe/roberta-base-go-emotions'
classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)

OSError: SamLowe/roberta-base-go-emotions is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [11]:
from transformers import AutoTokenizer

model_id = 'klue/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [12]:
tokenized = tokenizer("토크나이저는 텍스트를 토큰 단위로 나눈다")
print(tokenized)

{'input_ids': [0, 9157, 7461, 2190, 2259, 8509, 2138, 1793, 2855, 5385, 2200, 20950, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [13]:
print(tokenizer.convert_ids_to_tokens(tokenized['input_ids']))

['[CLS]', '토크', '##나이', '##저', '##는', '텍스트', '##를', '토', '##큰', '단위', '##로', '나눈다', '[SEP]']


In [14]:
print(tokenizer.decode(tokenized['input_ids']))

[CLS] 토크나이저는 텍스트를 토큰 단위로 나눈다 [SEP]


In [15]:
print(tokenizer.decode(tokenized['input_ids'], skip_special_tokens=True))

토크나이저는 텍스트를 토큰 단위로 나눈다


In [16]:
tokenizer(['첫 번째 문장', '두 번째 문장'])

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2], [0, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [17]:
tokenizer([['첫 번째 문장', '두 번째 문장']])

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [18]:
first_tokenized_result = tokenizer(['첫 번째 문장', '두 번째 문장'])['input_ids']
tokenizer.batch_decode(first_tokenized_result)

['[CLS] 첫 번째 문장 [SEP]', '[CLS] 두 번째 문장 [SEP]']

In [19]:
second_tokenized_result = tokenizer([['첫 번째 문장', '두 번째 문장']])['input_ids']
tokenizer.batch_decode(second_tokenized_result)

['[CLS] 첫 번째 문장 [SEP] 두 번째 문장 [SEP]']

In [20]:
bert_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
bert_tokenizer([['첫 번째 문장', '두 번째 문장']])

{'input_ids': [[2, 1656, 1141, 3135, 6265, 3, 864, 1141, 3135, 6265, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [21]:
roberta_tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')
roberta_tokenizer([['첫 번째 문장', '두 번째 문장']])

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2, 864, 1141, 3135, 6265, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [22]:
en_roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
en_roberta_tokenizer([['first sentence', 'second sentence']])

{'input_ids': [[0, 9502, 3645, 2, 2, 10815, 3645, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

In [23]:
tokenizer(['첫 번째 문장은 짧다.', '두 번째 문장은 첫 번째 문장보다 더 길다.'], padding='longest')

{'input_ids': [[0, 1656, 1141, 3135, 6265, 2073, 1599, 2062, 18, 2, 1, 1, 1, 1, 1, 1, 1], [0, 864, 1141, 3135, 6265, 2073, 1656, 1141, 3135, 6265, 2178, 2062, 831, 647, 2062, 18, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [24]:
from datasets import load_dataset

klue_mrc_dataset = load_dataset('klue', 'mrc')
klue_mrc_dataset_only_train = load_dataset('klue', 'mrc', split='train')

In [25]:
klue_mrc_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 17554
    })
    validation: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 5841
    })
})

In [32]:
from datasets import load_dataset

# 로컬의 csv 데이터 파일 활용
dataset = load_dataset('csv', data_files='data.csv')

from datasets import Dataset
my_dict = {"a": [1, 2, 3]}
dataset = Dataset.from_dict(my_dict)

# 판다스 데이터프레임 활용
from datasets import Dataset
import pandas as pd
df = pd.DataFrame({"a": [1, 2, 3]})
dataset = Dataset.from_pandas(df)

FileNotFoundError: Unable to find '/Users/harim/repo/lisa-llm/ch3/data.csv'

In [33]:
klue_tc_train = load_dataset('klue', 'ynat', split='train')
klue_tc_eval = load_dataset('klue', 'ynat', split='validation')
klue_tc_train

Dataset({
    features: ['guid', 'title', 'label', 'url', 'date'],
    num_rows: 45678
})

In [34]:
klue_tc_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [35]:
klue_tc_train.features['label'].names

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [28]:
klue_tc_train = klue_tc_train.remove_columns(['guid', 'url', 'date'])
klue_tc_eval = klue_tc_eval.remove_columns(['guid', 'url', 'date'])
klue_tc_train


Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [31]:
klue_tc_label = klue_tc_train.features['label']

def make_str_label(batch):
    batch['labael_str'] = klue_tc_label.int2str(batch['label'])
    return batch

klue_tc_train = klue_tc_train.map(make_str_label, batched=True, batch_size=1000)

klue_tc_train[0]

NameError: name 'klue_tc_train' is not defined

In [30]:
train_dataset = klue_tc_train.train_test_split(test_size=10000, shuffle=True, seed=42)['test']
dataset = klue_tc_eval.train_test_split(test_size=1000, shuffle=True, seed=42)
test_dataset = dataset['test']
valid_dataset = dataset['train'].train_test_split(test_size=1000, shuffle=True, seed=42)['test']


NameError: name 'klue_tc_train' is not defined

In [40]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    TFAutoModelForSequenceClassification,
    AutoTokenizer
)



def tokenize_function(examples):
    return tokenizer(examples['title'], padding='max_length', truncation=True)

model_id = "klue/roberta-base"
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(train_dataset.features['label'].names))
tokenizer = AutoTokenizer.from_pretrained(model_id)

RuntimeError: Failed to import transformers.models.roberta.modeling_tf_roberta because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [29]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

NameError: name 'train_dataset' is not defined

In [None]:
training_args = TrainingArguments(
	output_dir="./results", # 결과 저장할 폴더
	num_train_epochs=1, # 학습 에포크 수
	per_device_train_batch_size=8, # 배치 크기
	per_device_eval_batch_size=8, # 배치 크기
	evaluation_strategy="epoch", # 한 에포크 학습 끝날 때마다 검증 데이터셋에 대한 평가 수행하도록 설정
	learning_rate=5e-5,
	push_to_hub=False
)

def compute_metrics(eval_pred): # 평가 지표(정확도)
	logits, labels = eval_pred
	predictions = np.argmax(logits, axis=-1)
	return {"accuracy": (predictions == labels).mean()}

In [None]:
trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=valid_dataset,
	tokenizer=tokenizer,
	compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(test_dataset) # 정확도 0.84

### 3.4.3 트레이너 API 를 사용하지 않고 학습하기
위 예시와 비슷하게 모델과 토크나이저를 불러오고, 토큰화에 사용할 tokenize_function 정의
Trainer가 내부적으로 수행하던 GPU로의 모델 이동(model.to(device))을 직접 수행해야 함

In [27]:
import torch
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW, TFAutoModelForSequenceClassification

def tokenize_function(examples):
    return tokenizer(examples["title"], padding="max_length", truncation=True)

# 모델과 토크나이저 불러오기
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "klue/roberta-base"
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=len(train_dataset.features['label'].names))
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.to(device)

NameError: name 'train_dataset' is not defined

In [28]:
def make_dataloader(dataset, batch_size, shuffle=True):
    dataset = dataset.map(tokenize_function, batched=True).with_format("torch")
    # 데이터셋에 토큰화 수행
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.remove_columns(columne_names=['title'])
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# 데이터로더 만들기
train_dataloader = make_dataloader(train_dataset, batch_size=8, shuffle=True)
valid_dataloader = make_dataloader(valid_dataset, batch_size=8, shuffle=False)
test_dataloader = make_dataloader(test_dataset, batch_size=8, shuffle=False)

NameError: name 'train_dataset' is not defined

In [None]:
def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device) # 모델에 입력할 토큰 아이디
        attention_mask = batch['attention_mask'].to(device) # 모델에 입력할 어텐션 마스크
        labels = batch['labels'].to(device) # 모델에 입력할 레이블
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels) # 모델 계산
        loss = outputs.loss # 손실
        loss.backward() # 역전파
        optimizer.step() # 모델 업데이트
        total_loss += loss.item()
	avg_loss = total_loss / len(data_loader)
	return avg_loss

In [2]:
def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
	avg_loss = total_loss / len(data_loader)
	accuracy = np.mean(np.array(predictions) == np.array(true_labels))
	return avg_loss, accuracy

SyntaxError: expected ':' (2870116992.py, line 1)

In [4]:
num_epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 루프
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer)
    print(f"Training loss: {train_loss}")
    valid_loss, valid_accuracy = evaluate(model, valid_dataloader)
    print(f"Validation loss: {valid_loss}")
    print(f"Validation accuracy: {valid_accuracy}")

NameError: name 'AdamW' is not defined

### 3.4.4 학습한 모델 업로드하기
huggingface_hub 라이브러리로 허깅페이스에 프로그래밍 방식으로 접근 가능
업로드 방식
- Trainer를 사용한 경우, trainer 인스턴스에서 `push_to_hub()` 사용
- 직접 학습한 경우, 모델과 토크나이저를 각각 `push_to_hub()` 로 업로드

In [None]:
from huggingface_hub import login

login(token="token")
repo_id = f"lhr213213/roberta-base-klue-ynat-classification"

# Trainer 사용한 경우
trainer.push_to_hub(repo_id)

# 직접 학습한 경우
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

## 3.5 모델 추론
- 파이프라인 활용하는 방법
- 직접 모델과 토크나이저 불러와 활용하는 방법

### 3.5.1 파이프라인 활용한 추론
- 작업 종류, 모델, 설정을 입력으로 받아 토크나이저와 모델을 결합해 데이터 전후처리와 모델 추론을 수행함

In [36]:
from transformers import pipeline

model_id = "lhr213213/roberta-base-klue-ynat-classification"

model_pipeline = pipeline("text-classification", model=model_id)

model_pipeline(dataset["title"][:5])

OSError: lhr213213/roberta-base-klue-ynat-classification is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
import torch
from torch.nn.functional import softmax
from transformers import TFAutoModelForAudioClassification, AutoTokenizer

class CustomPipeline:
    def __init__(self, model_id):
        self.model = TFAutoModelForSequenceClassification.from_pretrained(model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model.eval()

    def __call__(self, texts):
        tokenized = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = self.model(**tokenized)
            logits = outputs.logits

        probabilities = softmax(logits, dim=-1)
        scores, label = torch.max(probabilities, dim=-1)
        labels_str = [self.model.config.id2label[label_idx] for label_idx in labels.tolist()]

        return [{"label": label, "score": score.item()} for label, score in zip(labels_str, scores)]

custom_pipeline = CustomPipeline(model_id)
custom_pipeline(dataset['title'][:5])