In [1]:
# %pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Loading dataset from huggingface

In [1]:
import datasets
from datasets import load_dataset

huggingface_mrpc_dataset = load_dataset('glue', 'mrpc')
print(huggingface_mrpc_dataset)



DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [2]:
train = huggingface_mrpc_dataset['test']
cols = train.column_names
cols

['sentence1', 'sentence2', 'label', 'idx']

In [3]:
train_subset = train.select(range(10)).shuffle()
print(train_subset)


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 10
})


In [4]:
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')

sentence1 : PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .
sentence2 : Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .
label : 1
idx : 0


sentence1 : The world 's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more of an industry backlash than expected .
sentence2 : Domestic sales at both GM and No. 2 Ford Motor Co. declined more than predicted as a late summer sales frenzy prompted a larger-than-expected industry backlash .
label : 1
idx : 1


sentence1 : According to the federal Centers for Disease Control and Prevention ( news - web sites ) , there were 19 reported cases of measles in the United States in 2002 .
sentence2 : The Centers for Disease Control and Prevention said there were 19 reported cases of measles in the United States in 2002 .
label : 1
idx : 2


sen

## Loading dataset from tensorflow

In [20]:
import tensorflow_datasets as tfds
from datasets import Dataset

tf_dataset, tf_dataset_info = tfds.load('glue/mrpc', with_info=True)

2024-03-26 12:02:49.332292: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-03-26 12:02:49.332315: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2024-03-26 12:02:49.332318: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2024-03-26 12:02:49.332341: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-26 12:02:49.332362: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
examples = tf_dataset['train'].take(5)

for example in examples:
    for col in cols:
        print(col, ":", example[col])
    print('\n')

sentence1 : tf.Tensor(b'The identical rovers will act as robotic geologists , searching for evidence of past water .', shape=(), dtype=string)
sentence2 : tf.Tensor(b'The rovers act as robotic geologists , moving on six wheels .', shape=(), dtype=string)
label : tf.Tensor(0, shape=(), dtype=int64)
idx : tf.Tensor(1680, shape=(), dtype=int32)


sentence1 : tf.Tensor(b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is completed .", shape=(), dtype=string)
sentence2 : tf.Tensor(b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is complete , assuming those businesses aren 't sold .", shape=(), dtype=string)
label : tf.Tensor(0, shape=(), dtype=int64)
idx : tf.Tensor(1456, shape=(), dtype=int32)


sentence1 : tf.Tensor(b'Spider-Man snatched $ 114.7 million in its debut last year and went on to capture $ 403.7 million .', shape=(), dtype=string)
sentence2 : tf.Tensor(b'Spi

2024-03-26 12:03:00.621897: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [22]:
# Tensorflow dataset 구조를 python dict 형식으로 변경
# Dataset이 train, validation, test로 나뉘도록 구성
train_dataset = tfds.as_dataframe(tf_dataset['train'], tf_dataset_info)
val_dataset = tfds.as_dataframe(tf_dataset['validation'], tf_dataset_info)
test_dataset = tfds.as_dataframe(tf_dataset['test'], tf_dataset_info)

# dataframe 데이터를 dict 내부에 list로 변경
train_dataset = train_dataset.to_dict('list')
val_dataset = val_dataset.to_dict('list')
test_dataset = test_dataset.to_dict('list')

# Huggingface dataset
tf_train_dataset = Dataset.from_dict(train_dataset)
tf_val_dataset = Dataset.from_dict(val_dataset)
tf_test_dataset = Dataset.from_dict(test_dataset)

In [39]:
import pandas as pd
train_df = pd.DataFrame(test_dataset)
train_df.columns
train_df.head()

Unnamed: 0,idx,label,sentence1,sentence2
0,163,-1,b'Shares in BA were down 1.5 percent at 168 pe...,b'Shares in BA were down three percent at 165-...
1,131,-1,b'The South Korean Agriculture and Forestry Mi...,b'The South Korean Agriculture and Forestry Mi...
2,1579,-1,"b'"" New Yorkers didn \'t embrace these units l...","b'"" New Yorkers didn \'t embrace these units l..."
3,1151,-1,"b'"" I really liked him and I still do , "" Cohe...","b'And I really liked him , and I still do .'"
4,1021,-1,b'Tight media controls and the remote location...,b'Tight media controls and the remote location...


In [23]:
def transform_tf(batch):
    sentence1 = [s.decode('utf-8') for s in batch['sentence1']]
    sentence2 = [s.decode('utf-8') for s in batch['sentence2']]
    return huggingface_tokenizer(
        sentence1,
        sentence2,
        truncation=True,
        padding='max_length',
        return_token_type_ids=False,
    )

# 토큰화 및 패딩을 적용
tf_train_dataset = tf_train_dataset.map(transform_tf, batched=True)
tf_val_dataset = tf_val_dataset.map(transform_tf, batched=True)
tf_test_dataset = tf_test_dataset.map(transform_tf, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

## Huggingface Tokenizer and Model

In [27]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

huggingface_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
huggingface_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

In [6]:
def transform(data):
    return huggingface_tokenizer(
        data['sentence1'],
        data['sentence2'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,
        )

In [7]:
hf_dataset = huggingface_mrpc_dataset.map(transform, batched=True)

# train & validation & test split
hf_train_dataset = hf_dataset['train']
hf_val_dataset = hf_dataset['validation']
hf_test_dataset = hf_dataset['test']

In [9]:
hf_train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 3668
})

In [11]:
hf_val_dataset


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 408
})

In [12]:
hf_test_dataset


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 1725
})

## Train/Evaluation, Test

In [35]:
# %pip install transformers[torch]
# %pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.28.0
Note: you may need to restart the kernel to use updated packages.


1. Trainer를 사용하기 위해서는 TrainingArguments를 통해 학습 관련 설정을 미리 지정

In [25]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = '/Users/kenny_jung/aiffel/data/hf_transformer'

training_arguments = TrainingArguments(
    output_dir,                            # output이 저장될 경로
    evaluation_strategy="epoch",           # evaluation하는 빈도
    learning_rate = 2e-5,                  # learning_rate
    per_device_train_batch_size = 8,       # 각 device 당 batch size
    per_device_eval_batch_size = 8,        # evaluation 시에 batch size
    num_train_epochs = 3,                  # train 시킬 총 epochs
    weight_decay = 0.01,                   # weight decay
)

2. task가 classification인지 regression인지에 따라 모델의 출력 형태가 달라지므로 task별로 적합한 출력 형식을 고려해 모델의 성능을 계산하는 방법을 미리 지정

In [26]:
from datasets import load_metric
metric = load_metric('glue', 'mrpc')

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references = labels)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


3. Trainer에 model, arguments, train_dataset, eval_dataset, compute_metrics를 넣고 train을 진행합니다.

In [15]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_train_dataset,    # training dataset
    eval_dataset=hf_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()
print("슝~")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.3841363489627838, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8838028169014085, 'eval_runtime': 5.9767, 'eval_samples_per_second': 68.265, 'eval_steps_per_second': 8.533, 'epoch': 1.0}
{'loss': 0.5152, 'grad_norm': 21.535314559936523, 'learning_rate': 1.2737835875090777e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.3564828634262085, 'eval_accuracy': 0.8431372549019608, 'eval_f1': 0.8865248226950355, 'eval_runtime': 5.0177, 'eval_samples_per_second': 81.312, 'eval_steps_per_second': 10.164, 'epoch': 2.0}
{'loss': 0.3235, 'grad_norm': 0.5208008885383606, 'learning_rate': 5.475671750181555e-06, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.516402006149292, 'eval_accuracy': 0.8406862745098039, 'eval_f1': 0.8881239242685025, 'eval_runtime': 5.0712, 'eval_samples_per_second': 80.455, 'eval_steps_per_second': 10.057, 'epoch': 3.0}
{'train_runtime': 533.5478, 'train_samples_per_second': 20.624, 'train_steps_per_second': 2.581, 'train_loss': 0.3688121177878342, 'epoch': 3.0}
슝~


4. Test data test

In [19]:
trainer.evaluate(hf_test_dataset)

  0%|          | 0/216 [00:00<?, ?it/s]

{'eval_loss': 0.5602657794952393,
 'eval_accuracy': 0.831304347826087,
 'eval_f1': 0.8758002560819462,
 'eval_runtime': 20.9762,
 'eval_samples_per_second': 82.236,
 'eval_steps_per_second': 10.297,
 'epoch': 3.0}

5. 메모리 삭제

In [17]:
#메모리를 비워줍니다.
del huggingface_model

6. 커스텀 데이터셋으로 학습

In [28]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=tf_train_dataset,    # training dataset
    eval_dataset=tf_val_dataset,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.39602720737457275, 'eval_accuracy': 0.8308823529411765, 'eval_f1': 0.8685714285714285, 'eval_runtime': 5.1938, 'eval_samples_per_second': 78.556, 'eval_steps_per_second': 9.819, 'epoch': 1.0}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/hf_transformer/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.5188, 'grad_norm': 10.028202056884766, 'learning_rate': 1.2737835875090777e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.34312117099761963, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8825622775800711, 'eval_runtime': 5.2577, 'eval_samples_per_second': 77.6, 'eval_steps_per_second': 9.7, 'epoch': 2.0}


Checkpoint destination directory /Users/kenny_jung/aiffel/data/hf_transformer/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.3262, 'grad_norm': 1.7467845678329468, 'learning_rate': 5.475671750181555e-06, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.5077135562896729, 'eval_accuracy': 0.8357843137254902, 'eval_f1': 0.8842832469775476, 'eval_runtime': 5.2442, 'eval_samples_per_second': 77.801, 'eval_steps_per_second': 9.725, 'epoch': 3.0}
{'train_runtime': 523.16, 'train_samples_per_second': 21.034, 'train_steps_per_second': 2.632, 'train_loss': 0.36601108836365853, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.36601108836365853, metrics={'train_runtime': 523.16, 'train_samples_per_second': 21.034, 'train_steps_per_second': 2.632, 'train_loss': 0.36601108836365853, 'epoch': 3.0})

In [29]:
trainer.evaluate(hf_test_dataset)

  0%|          | 0/216 [00:00<?, ?it/s]

{'eval_loss': 0.5508893132209778,
 'eval_accuracy': 0.8226086956521739,
 'eval_f1': 0.8685567010309279,
 'eval_runtime': 20.8552,
 'eval_samples_per_second': 82.713,
 'eval_steps_per_second': 10.357,
 'epoch': 3.0}

In [3]:
import torch
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())

True
True


In [2]:
import torch

mps_device = torch.device("mps")

# MPS 장치에 바로 tensor를 생성합니다.
x = torch.ones(5, device=mps_device)
# 또는
x = torch.ones(5, device="mps")

# GPU 상에서 연산을 진행합니다.
y = x * 2

# 또는, 다른 장치와 마찬가지로 MPS로 이동할 수도 있습니다.
model = YourFavoriteNet()  # 어떤 모델의 객체를 생성한 뒤,
model.to(mps_device)       # MPS 장치로 이동합니다.

# 이제 모델과 텐서를 호출하면 GPU에서 연산이 이뤄집니다.
pred = model(x)

True


In [5]:
print(torch.__version__)

2.2.1


In [8]:
print(f"PyTorch version:{torch.__version__}") # 1.12.1 이상
print(f"MPS 장치를 지원하도록 build 되었는지: {torch.backends.mps.is_built()}") # True 여야 합니다.
print(f"MPS 장치가 사용 가능한지: {torch.backends.mps.is_available()}") # True 여야 합니다.
!python -c 'import platform;print(platform.platform())'


PyTorch version:2.2.1
MPS 장치를 지원하도록 build 되었는지: True
MPS 장치가 사용 가능한지: True
zsh:1: command not found: python
