In [1]:
import tensorflow as tf
from tensorflow.keras import layers as tl
from tensorflow.data import Dataset
import tensorflow_datasets as tfds
from pathlib import Path
import numpy as np

tf.random.set_seed = 42

In [2]:
# NMT 

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file('spa-eng.zip', origin=url, cache_dir='datasets', extract=True)
text = (Path(path).with_name('spa-eng') / 'spa.txt').read_text()

text = text.replace('¡', '').replace('¿','')
pairs = [line.split('\t') for line in text.splitlines()]
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

In [3]:
sentences_en = sentences_en[:11_000]
sentences_es = sentences_es[:11_000]

In [4]:
vocab_size = 1000
max_length = 50

text_vec_layer_en = tl.TextVectorization(vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tl.TextVectorization(vocab_size, output_sequence_length=max_length)

text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f'startofseq {s} endofseq' for s in sentences_es])

print(text_vec_layer_en.get_vocabulary()[:10])
print(text_vec_layer_es.get_vocabulary()[:10])

2023-11-30 10:36:50.803395: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-11-30 10:36:50.803426: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-11-30 10:36:50.803432: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-11-30 10:36:50.803478: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-30 10:36:50.803496: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-11-30 10:36:51.873080: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']
['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']


In [5]:
X_train = tf.constant(sentences_en[:10_000])
X_valid = tf.constant(sentences_en[10_000:])

X_train_dec = tf.constant([f'startofseq {s}' for s in sentences_es[:10_000]])
X_valid_dec = tf.constant([f'startofseq {s}' for s in sentences_es[10_000:]])

Y_train = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[:10_000]])
Y_valid = text_vec_layer_es([f'{s} endofseq' for s in sentences_es[10_000:]])


In [6]:


class PositionalEncoding(tl.Layer):
    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 ==0, 'embed size must be even number'
        p, i = np.meshgrid(np.arange(max_length), 2 * np.arange(embed_size // 2))
        pos_emb = np.empty((1, max_length, embed_size)) # 1은 사용 시, batch로 값이 들어 왔으르 때 broad casting을 하기 위해 넣어준다. 1 X max_n_sequence X d_emb 라고 보면 된다.
        pos_emb[0, :, ::2] = np.sin(p / (10_000 ** (i / embed_size))).T
        pos_emb[0, :, 1::2] = np.cos(p / (10_000 ** (i / embed_size))).T
        self.pos_encodings = tf.constant(pos_emb.astype(self.dtype)) # numpy.empty는 기본적으로 float64로 만들기 때문에 float32 이하를 tensorflow에서 사용하기 위해서는 type 변경이 필오하다.
        self.supports_masking = True # mask propabation for next layer

    def call(self, inputs):
        n_sequence = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :n_sequence] # shape: (batch, n_sequence, d_emb)


def transformer_model(vocab_size, embed_size, max_seq_length, Nx, num_heads, n_units, dropout_rate=0.1):
    enc_inputs = tl.Input(shape=[], dtype=tf.string)
    dec_inputs = tl.Input(shape=[], dtype=tf.string)

    enc_input_ids = text_vec_layer_en(enc_inputs)
    dec_input_ids = text_vec_layer_es(dec_inputs)

    
    # encoding
    enc_embedding = tl.Embedding(vocab_size, embed_size, mask_zero=True)(enc_input_ids)
    Z = PositionalEncoding(max_seq_length, embed_size)(enc_embedding)
    for _ in range(Nx):
        skip = Z
        attn_layer = tl.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
        Z = attn_layer(Z, value=Z)
        Z = tl.LayerNormalization()(tl.Add()([Z, skip]))
        skip = Z
        Z = tl.Dense(n_units, activation='relu')(Z)
        Z = tl.Dense(embed_size)(Z)
        Z = tl.Dropout(dropout_rate)(Z)
        Z = tl.LayerNormalization()(tl.Add()([Z, skip])) # use Add to propagate mask


    # decoding
    encoder_outputs = Z
    dec_embedding = tl.Embedding(vocab_size, embed_size, mask_zero=True)(dec_input_ids)
    Z = PositionalEncoding(max_seq_length, embed_size)(dec_embedding)

    for _ in range(Nx):
        skip = Z
        attn_layer = tl.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
        Z = attn_layer(Z, value=Z, use_causal_mask=True)
        Z = tl.LayerNormalization()(tl.Add()([Z, skip])) # use Add to propagate mask
        skip = Z
        attn_layer = tl.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
        Z = attn_layer(Z, value=encoder_outputs)
        Z = tl.LayerNormalization()(tl.Add()([Z, skip]))
        skip = Z
        Z = tl.Dense(n_units, activation='relu')(Z)
        Z = tl.Dense(embed_size)(Z)
        Z = tl.LayerNormalization()(tl.Add()([Z, skip]))

    Y_proba = tl.Dense(vocab_size, activation='softmax')(Z)

    model = tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=[Y_proba])

    return model


embed_size = 128
n_units = 128
max_seq_length = 50
num_heads = 2
dropout_rate = 0.1



model = transformer_model(vocab_size, embed_size, max_seq_length, num_heads, n_units, dropout_rate)



In [7]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [8]:
dec_sample = X_train_dec[:1]
dec_input_ids = text_vec_layer_es(dec_sample)
dec_embedding = tl.Embedding(vocab_size, embed_size, mask_zero=True)(dec_input_ids)
Z = PositionalEncoding(max_seq_length, embed_size)(dec_embedding)
attn_layer = tl.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
Z = attn_layer(Z, value=Z, use_causal_mask=True)

for i in range(10):
    print(Z[0][i][:10])

tf.Tensor(
[-0.06968934 -0.1425012   0.19917154 -0.07937535 -0.08110089  0.22989309
  0.03209518 -0.025309   -0.04700687  0.02382866], shape=(10,), dtype=float32)
tf.Tensor(
[-0.07010315 -0.1445272   0.20024146 -0.08195658 -0.07090619  0.23147456
  0.04827744 -0.03117206 -0.04578125  0.05138787], shape=(10,), dtype=float32)
tf.Tensor(
[-0.07008444 -0.14992754  0.19950521 -0.08048613 -0.06253864  0.23308489
  0.06137362 -0.03492396 -0.04631061  0.07919947], shape=(10,), dtype=float32)
tf.Tensor(
[-0.01793756 -0.07520054  0.13782263 -0.03132938 -0.10008351  0.18662068
 -0.02441441 -0.06039337 -0.02696255  0.06822544], shape=(10,), dtype=float32)
tf.Tensor(
[-0.01793756 -0.07520054  0.13782263 -0.03132938 -0.10008351  0.18662068
 -0.02441441 -0.06039337 -0.02696255  0.06822544], shape=(10,), dtype=float32)
tf.Tensor(
[-0.01793756 -0.07520054  0.13782263 -0.03132938 -0.10008351  0.18662068
 -0.02441441 -0.06039337 -0.02696255  0.06822544], shape=(10,), dtype=float32)
tf.Tensor(
[-0.0179375

In [9]:
skip = True
if not skip:
    history = model.fit((X_train, X_train_dec), Y_train, epochs=1, validation_data=((X_valid, X_valid_dec), Y_valid))

In [13]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
result = classifier(['The actors were very convicing'])
result

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


[{'label': 'POSITIVE', 'score': 0.9968543648719788}]

In [15]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = 'huggingface/distilbert-base-uncased-finetuned-mnli'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [18]:
token_ids = tokenizer(
    ['I like soccer. [SEP] We all love soccer!', 
     'Joe lived for a very long time. [SEP] Joe is old.'],
    padding=True, return_tensors='tf')

token_ids

{'input_ids': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[ 101, 1045, 2066, 4715, 1012,  102, 2057, 2035, 2293, 4715,  999,
         102,    0,    0,    0],
       [ 101, 3533, 2973, 2005, 1037, 2200, 2146, 2051, 1012,  102, 3533,
        2003, 2214, 1012,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [20]:
outputs = model(token_ids)
outputs.logits

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-2.112382  ,  1.1786789 ,  1.410101  ],
       [-0.01478315,  1.096246  , -0.99199456]], dtype=float32)>

In [22]:
Y_probas = tf.keras.activations.softmax(outputs.logits)
Y_probas

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.01619701, 0.4352357 , 0.5485673 ],
       [0.22656   , 0.6881721 , 0.08526792]], dtype=float32)>

In [None]:
# additional training
sentences = [('Sky is blue', 'Sky is red'), ('I love her', 'She loves me')]
X_train = tokenizer(sentences, padding=True, return_tensors='tf').data
y_train = tf.constant([0, 2]) # contradiction, neutral
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='nadam', metrics=['accuracy'])


In [24]:
history = model.fit(X_train, y_train, epochs=2)

Epoch 1/2
Epoch 2/2
