In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [5]:
# as Data Scientist
import pandas as pd
import numpy as np

from os import listdir

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFBertModel

# Layers & Models
# from keras.layers import (
#     Input,
#     Dense,
#     Embedding,
#     Dropout, SpatialDropout1D,
#     Concatenate, Reshape,
#     GlobalMaxPooling1D, GlobalAveragePooling1D,
#     Conv1D,
#     LSTM, GRU,
#     Bidirectional
# )
# from keras.models import Model

# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.initializers import Constant
# from tensorflow.keras.losses import BinaryCrossentropy

# import tensorflow as tf
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
listdir('drive/MyDrive/Public/DS102 - Machine Learning/')

['code', 'refs', 'data']

In [5]:
!cd drive/MyDrive/Public/

In [6]:
listdir()

['.config', 'drive', 'sample_data']

# Read Data

In [8]:
train = pd.read_csv("../data/train.csv")
dev = pd.read_csv("../data/dev.csv")
test = pd.read_csv("../data/test.csv")

# job labels
labels = pd.read_csv("../data/job_labels.csv")

In [13]:
X_train = train['description']
X_dev = dev['description']
X_test = test['description']

y_train = train['industry']
y_dev = dev['industry']
y_test = test['industry']

# Make one-hot labels (for multi-labels)

In [9]:
labels = labels['0']
num_labels = len(labels)

In [10]:
def create_onehot_labels(y):
    full_labels = []

    for val in y:
        seperated = val.split(' / ')
        lbl = np.zeros(num_labels)

        for i in range(0, num_labels):
            if labels[i] in seperated:
                lbl[i] = 1
        
        full_labels.append(lbl)

    return full_labels

In [11]:
y_train_onehot = create_onehot_labels(y_train)
y_dev_onehot = create_onehot_labels(y_dev)
y_test_onehot = create_onehot_labels(y_test)

In [12]:
X_train = list(X_train.values)
X_dev = list(X_dev.values)
X_test = list(X_test.values)

# Hyperparameters

In [13]:
MODEL = 'vinai/phobert-base' # Danh sách pre-trained model ở đây: https://huggingface.co/models
MAX_LEN = 200
BATCH_SIZE = 32
EPOCH = 2

UNITS = 100

# Input


In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LEN)
dev_encodings = tokenizer(X_dev, truncation=True, padding=True, max_length=MAX_LEN)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_LEN)

Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
import tensorflow as tf

train_features = {x: tf.convert_to_tensor(train_encodings[x], dtype=tf.float32) for x in tokenizer.model_input_names}
dev_features = {x: tf.convert_to_tensor(dev_encodings[x], dtype=tf.float32) for x in tokenizer.model_input_names}
test_features = {x: tf.convert_to_tensor(test_encodings[x], dtype=tf.float32) for x in tokenizer.model_input_names}

# Model

In [16]:
# Input layer
input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='token_type_ids')
attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

# Embedding layer
encoder = TFBertModel.from_pretrained(MODEL)
embedding = encoder(
    {
        'input_ids': input_ids, 
        # 'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
        
    }
)[0]

# Dropout layer
a = SpatialDropout1D(0.2)(embedding)

# Bidirectional
x = Bidirectional(GRU(UNITS, return_sequences=True))(a)
x = Conv1D(int(UNITS/2), kernel_size=2, padding="same", kernel_initializer="he_uniform")(x)
    
y = Bidirectional(LSTM(UNITS, return_sequences=True))(a)
y = Conv1D(int(UNITS/2), kernel_size=2, padding="same", kernel_initializer="he_uniform")(y)

# Avg & Max Pooling
avg_pool1 = GlobalAveragePooling1D()(x)
max_pool1 = GlobalMaxPooling1D()(x)
    
avg_pool2 = GlobalAveragePooling1D()(y)
max_pool2 = GlobalMaxPooling1D()(y)
    
# Concatnate
x = Concatenate(axis=-1)([avg_pool1, max_pool1, avg_pool2, max_pool2])

# Dropout
x = Dropout(0.5)(x)

# Dense to classify
out = Dense(num_labels, activation = "sigmoid")(x)

# Model
model = Model(
    inputs=[
        input_ids,
        # token_type_ids,
        attention_mask
    ],
    outputs=out,
)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading tf_model.h5:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some layers from the model checkpoint at vinai/phobert-base were not used when initializing TFBertModel: ['lm_head', 'roberta']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Compile
loss = BinaryCrossentropy(from_logits=False)
optimizer = Adam(lr=1e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])



In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 200)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 200)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  134998272   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 200,                                           

In [19]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings("ignore")

callback = EarlyStopping(monitor='val_loss', patience=2)
model.fit(
    train_features,
    np.array(y_train_onehot), 
    validation_data=(dev_features, np.array(y_dev_onehot)), 
    batch_size=BATCH_SIZE,
    epochs=EPOCH
)

Epoch 1/2




Epoch 2/2


<keras.callbacks.History at 0x7eff10275f60>