In [1]:
# !pip install transformers -q
# !pip install sentencepiece -q

[K     |████████████████████████████████| 2.0MB 6.6MB/s 
[K     |████████████████████████████████| 3.2MB 23.0MB/s 
[K     |████████████████████████████████| 890kB 38.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 5.3MB/s 
[?25h

In [2]:
import os
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

SEED = 42

In [3]:
# Only surface TensorFlow log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# NOTE(review): this flag is read by TensorFlow Hub; nothing in the visible
# notebook imports tensorflow_hub — confirm it is still needed.
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

# Look the address up with .get(): on a runtime without a TPU the variable is
# absent, and indexing os.environ directly would raise KeyError before the
# GPU / CPU branches below could ever be reached.
tpu_address = os.environ.get('COLAB_TPU_ADDR')

if tpu_address:
  # Connect to the Colab TPU workers and build a strategy that spans them.
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.config.list_physical_devices('GPU'):
  # list_physical_devices replaces the deprecated tf.test.is_gpu_available();
  # an empty list is falsy, so this branch runs only when a GPU is visible.
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  # No accelerator found: stop early instead of starting a very slow run.
  raise ValueError('Running on CPU is not recomended.')

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Using TPU


In [4]:
# Configuration
EPOCHS = 6                 # passed as `epochs` to model.fit
MAX_LEN = 256 # 512        # token length every text is padded / truncated to

# `bsize` is the per-replica batch size; the batch size handed to tf.data is
# scaled by the number of replicas the distribution strategy keeps in sync.
bsize = 32
BATCH_SIZE = strategy.num_replicas_in_sync * bsize

# Sentinel passed to Dataset.prefetch so tf.data picks the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# def init(n, model):
#   df = pd.read_pickle('wsb_no_topics.pkl')

#   with open(f'wsb_topics_{n}.json', 'r') as fp:
#     topics = json.load(fp)

#   df["topic"] = -1

#   for i in range(0, len(topics)):
#       df.loc[topics[str(i)], "topic"] = i

#   n_classes = len(topics)

In [None]:
# df = pd.read_pickle('wsb_no_topics.pkl')

# with open(f'wsb_topics_{n}.json', 'r') as fp:
#   topics = json.load(fp)

# df["topic"] = -1

# for i in range(0, len(topics)):
#     df.loc[topics[str(i)], "topic"] = i

# n_classes = len(topics)

In [None]:
# AUTO = tf.data.experimental.AUTOTUNE

# # Configuration
# EPOCHS = 6
# bsize = 32
# BATCH_SIZE = bsize * strategy.num_replicas_in_sync
# MAX_LEN = 256 # 512
# MODEL = my_models[3]

# print('Selected model: ', MODEL)

In [None]:
# X = df.selftext.values
# y = df.topic.values

# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df.topic, test_size=0.2, random_state=SEED)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=SEED)

In [5]:
def regular_encode(texts, tokenizer, maxlen=MAX_LEN):
    """Tokenize `texts` and return their input ids as one NumPy array.

    Args:
        texts: the batch of texts handed to `tokenizer.batch_encode_plus`.
        tokenizer: object exposing `batch_encode_plus` (the notebook passes a
            HuggingFace tokenizer).
        maxlen: length every sequence is truncated / padded to.

    Returns:
        `np.array` built from the tokenizer's `'input_ids'` entry. Attention
        masks and token-type ids are explicitly not requested.
    """
    encode_options = dict(
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        return_attention_mask=False,
        return_token_type_ids=False,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded['input_ids']
    return np.array(token_ids)

In [6]:
def build_model(transformer, n_classes, loss='sparse_categorical_crossentropy', max_len=MAX_LEN):
    """Put a softmax classification head on top of `transformer` and compile it.

    Args:
        transformer: callable Keras-compatible encoder; element ``[0]`` of its
            output is indexed as ``[:, 0, :]``, i.e. the first position of
            every sequence is used as the sequence representation.
        n_classes: number of units in the softmax output layer.
        loss: loss passed to ``model.compile``.
        max_len: length of the int32 token-id input.

    Returns:
        A compiled ``tf.keras.Model`` mapping token ids of shape
        ``(batch, max_len)`` to class probabilities of shape
        ``(batch, n_classes)``.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the first position of each sequence as the pooled feature.
    cls_token = sequence_output[:, 0, :]
    x = tf.keras.layers.Dropout(0.3)(cls_token)
    out = tf.keras.layers.Dense(n_classes, activation='softmax')(x)
    model = tf.keras.Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the documented argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss=loss,
        metrics=['accuracy'],
    )
    return model

In [7]:
def create_datasets(X_train, X_val, X_test, y_train, y_val, y_test, MODEL):
  """Tokenize the three splits and wrap them in tf.data pipelines.

  Args:
    X_train, X_val, X_test: texts of each split (array, Series or list).
    y_train, y_val: integer labels for the training / validation texts.
    y_test: accepted for signature compatibility; the test pipeline is
      built from the inputs only, so this value is not used here.
    MODEL: checkpoint name passed to ``AutoTokenizer.from_pretrained``.

  Returns:
    ``(train_dataset, valid_dataset, test_dataset, n_steps)`` where
    ``train_dataset`` repeats indefinitely and ``n_steps`` is the number of
    training batches to draw per epoch (at least 1).
  """
  tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)

  # np.asarray(...).tolist() yields the same list as ndarray.tolist() but also
  # accepts plain lists and pandas Series.
  x_train = regular_encode(np.asarray(X_train).tolist(), tokenizer, maxlen=MAX_LEN)
  x_valid = regular_encode(np.asarray(X_val).tolist(), tokenizer, maxlen=MAX_LEN)
  x_test = regular_encode(np.asarray(X_test).tolist(), tokenizer, maxlen=MAX_LEN)
  y_train = np.asarray(y_train).tolist()
  y_valid = np.asarray(y_val).tolist()

  # Training stream never ends (repeat), so the caller must bound each epoch
  # with `steps_per_epoch=n_steps`.
  train_dataset = (
      tf.data.Dataset
      .from_tensor_slices((x_train, y_train))
      .repeat()
      .shuffle(2048, seed=SEED)
      .batch(BATCH_SIZE)
      .prefetch(AUTO)
  )

  # Validation batches are cached after the first pass.
  valid_dataset = (
      tf.data.Dataset
      .from_tensor_slices((x_valid, y_valid))
      .batch(BATCH_SIZE)
      .cache()
      .prefetch(AUTO)
  )

  # Inputs only: labels are compared against predictions by the caller.
  test_dataset = (
      tf.data.Dataset
      .from_tensor_slices(x_test)
      .batch(BATCH_SIZE)
  )

  # Floor division gives 0 when the training split is smaller than one batch;
  # clamp to 1 so `steps_per_epoch` is always a usable value.
  n_steps = max(1, x_train.shape[0] // BATCH_SIZE)

  return train_dataset, valid_dataset, test_dataset, n_steps

In [15]:
def main(n):
  """Fine-tune MODEL on the `n`-topic labelling and print a test report.

  Reads 'relationships_no_labels.pkl' and 'relationships_topics_{n}.json',
  assigns a topic id to every listed row, splits the labelled rows into
  train / validation / test (stratified, seeded with SEED), trains for EPOCHS
  epochs under the global distribution strategy and prints
  sklearn's classification report for the test split.

  Args:
    n: suffix of the topics file to load (the notebook calls this with the
      number of topics).
  """
  # NOTE(review): unpickling can execute arbitrary code — only load this file
  # from a trusted source.
  df = pd.read_pickle('relationships_no_labels.pkl')

  with open(f'relationships_topics_{n}.json', 'r') as fp:
    topics = json.load(fp)

  n_classes = len(topics)

  # Keys are the stringified topic ids "0".."n_classes-1"; each value is used
  # as a row selector for df.loc (presumably a list of index labels).
  df["topic"] = -1
  for i in range(n_classes):
    df.loc[topics[str(i)], "topic"] = i

  # Rows that no topic lists keep the placeholder -1, which is not a valid
  # target for sparse categorical cross-entropy over n_classes outputs.
  # Leave them out instead of feeding an out-of-range label to the model.
  labelled = df[df["topic"] >= 0]
  n_unassigned = len(df) - len(labelled)
  if n_unassigned:
    print(f'Skipping {n_unassigned} rows without a topic assignment')

  X = labelled.selftext.values
  y = labelled.topic.values

  # 80/20 train/test, then 80/20 train/validation inside the training part.
  X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=SEED)

  train_dataset, valid_dataset, test_dataset, n_steps = create_datasets(X_train, X_val, X_test, y_train, y_val, y_test, MODEL)

  # Variables must be created inside the strategy scope to be distributed.
  with strategy.scope():
    transformer_layer = transformers.TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, n_classes, max_len=MAX_LEN)

  train_history = model.fit(
      train_dataset,
      steps_per_epoch=n_steps,
      validation_data=valid_dataset,
      epochs=EPOCHS
  )

  y_pred = model.predict(test_dataset, verbose=1)

  # One argmax over the class axis instead of a Python-level loop per row.
  predictions = np.argmax(y_pred, axis=1)
  print(f'model: {MODEL}, epochs: {EPOCHS}, sample_len: {MAX_LEN}, batch_size: {bsize}')
  print(classification_report(y_test, predictions))

In [9]:
# Checkpoints that can be selected by index for the experiments.
my_models = [
    'bert-base-uncased',
    'roberta-base',
    'jplu/tf-xlm-roberta-base',
    'distilbert-base-uncased-finetuned-sst-2-english',
]

# Checkpoint used by the tokenizer and the encoder in the cells that read MODEL.
MODEL = my_models[0]

In [17]:
# Train and evaluate once per topic granularity: 5, 10, 15, 20, 25 and 30.
for n_topics in range(5, 31, 5):
  main(n_topics)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      4458
           1       0.68      0.77      0.72      3702
           2       0.78      0.72      0.75      1657
           3       0.76      0.79      0.78      2634
           4       0.86      0.77      0.81      6155

    accuracy                           0.78     18606
   macro avg       0.77      0.77      0.77     18606
weighted avg       0.78      0.78      0.78     18606



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.71      0.76      0.74      3072
           1       0.71      0.78      0.74      3882
           2       0.59      0.61      0.60       475
           3       0.70      0.73      0.72       897
           4       0.79      0.61      0.69      3784
           5       0.67      0.75      0.71      1859
           6       0.68      0.63      0.65      1329
           7       0.50      0.47      0.48       723
           8       0.59      0.74      0.66       724
           9       0.67      0.66      0.66      1861

    accuracy                           0.70     18606
   macro avg       0.66      0.67      0.66     18606
weighted avg       0.70      0.70      0.69     18606



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.64      0.61      0.62      1930
           1       0.58      0.79      0.67      2295
           2       0.57      0.54      0.55       287
           3       0.60      0.71      0.65       736
           4       0.69      0.55      0.61      2555
           5       0.67      0.62      0.65       906
           6       0.64      0.65      0.64      1083
           7       0.55      0.31      0.40       436
           8       0.64      0.64      0.64       669
           9       0.69      0.45      0.55       696
          10       0.77      0.66      0.71      1856
          11       0.65      0.71      0.68      2097
          12       0.65      0.74      0.69       257
          13       0.59      0.68      0.63      1958
          14       0.77      0.64      0.70       845

    a

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.57      0.46      0.51       976
           1       0.59      0.66      0.62      1066
           2       0.63      0.42      0.51       171
           3       0.64      0.67      0.66       499
           4       0.64      0.53      0.58      2156
           5       0.63      0.58      0.60      1087
           6       0.57      0.74      0.64       954
           7       0.52      0.45      0.48       528
           8       0.63      0.71      0.66       595
           9       0.49      0.48      0.49       503
          10       0.64      0.76      0.69       940
          11       0.73      0.51      0.60      1758
          12       0.73      0.70      0.72       219
          13       0.47      0.67      0.55      1261
          14       0.73      0.51      0.60       632
      

  _warn_prf(average, modifier, msg_start, len(result))
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.69      0.31      0.43       825
           1       0.56      0.47      0.51       320
           2       0.56      0.40      0.47       155
           3       0.57      0.78      0.66       453
           4       0.57      0.47      0.52      1735
           5       0.49      0.52      0.50       452
           6       0.60      0.58      0.59       831
           7       0.60      0.38      0.46       183
           8       0.61      0.75      0.68       552
           9       0.45      0.11      0.18       127
          10       0.64      0.58      0.61       778
          11       0.67      0.61      0.64      1389
          12       0.75      0.57      0.65       153
          13       0.41      0.56      0.47       546
          14       0.63      0.61      0.62       565
      

  _warn_prf(average, modifier, msg_start, len(result))
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
model: bert-base-uncased, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.56      0.36      0.44       639
           1       0.49      0.45      0.47       253
           2       0.62      0.44      0.52       135
           3       0.65      0.70      0.67       559
           4       0.51      0.44      0.47      1397
           5       0.56      0.53      0.55       347
           6       0.63      0.51      0.57       555
           7       0.60      0.45      0.52       148
           8       0.64      0.66      0.65       460
           9       0.32      0.24      0.27       121
          10       0.61      0.56      0.59       521
          11       0.58      0.63      0.61      1342
          12       0.80      0.23      0.36        35
          13       0.45      0.53      0.48       510
          14       0.68      0.63      0.65       495
      

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Build the model inside the distribution strategy scope so its variables are
# created on the accelerator replicas.
with strategy.scope():
    transformer_layer = transformers.TFAutoModel.from_pretrained(MODEL)
    # build_model takes the number of output classes as a required positional
    # argument; omitting it raises TypeError.
    # NOTE(review): `n_classes` must already exist in the kernel — the only
    # module-level assignment visible in this notebook sits in a commented-out
    # cell, so confirm where it is expected to come from.
    model = build_model(transformer_layer, n_classes, max_len=MAX_LEN)
    
model.summary()

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['dropout_19', 'pre_classifier', 'classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_distil_bert_model_4 (TFDi TFBaseModelOutput(last_hi 66362880  
_________________________________________________________________
tf.__operators__.getitem_4 ( (None, 768)               0         
_________________________________________________________________
dropout_136 (Dropout)        (None, 768)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 3845      
Total params: 66,366,725
Trainable params: 66,366,725
Non-trainable params: 0
_________________________________________________________________
CPU times: user 5.89 s, sys: 3.42 s, total: 9.32 s
Wall time: 22.7 s


In [None]:
# Tokenize the three text splits into fixed-length token-id arrays.
# NOTE(review): no earlier active cell assigns `X_train`, `X_val`, `X_test`,
# `y_train`, `y_val` or `y_test` at module level, and `tokenizer` is only
# created in a later cell — confirm the intended execution order.
x_train = regular_encode(X_train.tolist(), tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(X_val.tolist(), tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(X_test.tolist(), tokenizer, maxlen=MAX_LEN)
# Labels are converted to plain lists. `y_train` and `y_test` are rebound in
# place, so running this cell a second time fails (a list has no .tolist()).
y_train = y_train.tolist()
y_valid = y_val.tolist()
y_test = y_test.tolist()

In [None]:
# Training stream: endless (repeat), shuffled through a 2048-element buffer,
# batched, and prefetched so input preparation overlaps with training.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048, seed=SEED)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation stream: single pass, batches cached after the first iteration.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test stream: inputs only, in original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
# Load the tokenizer published under the checkpoint name held in MODEL.
# NOTE(review): the cell that encodes the splits with `tokenizer` appears
# earlier in the notebook, so this cell has to be executed before it.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)

In [None]:
# The training dataset repeats forever, so an epoch is defined explicitly as
# the number of whole batches that fit into the encoded training set.
n_steps = x_train.shape[0] // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Run the trained model over the test inputs; verbose=1 shows a progress bar.
# The result is reduced with np.argmax per row in the next cell.
y_pred = model.predict(test_dataset, verbose=1)



In [None]:
# Index of the highest-scoring class for every row of the prediction matrix.
predictions = list(np.argmax(y_pred, axis=1))

# Header line recording the run settings, then per-class precision/recall/F1.
print(f'model: {MODEL}, epochs: {EPOCHS}, sample_len: {MAX_LEN}, batch_size: {bsize}')
print(classification_report(y_test, predictions))

model: distilbert-base-uncased-finetuned-sst-2-english, epochs: 6, sample_len: 256, batch_size: 32
              precision    recall  f1-score   support

           0       0.75      0.67      0.71      1455
           1       0.71      0.78      0.74      2331
           2       0.82      0.86      0.84      2872
           3       0.78      0.66      0.72      1704
           4       0.71      0.73      0.72       810

    accuracy                           0.76      9172
   macro avg       0.75      0.74      0.74      9172
weighted avg       0.76      0.76      0.76      9172



In [None]:
# import seaborn as sns
# from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt

# encoded_classes = list(range(20))
# test_topics = y_test
# pred_topics = predictions = [np.argmax(p) for p in y_pred]
# confusion_mat = confusion_matrix(y_true = test_topics, y_pred = pred_topics, labels=list(encoded_classes))
# df_cm = pd.DataFrame(confusion_mat, index = list(encoded_classes),columns = list(encoded_classes))
# plt.rcParams['figure.figsize'] = (10,10)
# sns.heatmap(df_cm)