In [1]:
from global_vars import *
from transformers import AutoModelForMultipleChoice, AutoTokenizer
from datasets import Dataset, load_from_disk
from tqdm import tqdm
import pandas as pd
import wandb
import pickle
import numpy as np
from wandb.keras import WandbMetricsLogger, WandbCallback
import tensorflow as tf
import os
from tensorflow.keras import Model # if only machine learning were this easy :P
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#import any other libraries you want here:
from tensorflow.keras.layers import Dense, SpatialDropout1D, LSTM, Embedding, GlobalAveragePooling1D
from tensorflow.keras.activations import softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback, ModelCheckpoint

In [63]:
run = wandb.init(
    project="mbti_bert_mlm",
    config=dict,
    entity="mbtipredictor"
)

In [4]:
df = pd.read_csv("converted_new_for_custom.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106067 entries, 0 to 106066
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   posts    106067 non-null  object
 1   type     106067 non-null  object
 2   new_col  106067 non-null  int64 
 3   pad      106067 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.2+ MB


In [5]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 500
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['posts'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 256681 unique tokens.


In [6]:
X = tokenizer.texts_to_sequences(df['posts'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (106067, 500)


In [7]:
Y = pd.get_dummies(df['type']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (106067, 16)


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(95460, 500) (95460, 16)
(10607, 500) (10607, 16)


In [9]:
X_train, X_val, Y_train, Y_val = train_test_split(X_train,Y_train, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(85914, 500) (85914, 16)
(9546, 500) (9546, 16)


In [56]:
xt= X_train[:500]
xv = X_val[:50]
xte = X_test[:50]
yt= Y_train[:500]
yv = Y_val[:50]
yte = Y_test[:50]

print(len(xt))
print(len(xv))

500
50


In [57]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, name = "lstm_1", return_sequences=True))
model.add(LSTM(50, dropout=0.1, recurrent_dropout=0.1, name = "lstm_2", return_sequences=True))
model.add(GlobalAveragePooling1D(name = "globalaveragepooling1d"))
model.add(Dense(16, activation='softmax', name = "dense"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])


model.summary()



Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 500, 100)          5000000   
                                                                 
 spatial_dropout1d_8 (Spatia  (None, 500, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 500, 100)          80400     
                                                                 
 lstm_2 (LSTM)               (None, 500, 50)           30200     
                                                                 
 globalaveragepooling1d (Glo  (None, 50)               0         
 balAveragePooling1D)                                            
                                                                 
 dense (Dense)               (None, 16)               

In [66]:
# Prepare the training dataset.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# Prepare the validation dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, Y_val))
val_dataset = val_dataset.batch(batch_size)

print(len(train_dataset))
print(len(val_dataset))


8592
955


In [64]:
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss_value = loss_fn(y, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    train_acc_metric.update_state(y, logits)
    return loss_value

@tf.function
def test_step(x, y):
    val_logits = model(x, training=False)
    val_loss_value = loss_fn(y, val_logits)
    val_acc_metric.update_state(y, val_logits)
    return val_loss_value


print("start")

for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)

        # Log every 1 batches.
        if step % 1 == 0:
            train_acc = train_acc_metric.result()
            #train_acc_metric.reset_states()
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))
            wandb.log({"Training Loss": loss_value, "Train Acc": train_acc})
        # Run a validation loop each 4 steps
        if step % 4 == 0:
            for x_batch_val, y_batch_val in val_dataset:
                val_loss = test_step(x_batch_val, y_batch_val)
            val_acc = val_acc_metric.result()
            #val_acc_metric.reset_states()
            print(
                "Validation loss (for 200 batch) at step %d: %.4f"
                % (step, float(val_loss))
            )
            wandb.log({"Validation Loss": val_loss, "Validation Acc": val_acc})
    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))
    wandb.log({"Train Acc Epoch": train_acc})
    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)

    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    wandb.log({"Validation Acc Epoch": val_acc})

start

Start of epoch 0
Training loss (for one batch) at step 0: 2.6202
Seen so far: 10 samples
Validation loss (for 200 batch) at step 0: 1.8904
Training loss (for one batch) at step 1: 2.2767
Seen so far: 20 samples
Training loss (for one batch) at step 2: 2.1320
Seen so far: 30 samples
Training loss (for one batch) at step 3: 2.4526
Seen so far: 40 samples
Training loss (for one batch) at step 4: 2.3266
Seen so far: 50 samples
Validation loss (for 200 batch) at step 4: 2.0195
Training loss (for one batch) at step 5: 2.5635
Seen so far: 60 samples
Training loss (for one batch) at step 6: 1.9469
Seen so far: 70 samples
Training loss (for one batch) at step 7: 1.9707
Seen so far: 80 samples
Training loss (for one batch) at step 8: 2.3249
Seen so far: 90 samples
Validation loss (for 200 batch) at step 8: 1.9931
Training loss (for one batch) at step 9: 2.3452
Seen so far: 100 samples
Training loss (for one batch) at step 10: 2.1195
Seen so far: 110 samples
Training loss (for one batch) a

In [46]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 2.773
  Accuracy: 0.077


In [85]:
print(accr)

[2.7727158069610596, 0.027246158570051193]


In [None]:
!mkdir -p saved_model
model.save('saved_model/my_model')


A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file saved_model already exists.
Error occurred while processing: saved_model.


INFO:tensorflow:Assets written to: saved_model/my_model\assets


INFO:tensorflow:Assets written to: saved_model/my_model\assets


In [65]:
run.finish()

VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.039970…

0,1
Train Acc,▂▂▂▁▂▃▄▄█▅▃▃▄▃▄▄▂▅▅▆▆▆▆▅▃▅▄▄▄▄▄▄▇▄▄▅▅▅▅▅
Train Acc Epoch,▁▃█▄▇
Training Loss,▆▄▆▆▄▃▂█▅█▆▅▅▇▅▇█▅▅▁▅▇▆▆▂▄▅▄█▂▅▅▄▇▅▂▅▃▄▃
Validation Acc,▇▃▃▄▅▅▅▆▃▃▃▂▂▁▁▂▇▇▇▇▇▇▇▇█▇▇▇▇█▇▇▇▇▇▇▇▇▇▇
Validation Acc Epoch,▅▁███
Validation Loss,▃▄▃▂▂▁▁▁█▇▄███▆▄▂▂▂▂▁▁▁▂▃▃▂▃▃▃▂▂▂▂▂▂▂▂▂▂

0,1
Train Acc,0.224
Train Acc Epoch,0.224
Training Loss,2.32361
Validation Acc,0.26154
Validation Acc Epoch,0.26143
Validation Loss,1.85496
