In [1]:
import pickle
from tqdm import tqdm

# Import custom helper libraries
import os
import sys

src_path = os.path.abspath(os.path.join("../src"))
if src_path not in sys.path:
    sys.path.append(src_path)

import data.helpers as data_helpers
import visualization.helpers as viz_helpers

# Maths modules
import numpy as np
import pandas as pd
import tensorflow as tf

# Viz modules
import plotly.express as px

# Render for export
import plotly.io as pio

pio.renderers.default = "notebook"

# Download and unzip CSV files
!cd .. && make dataset && cd notebooks
# Load data from CSV
df = pd.read_csv(
    os.path.join(
        "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
    ),
    names=["target", "id", "date", "flag", "user", "text"],
)

# Reduce memory usage
df = data_helpers.reduce_dataframe_memory_usage(df)

# Drop useless columns
df.drop(columns=["id", "date", "flag", "user"], inplace=True)

# Replace target values with labels
df.target.replace(
    {
        0: "NEGATIVE",
        2: "NEUTRAL",
        4: "POSITIVE",
    },
    inplace=True,
)

df = data_helpers.balance_sample(df, "target", 1*1000)


2022-01-21 05:07:57.791386: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-21 05:07:57.791409: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
from datasets import Dataset
# Tokenizers, Stemmers and Lemmatizers
from transformers import BertTokenizerFast


tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
tokenizer_columns = tokenizer.model_input_names
label_column = "target"


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
tokenized_dataset_file_path = os.path.join(
    processed_data_path, "bert_tokenized_as_dataframe_dataset.pkl"
)


# if os.path.exists(tokenized_dataset_file_path):
#     # Load encoded dataset
#     with (open(tokenized_dataset_file_path, "rb")) as f:
#         X = pickle.load(f)
# else:
## Encode text
dataset_df = Dataset.from_pandas(df).map(
    lambda data: tokenizer(
        data["text"], padding="max_length", truncation=True
    ),
    batched=True,
    num_proc=4,
).to_pandas()

X = [np.array([dataset_df.iloc[x][col] for col in tokenizer_columns]).ravel() for x in tqdm(range(len(dataset_df)))]

    # # Save vectorized dataset as pickle
    # with open(tokenized_dataset_file_path, "wb") as f:
    #     pickle.dump(X, f)


 #0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

 #0: 100%|██████████| 1/1 [00:00<00:00, 14.96ba/s]
 #2: 100%|██████████| 1/1 [00:00<00:00, 16.94ba/s]
 #1: 100%|██████████| 1/1 [00:00<00:00, 11.87ba/s]
 #3: 100%|██████████| 1/1 [00:00<00:00, 12.71ba/s]
100%|██████████| 1000/1000 [00:00<00:00, 6787.75it/s]


In [3]:
from sklearn.model_selection import train_test_split


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df.target,
    test_size=0.2,
    stratify=df.target,
    random_state=42,
)


In [6]:
from transformers import TFBertForSequenceClassification
from keras.models import load_model
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC


# Model constants.
model_name = "bert_for_sequence_classification_on_bert_tokenized_text"

results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)

if os.path.exists(model_file_path):
    # Load model
    model = load_model(model_file_path)
else:
    # Define NN model
    print("Defining model...")
    model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")

    # compile NN network
    print("Compiling model...")
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "accuracy",
            AUC(curve="ROC", name="ROC_AUC"),
            AUC(curve="PR", name="AP"),
        ],
    )

    # fit NN model
    print("Fitting model...")
    model.fit(
        np.array(X_train),
        y_train,
        # validation_split=0.2,
        epochs=10,
        batch_size=128,
        callbacks=[
            TensorBoard(log_dir=f"logs/{model.name}"),
            EarlyStopping(monitor="val_loss", patience=2),
        ],
        workers=4,
        use_multiprocessing=True,
        verbose=1,
    )

    # Save model
    print("Saving model...")
    model.save(model_file_path)

print(model.summary())


Defining model...


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Compiling model...
Fitting model...
Epoch 1/10


ValueError: in user code:

    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 898, in train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 460, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/utils/metrics_utils.py", line 73, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/metrics.py", line 177, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/metrics.py", line 2343, in update_state  **
        return metrics_utils.update_confusion_matrix_variables(
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/utils/metrics_utils.py", line 625, in update_confusion_matrix_variables
        y_pred.shape.assert_is_compatible_with(y_true.shape)

    ValueError: Shapes (None, 2) and (None, 1) are incompatible


In [14]:
model.predict(
    input_ids=np.array(X_test)[0:512],
    attention_mask=np.array(X_test)[:, :-1],


)

StagingError: in user code:

    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 1621, in predict_function  *
        return step_function(self, iterator)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 1611, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 1604, in run_step  **
        outputs = model.predict_step(data)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/engine/training.py", line 1572, in predict_step
        return self(x, training=False)
    File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    StagingError: Exception encountered when calling layer "tf_bert_for_sequence_classification_1" (type TFBertForSequenceClassification).
    
    in user code:
    
        File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1727, in call  *
            inputs = input_processing(
        File "/home/clement/Workspace/oc_p7/env/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 420, in input_processing  *
            output[parameter_names[i]] = input
    
        IndexError: list index out of range
    
    
    Call arguments received:
      • input_ids=('tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)', 'tf.Tensor(shape=(32,), dtype=int32)')
      • attention_mask=None
      • token_type_ids=None
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=None
      • training=False
      • kwargs=<class 'inspect._empty'>
