In [1]:
import pickle
from tqdm import tqdm

# Import custom helper libraries
import os
import sys

src_path = os.path.abspath(os.path.join("../src"))
if src_path not in sys.path:
    sys.path.append(src_path)

import data.helpers as data_helpers
import visualization.helpers as viz_helpers

# Maths modules
import numpy as np
import pandas as pd
import tensorflow as tf

# Viz modules
import plotly.express as px

# Render for export
import plotly.io as pio

pio.renderers.default = "notebook"

# Download and unzip CSV files
!cd .. && make dataset && cd notebooks
# Load data from CSV
df = pd.read_csv(
    os.path.join(
        "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
    ),
    names=["target", "id", "date", "flag", "user", "text"],
)

# Reduce memory usage
df = data_helpers.reduce_dataframe_memory_usage(df)

# Drop useless columns
df.drop(columns=["id", "date", "flag", "user"], inplace=True)

# Replace target values with labels
df.target.replace(
    {
        0: "NEGATIVE",
        2: "NEUTRAL",
        4: "POSITIVE",
    },
    inplace=True,
)

df.target.replace(
    {
        "NEGATIVE": 0,
        "POSITIVE": 1,
    },
    inplace=True,
)

# Sample data for development
TEXT_SAMPLE_SIZE = 2000  # <= 0 for all

# Sample data
if TEXT_SAMPLE_SIZE > 0:
    df = data_helpers.balance_sample(df, "target", TEXT_SAMPLE_SIZE)


2022-01-21 11:07:17.743269: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-21 11:07:17.743293: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
from datasets import Dataset
# Tokenizers, Stemmers and Lemmatizers
from transformers import BertTokenizerFast


tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
tokenizer_columns = tokenizer.model_input_names
label_column = "target"


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
tokenized_dataset_file_path = os.path.join(
    processed_data_path, "bert_tokenized_as_dataframe_dataset.pkl"
)


# if os.path.exists(tokenized_dataset_file_path):
#     # Load encoded dataset
#     with (open(tokenized_dataset_file_path, "rb")) as f:
#         X = pickle.load(f)
# else:
    ## Encode text
    # dataset_df = Dataset.from_pandas(df).map(
    #     lambda data: tokenizer(
    #         data["text"], padding="max_length", truncation=True
    #     ),
    #     batched=True,
    #     num_proc=4,
    # ).to_pandas()

    # X = [np.array([dataset_df.iloc[x][col] for col in tokenizer_columns]).ravel() for x in tqdm(range(len(dataset_df)))]

    # # Save vectorized dataset as pickle
    # with open(tokenized_dataset_file_path, "wb") as f:
    #     pickle.dump(X, f)


In [3]:
input_ids = []
attention_masks = []
token_type_ids = []

for sent in tqdm(df.text):
    bert_inp = tokenizer(
        sent,
        padding="max_length",
        truncation=True,
    )
    input_ids.append(bert_inp["input_ids"])
    attention_masks.append(bert_inp["attention_mask"])
    token_type_ids.append(bert_inp["token_type_ids"])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
token_type_ids = np.array(token_type_ids)
labels = np.array(df.target)


100%|██████████| 2000/2000 [00:00<00:00, 6038.08it/s]


In [4]:
from sklearn.model_selection import train_test_split


# Train-test split
(
    input_ids_train,
    input_ids_test,
    attention_masks_train,
    attention_masks_test,
    token_type_ids_train,
    token_type_ids_test,
    labels_train,
    labels_test,
) = train_test_split(
    input_ids,
    attention_masks,
    token_type_ids,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)


In [7]:
from transformers import TFBertForSequenceClassification
from keras.models import load_model
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import BinaryCrossentropy, SparseCategoricalAccuracy, AUC


# Model constants.
model_name = "bert_for_sequence_classification_on_bert_tokenized_text"

results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)

if os.path.exists(model_file_path):
    # Load model
    model = load_model(model_file_path)
else:
    # Define NN model
    print("Defining model...")
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-cased", num_labels=2
    )

    # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    # optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

    # compile NN network
    print("Compiling model...")
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            SparseCategoricalAccuracy(name="accuracy"),
            # AUC(curve="ROC", name="ROC_AUC"),
            # AUC(curve="PR", name="AP"),
        ],
    )

    # fit NN model
    print("Fitting model...")
    model.fit(
        [input_ids_train, attention_masks_train, token_type_ids_train],
        labels_train,
        epochs=10,
        batch_size=4,
        validation_split=0.2,
        callbacks=[
            TensorBoard(log_dir=f"logs/{model.name}"),
            EarlyStopping(monitor="val_loss", patience=2),
        ],
        workers=4,
        use_multiprocessing=True,
    )

    # Save model
    print("Saving model...")
    model.save(model_file_path)

print(model.summary())


Defining model...


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Compiling model...
Fitting model...
Epoch 1/10