In [1]:
# !pip install transformers
# !pip install symspellpy
import numpy as np
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import urllib.request
import csv
import importlib 
import tensorflow as tf
import sklearn
from tqdm import tqdm
# !pip install contractions
# !pip install tqdm
f = importlib.import_module('functions')

# Registry of dataset locations. Directory entries end with "/" so that file
# names can be appended by plain string concatenation.
PATH = {
    "dataset_classification": "dataset/classification/",
    "dataset_labeling": "dataset/seq_labeling/",
}
# All file entries live inside the classification directory registered above.
PATH.update({
    key: PATH["dataset_classification"] + file_name
    for key, file_name in (
        ("music_reviews_train", "music_reviews_train.json.gz"),
        ("music_reviews_dev", "music_reviews_dev.json.gz"),
        ("music_reviews_test", "music_reviews_test_masked.json.gz"),
        ("hard_sentences", "hard_sentences.json.gz"),
    )
})

# Load the three review splits, in train -> dev -> test order, through the
# project's own reader. The loaded objects are indexed by position and then by
# field name further down in this cell.
train, dev, test = (
    f.readJson(PATH[split_key])
    for split_key in ("music_reviews_train", "music_reviews_dev", "music_reviews_test")
)

# Sentiment string -> binary class id; both lower- and upper-case spellings
# are accepted.
sent_dict = {
    "positive": 1,
    "negative": 0,
    "POSITIVE": 1,
    "NEGATIVE": 0,
}

# read the train data
# Builds parallel lists (text, sentiment string, original index, class id, word
# count) for every usable training record; the indices of unusable records are
# collected in train_missing_indexies and printed at the end.
data = train
train_sent = []
train_sentiment = []
train_idx = []
train_missing_indexies = []
y_train = []
length_of_sentencies_counter = []
for i in range(len(data)):
    # Resolve every required field BEFORE appending anything. Appending while
    # the lookups are still in progress would let a record that has a review
    # but a missing (or unmapped) sentiment leave train_sent longer than
    # train_sentiment / y_train, silently misaligning texts and labels.
    try:
        record = data[i]
        review_text = record["reviewText"]
        sentiment = record["sentiment"]
        label = sent_dict[sentiment]
    except KeyError:
        train_missing_indexies.append(i)
        continue
    train_sent.append(review_text)
    train_sentiment.append(sentiment)
    train_idx.append(i)
    y_train.append(label)
    # Whitespace-separated token count of the review.
    length_of_sentencies_counter.append(len(review_text.split()))
print(train_missing_indexies)

# read the dev data
# Builds parallel lists (text, sentiment string, original index, class id) for
# every usable dev record; the indices of unusable records are collected in
# dev_missing_indexies and printed at the end.
data = dev
dev_sent = []
dev_sentiment = []
dev_idx = []
dev_missing_indexies = []
dev_y_train = []
for i in range(len(data)):
    # Resolve every required field BEFORE appending anything, so that a record
    # with a review but a missing (or unmapped) sentiment is skipped as a whole
    # instead of leaving dev_sent longer than dev_sentiment / dev_y_train.
    try:
        record = data[i]
        review_text = record["reviewText"]
        sentiment = record["sentiment"]
        label = sent_dict[sentiment]
    except KeyError:
        dev_missing_indexies.append(i)
        continue
    dev_sent.append(review_text)
    dev_sentiment.append(sentiment)
    dev_idx.append(i)
    dev_y_train.append(label)
print(dev_missing_indexies)



# Dev labels as a NumPy vector of class ids, kept for later accuracy checks.
dev_class_ids = [sent_dict[label_name] for label_name in dev_sentiment]
dev_classvec = np.array(dev_class_ids)

def printlen(a, a_name):
    """Print ``len(a)`` on one line, labelled with ``a_name``."""
    size = len(a)
    print(f'{a_name} length {size}')
printlen(train_sent, "train_sent")
printlen(dev_sent, "dev_sent")

# Back-of-the-envelope memory estimate for dev_sent, using the rule of thumb
# printed below (a fixed 40 bytes per string plus 1 byte per character).
# The totals are computed from the data instead of being typed in by hand, so
# the report stays correct when the dev split changes.
charactercount = sum(len(sentence) for sentence in dev_sent)
estimated_bytes = 40 * len(dev_sent) + charactercount
print("every string is 40 bytes on it's own, and then 1 byte pr character")
print(f'no. of characters in dev_sent: {charactercount}')
print(
    f"40*{len(dev_sent)} + {charactercount} = {estimated_bytes} "
    f"which is {estimated_bytes / 1e6:.1f} megabytes"
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Number of data:  100000
Number of data:  10000
Number of data:  10000
[4603, 4838, 16019, 18229, 19785, 23315, 28397, 28834, 33359, 43138, 43539, 43690, 44008, 44477, 44972, 48811, 49317, 50106, 51717, 52286, 55555, 56171, 57223, 58328, 58799, 58866, 59525, 59739, 61046, 61914, 61916, 62831, 63208, 72268, 78944, 79067, 80093, 80637, 80658, 81640, 81900, 82510, 83138, 83145, 83615, 84761, 87112, 88895, 88984, 89132, 91949, 94301, 94727, 99641]
[2900, 4294, 5135, 8540]
train_sent length 99946
dev_sent length 9996
every string is 40 bytes on it's own, and then 1 byte pr character
no. of characters in dev_sent: 2427461
40*9996 + 2427461 = 2827301 which is 2.8 megabytes


In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Model and tokenizer must come from the same checkpoint, so the name is
# spelled out once and shared.
BERT_CHECKPOINT = "bert-base-uncased"
model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Print the Keras summary (layers and parameter counts) of the loaded model.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [4]:
# convert from "positive" to 1, "negative" to 0
# Entries that are already ints are passed through unchanged. Without this the
# cell could only be executed once per kernel: on a second run the lists hold
# 0/1, which are not keys of sent_dict, and the lookup raises KeyError.
dev_sentiment = [ds if isinstance(ds, int) else sent_dict[ds] for ds in dev_sentiment]
train_sentiment = [ts if isinstance(ts, int) else sent_dict[ts] for ts in train_sentiment]

# NOTE: from here on `dev` and `train` are [texts, labels] pairs and no longer
# the raw records loaded at the top of the notebook.
dev = [dev_sent, dev_sentiment]
train = [train_sent, train_sentiment]

In [5]:
def _as_input_examples(texts, labels):
    """Pair each text with its label as an InputExample (guid=None, text in text_a)."""
    return [
        InputExample(guid=None, text_a=text, label=label)
        for text, label in zip(texts, labels)
    ]


train_InputExamples = _as_input_examples(train_sent, train_sentiment)
validation_InputExamples = _as_input_examples(dev_sent, dev_sentiment)
# Display the first validation example as a sanity check.
validation_InputExamples[0]

InputExample(guid=None, text_a='My dentist recommended this as a relaxation technique for dental visits. They give me an ipod with headphones, play this on it and it relieves some of the stress of dental treatment, which I dislike intensely.\nIt worked so well that I bought my own copy to try at home. I fall asleep after a couple of minutes and stay asleep. Instead of tossing and turning, I hardly move at all. Highly recommend.', text_b=None, label=1)

Notes: 
BatchEncoding holds the output of the tokenizer’s encoding methods (__call__, encode_plus and batch_encode_plus) and is derived from a Python dictionary. When the tokenizer is a pure Python tokenizer, this class behaves just like a standard Python dictionary and holds the various model inputs computed by these methods (input_ids, attention_mask, …). When the tokenizer is a “Fast” tokenizer (i.e. backed by the HuggingFace tokenizers library), this class additionally provides several advanced alignment methods that map between the original string (characters and words) and the token space (e.g. getting the index of the token comprising a given character, or the span of characters corresponding to a given token).

In [6]:
############## 
##############
############## inspiration from https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=40):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Every example's ``text_a`` is encoded with ``tokenizer.encode_plus``:
    special tokens are added, longer inputs are truncated to ``max_length`` and
    shorter ones are padded up to ``max_length``.

    Args:
        examples: Iterable of objects exposing ``text_a`` (the text) and
            ``label`` (the class id), e.g. ``InputExample`` instances.
        tokenizer: Tokenizer object providing ``encode_plus``.
        max_length: Length, in tokens, every encoded sequence is truncated or
            padded to.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are ``(inputs, label)``
        pairs: ``inputs`` is a dict with int32 vectors under ``"input_ids"``,
        ``"attention_mask"`` and ``"token_type_ids"``; ``label`` is an int64
        scalar.
    """
    features = []  # -> will hold InputFeatures to be converted later

    for example in tqdm(examples):  # progress bar
        encoded = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,      # upper bound used by truncation and padding
            truncation=True,            # cut inputs longer than max_length
            # `padding="max_length"` is the supported replacement for the
            # deprecated `pad_to_max_length=True` flag.
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
        )
        features.append(
            InputFeatures(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                token_type_ids=encoded["token_type_ids"],
                label=example.label,
            )
        )

    def gen():
        # The loop variable is named `feature` rather than `f`, which this
        # notebook already uses as the alias of the imported `functions` module.
        for feature in features:
            yield (
                {
                    "input_ids": feature.input_ids,
                    "attention_mask": feature.attention_mask,
                    "token_type_ids": feature.token_type_ids,
                },
                feature.label,
            )

    # `output_signature` replaces the deprecated positional
    # output_types / output_shapes pair; dtypes and shapes are unchanged.
    token_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": token_spec,
                "attention_mask": token_spec,
                "token_type_ids": token_spec,
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )


# TODO delete? Neither constant is referenced by the cells shown above.
DATA_COLUMN = "review"
LABEL_COLUMN = "sentiment"

In [7]:
# Window of training examples used for fine-tuning: `no` consecutive examples
# beginning at index `start` (consumed as train_InputExamples[start:start+no]).
start = 6_666
no = 5_000

In [8]:
# Tokenize the selected window of training examples, then build the input
# pipeline step by step: shuffle with a 100-element buffer, group into batches
# of 32, and repeat the whole stream twice.
train_examples_window = train_InputExamples[start:(start + no)]
train_data = convert_examples_to_tf_dataset(train_examples_window, tokenizer)
train_data = train_data.shuffle(100)
train_data = train_data.batch(32)
train_data = train_data.repeat(2)

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:06<00:00, 754.46it/s]


In [9]:
# Tokenize every validation example (no subsetting) and group the result into
# batches of 32; no shuffling or repetition is applied to this split.
validation_data = convert_examples_to_tf_dataset(validation_InputExamples, tokenizer).batch(32)

100%|█████████████████████████████████████████████████████████████████████████████| 9996/9996 [00:11<00:00, 846.98it/s]


In [10]:
# Sanity check: show which dataset class the batched validation pipeline has.
type(validation_data)

tensorflow.python.data.ops.dataset_ops.BatchDataset

In [11]:
# Cache the tokenized datasets on disk: reuse a stored copy when one exists,
# otherwise store the datasets built in the cells above. This replaces the
# manual "comment the save in / out" workflow, which discarded the freshly
# built datasets and failed outright when the cache directory was absent.
# NOTE: the directory names encode the training window (start=6666, 5k
# examples); remove or rename the cache when that window changes.
TRAIN_DATA_CACHE = "dataset/bert_tokenized/train_data_6666_5k"
VALIDATION_DATA_CACHE = "dataset/bert_tokenized/validation_data_6666_5k"


def _load_or_save_dataset(dataset, cache_path):
    """Return the dataset stored at ``cache_path``; if none exists, save ``dataset`` there and return it."""
    if tf.io.gfile.exists(cache_path):
        return tf.data.experimental.load(cache_path)
    tf.data.experimental.save(dataset, cache_path)
    return dataset


train_data = _load_or_save_dataset(train_data, TRAIN_DATA_CACHE)
validation_data = _load_or_save_dataset(validation_data, VALIDATION_DATA_CACHE)

In [12]:
# train model
# Each training component is built under its own name, in the same order as
# before, and then handed to compile().
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# The loss is told to expect raw logits (from_logits=True) and integer class ids.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# Fit for two Keras epochs, evaluating on the validation dataset after each one.
history = model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


In [13]:
# Persist the fine-tuned model; the directory name records the setup used
# (uncased BERT, 5k training examples, max_length 40).
model.save("models/bert_uncased_trained5k_maxlen40_final") 



INFO:tensorflow:Assets written to: models/bert_uncased_trained5k_maxlen40_final\assets


2022-05-26 16:12:41,746 INFO:Assets written to: models/bert_uncased_trained5k_maxlen40_final\assets


In [14]:
# Validation accuracy recorded by fit(), one value per epoch.
history.history['val_accuracy']

[0.8931572437286377, 0.8974589705467224]