In [4]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

In [5]:
import numpy as np

In [6]:
# import the dataset
df_train = pd.read_csv('./emotions_dataset/train.txt', header=None, sep=';', names=['Input', 'Sentiment'], encoding='utf-8')
df_test = pd.read_csv('./emotions_dataset/test.txt', header=None, sep=';', names=['Input', 'Sentiment'], encoding='utf-8')
df_val = pd.read_csv('./emotions_dataset/val.txt', header=None, sep=';', names=['Input', 'Sentiment'], encoding='utf-8')

In [7]:
df_full = pd.concat([df_train,df_test,df_val])
df_full

Unnamed: 0,Input,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


TEXT PREPROCESSING

In [8]:
import text_preprocessing as tp
import text_hammer as th

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/davidtan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
%%time
# prints wall time for the entire cell

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

def text_preprocess(df, column):
    df[column] = df[column].progress_apply(lambda x: tp.expand_contraction(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_email(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_url(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_special_character(x))
    df[column] = df[column].progress_apply(lambda x: tp.normalize_unicode(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_stopword(x))
    df[column] = df[column].progress_apply(lambda x: tp.stem_word(x))
    # df[column] = df[column].progress_apply(lambda x: tp.tokenize_word(x))
    
    # df[column] = df[column].progress_apply(lambda x: str(x).lower())
    # df[column] = df[column].progress_apply(lambda x: th.cont_exp(x))
    # df[column] = df[column].progress_apply(lambda x: th.remove_emails(x))
    # df[column] = df[column].progress_apply(lambda x: th.remove_html_tags(x))
    # df[column] = df[column].progress_apply(lambda x: th.remove_special_chars(x))
    # df[column] = df[column].progress_apply(lambda x: th.remove_accented_chars(x))
    return df

CPU times: user 612 µs, sys: 258 µs, total: 870 µs
Wall time: 707 µs


In [10]:
df_cleaned = text_preprocess(df_full, 'Input')

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

In [11]:
df_cleaned.head()

Unnamed: 0,Input,Sentiment
0,"[feel, humili]",sadness
1,"[go, feel, hopeless, damn, hope, around, someo...",sadness
2,"[grab, minut, post, feel, greedi, wrong]",anger
3,"[ever, feel, nostalg, fireplac, know, still, p...",love
4,"[feel, grouchi]",anger


In [12]:
df_cleaned = df_cleaned.copy()

In [13]:
df_cleaned['num_words'] = df_cleaned.Input.apply(lambda x: len(x))

In [14]:
df_cleaned.head()

Unnamed: 0,Input,Sentiment,num_words
0,"[feel, humili]",sadness,2
1,"[go, feel, hopeless, damn, hope, around, someo...",sadness,9
2,"[grab, minut, post, feel, greedi, wrong]",anger,6
3,"[ever, feel, nostalg, fireplac, know, still, p...",love,7
4,"[feel, grouchi]",anger,2


In [15]:
# changing the data type to the category to encode into codes
df_cleaned['Sentiment'] = df_cleaned.Sentiment.astype('category')

In [16]:
# returns unique value based on hash table
# uniques are returned in order of appearance, hence not sorted.
# significantly faster than numpy
df_cleaned.Sentiment.unique()

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
Categories (6, object): ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [17]:
df_cleaned.Sentiment

0       sadness
1       sadness
2         anger
3          love
4         anger
         ...   
1995    sadness
1996        joy
1997        joy
1998        joy
1999        joy
Name: Sentiment, Length: 20000, dtype: category
Categories (6, object): ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [18]:
df_cleaned.Sentiment.cat.codes

0       4
1       4
2       0
3       3
4       0
       ..
1995    4
1996    2
1997    2
1998    2
1999    2
Length: 20000, dtype: int8

In [19]:
# initialize a dictionary for the codes
encoded_dict = {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}

In [20]:
df_cleaned['Sentiment'] = df_cleaned.Sentiment.cat.codes # convert to from word to numbers
df_cleaned.Sentiment

0       4
1       4
2       0
3       3
4       0
       ..
1995    4
1996    2
1997    2
1998    2
1999    2
Name: Sentiment, Length: 20000, dtype: int8

In [21]:
df_cleaned.head()

Unnamed: 0,Input,Sentiment,num_words
0,"[feel, humili]",4,2
1,"[go, feel, hopeless, damn, hope, around, someo...",4,9
2,"[grab, minut, post, feel, greedi, wrong]",0,6
3,"[ever, feel, nostalg, fireplac, know, still, p...",3,7
4,"[feel, grouchi]",0,2


In [22]:
df_cleaned.num_words.max()

35

In [23]:
from sklearn.model_selection import train_test_split
# random_state=42 allows for us to get the same train and test sets
# across different executions, but in this case, train and test sets
# are different from random_state=0
data_train, data_test = train_test_split(df_cleaned, test_size=0.3, random_state=42, stratify=df_cleaned.Sentiment)

In [24]:

data_train.head().Input

7181                               [feel, sure, go, busi]
9479                 [feel, uptight, wonder, know, right]
812     [allow, eat, food, know, bother, sinc, feel, a...
8670    [happi, progress, also, begin, feel, littl, ho...
1014    [alon, apart, get, overwhelm, feel, like, watc...
Name: Input, dtype: object

In [25]:
data_train.shape

(14000, 3)

In [26]:
data_test.shape

(6000, 3)

In [27]:
data_train.head()

Unnamed: 0,Input,Sentiment,num_words
7181,"[feel, sure, go, busi]",2,4
9479,"[feel, uptight, wonder, know, right]",1,5
812,"[allow, eat, food, know, bother, sinc, feel, a...",4,16
8670,"[happi, progress, also, begin, feel, littl, ho...",4,7
1014,"[alon, apart, get, overwhelm, feel, like, watc...",4,8


In [28]:
from tensorflow.keras.utils import to_categorical

In [29]:
to_categorical(data_train.Sentiment)

array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

In [30]:
from transformers import AutoTokenizer, TFBertModel
# tokenizer and bert must be from same pretrained model
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [31]:
tokenizer.save_pretrained('bert-tokenizer')
bert.save_pretrained('bert-model')
# saving model locally so we can use it later on

In [32]:
import shutil
shutil.make_archive('bert-tokenizer', 'zip', 'bert-tokenizer')
shutil.make_archive('bert-model', 'zip', 'bert-model')

'/Users/davidtan/Desktop/ml-rp/bert-model.zip'

In [33]:
# distilbert is lighter and also provides similar performance
from transformers import BertTokenizer, TFBertModel, BertConfig, TFDistilBertModel, DistilBertTokenizer, DistilBertConfig
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [34]:
# tokenize the input (takes some time)
# here tokenizer using from bert-base-cased
x_train = tokenizer(
    text=data_train.Input.tolist(),
    is_split_into_words=True,
    add_special_tokens=True,
    max_length=36,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)
x_test = tokenizer(
    text=data_test.Input.tolist(),
    is_split_into_words=True,
    add_special_tokens=True,
    max_length=36,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

In [35]:
x_train
x_test

{'input_ids': <tf.Tensor: shape=(6000, 36), dtype=int32, numpy=
array([[ 101, 1474, 1271, ...,    0,    0,    0],
       [ 101, 1631, 1176, ..., 1403,  102,    0],
       [ 101, 2647, 1631, ...,    0,    0,    0],
       ...,
       [ 101, 1631, 2393, ...,    0,    0,    0],
       [ 101, 1180, 1631, ...,    0,    0,    0],
       [ 101, 5178, 5427, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(6000, 36), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

LOAD LIBRARIES

In [36]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [37]:
import tensorflow as tf
tf.config.experimental.list_physical_devices('GPU')

[]

In [38]:
max_len = 36
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
   
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids,attention_mask = input_mask)[0]


embeddings = bert(input_ids,attention_mask = input_mask)[0] #(0 is the last hidden states,1 means pooler_output)
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation='relu')(out)

y = Dense(6,activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [39]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 36)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 36)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 36,                                            

In [40]:
tf.keras.utils.plot_model(model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [41]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    weight_decay=0.01,
    clipnorm=1.0
)

# set loss and metrics
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('balanced_accuracy')
# compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [42]:
tf.config.experimental_run_functions_eagerly(True)
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


MODEL FITTING AND EVALUATION

In [43]:
train_history = model.fit(
    x = {'input_ids':x_train['input_ids'], 'attention_mask':x_train['attention_mask']},
    y = to_categorical(data_train.Sentiment),
    validation_data = (
        {'input_ids': x_test['input_ids'], 'attention_mask':x_test['attention_mask']}, to_categorical(data_train.Sentiment)
    ),
    epochs = 1,
    batch_size = 36
)

2023-04-10 13:12:46.805207: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




ValueError: Data cardinality is ambiguous:
  x sizes: 6000, 6000
  y sizes: 14000
Make sure all arrays contain the same number of samples.

In [44]:
model.save_weights('sentiment_weights.h5')

Create new model and load the weights

In [46]:
predicted_raw = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})



In [47]:
predicted_raw[0]

array([0.98746896, 0.40664428, 0.21151419, 0.06752545, 0.4114723 ,
       0.05212162], dtype=float32)

In [48]:
import numpy as np
y_predicted = np.argmax(predicted_raw, axis=1)

In [49]:
data_test.Sentiment

10486    0
1320     0
10848    2
195      4
2093     4
        ..
1241     4
8642     1
1775     2
5733     0
1376     1
Name: Sentiment, Length: 6000, dtype: int8

In [50]:
from sklearn.metrics import classification_report

In [51]:
classification_report(data_test.Sentiment, y_predicted)

'              precision    recall  f1-score   support\n\n           0       0.87      0.94      0.91       813\n           1       0.88      0.83      0.86       712\n           2       0.91      0.91      0.91      2028\n           3       0.74      0.75      0.75       492\n           4       0.95      0.92      0.93      1739\n           5       0.74      0.86      0.79       216\n\n    accuracy                           0.89      6000\n   macro avg       0.85      0.87      0.86      6000\nweighted avg       0.89      0.89      0.89      6000\n'

In [52]:
texts = ["i think this stock has great potential", "i don't like this stock"]

x_val = tokenizer(
    text = texts,
    add_special_tokens=True,
    max_length = 36,
    truncation = True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

validation = model.predict({'input_ids':x_val['input_ids'],'attention_mask':x_val['attention_mask']})*100
validation



array([[49.781826, 22.955374, 99.89456 , 61.053284, 74.22109 , 20.94696 ],
       [92.35382 , 34.231865, 51.180683, 20.458467, 53.64274 , 14.894348]],
      dtype=float32)

In [54]:
for key,value in zip(encoded_dict.keys(), validation[1]):
    print(key,value)

anger 92.35382
fear 34.231865
joy 51.180683
love 20.458467
sadness 53.64274
surprise 14.894348
