In [1]:
# ! pip install emot
# ! pip install re

In [28]:
import pandas as pd
import matplotlib
import emot, re
from sklearn.model_selection import train_test_split
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, LSTM, Embedding
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig

import keras_tuner as kt
import keras.backend as K
from tensorflow import keras

pd.set_option("display.max_columns", 99)

In [2]:
! wget data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
! wget data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
! wget data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv
goemotions_1 = pd.read_csv('goemotions_1.csv')
goemotions_2 = pd.read_csv('goemotions_2.csv')
goemotions_3 = pd.read_csv('goemotions_3.csv')
frames = [goemotions_1, goemotions_2, goemotions_3]
df = pd.concat(frames, ignore_index=True)

--2022-03-16 17:37:14--  http://data/full_dataset/
Resolving data (data)... failed: nodename nor servname provided, or not known.
wget: unable to resolve host address ‘data’
--2022-03-16 17:37:14--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.43.16, 172.217.163.48, 142.251.42.240, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.43.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘goemotions_1.csv.2’


2022-03-16 17:37:17 (9.79 MB/s) - ‘goemotions_1.csv.2’ saved [14174600/14174600]

FINISHED --2022-03-16 17:37:17--
Total wall clock time: 2.6s
Downloaded: 1 files, 14M in 1.4s (9.79 MB/s)
--2022-03-16 17:37:17--  http://data/full_dataset/
Resolving data (data)... failed: nodename nor servname provided, or not known.
wget: unable to resolve host address ‘data’
--2022-03-16 1

In [3]:
print(df.shape)
df.head(2)

(211225, 37)


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
df.example_very_unclear.value_counts()

False    207814
True       3411
Name: example_very_unclear, dtype: int64

In [5]:
%%time
# emotion label columns
lb_col = df.columns[9:]
print(f'Entire dataset: {df.shape}')
# remove examples that were very unclear. They do not have any emotions labels
df2 = df[~df['example_very_unclear']].copy()
print(f'{df2.shape[0]} rows left after removing records with unclear emotions')
# Sum the emotions together because there are different raters
df2 = df2[['text']+list(lb_col)].groupby(['text']).sum().reset_index()
print(f'{df2.shape[0]} rows left after deduping on text and id columns')
# keep records where there are at least two agreeing emotion labels
df2['at_least_2_agree'] = df2[lb_col].apply(lambda x: x.max(), axis=1)
df2 = df2[df2.at_least_2_agree >= 2]
df2 = df2.replace({1: 0, 2: 1, 3: 1, 4: 1, 5: 1}, inplace=False)
df_clean = df2.drop('at_least_2_agree', axis=1)
print(f'{df2.shape[0]} rows left after removing records without having at least two raters agreeing on the emotion')

Entire dataset: (211225, 37)
207814 rows left after removing records with unclear emotions
57730 rows left after deduping on text and id columns
53994 rows left after removing records without having at least two raters agreeing on the emotion
CPU times: user 4.91 s, sys: 388 ms, total: 5.3 s
Wall time: 7.29 s


In [6]:
# A surprise to be sure, but a welcome one.
# a = pd.read_csv("train.tsv",sep='\t',names=['a','b']).reset_index().rename(columns={'index':'text'})
# a[a.text.str.contains('A surprise to be sure, but a welcome one.')]

In [7]:
print(df_clean.shape)
df_clean.head(2)

(53994, 29)


Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,"""What do Scottish people look like?"" How I wo...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        if emot in text:
            orig = text
            text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
#             print(f'{orig} -> {text}', '\n')
    return text

def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        if emot in text:
            orig = text
            text = text.replace(emot, "_".join(EMOTICONS_EMO[emot].split()))
#             print(f'{orig} -> {text}', '\n')
    return text

In [11]:
%%time
print('-------replacing emoji with text---------')
df_clean['text'] = df_clean['text'].apply(lambda x: convert_emojis(x))
print('-------replacing emoticon with text---------')
df_clean['text'] = df_clean['text'].apply(lambda x: convert_emoticons(x))

-------replacing emoji with text---------
-------replacing emoticon with text---------
CPU times: user 7.92 s, sys: 115 ms, total: 8.03 s
Wall time: 8.7 s


In [12]:
X = df_clean['text']
y = df_clean.iloc[:,2:]
y.head(2)

Unnamed: 0,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_test, Y_test, test_size=0.5, random_state=2)
print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'Y_train shape: {Y_train.shape}')
print(f'Y_val shape: {Y_val.shape}')
print(f'Y_test shape: {Y_test.shape}')

X_train shape: (43195,)
X_val shape: (5399,)
X_test shape: (5400,)
Y_train shape: (43195, 27)
Y_val shape: (5399, 27)
Y_test shape: (5400, 27)


## Word Embedding

In [14]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 3000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 32
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='', lower=True)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 49290 unique tokens.


In [15]:
def embedding(text, tokenizer):
    MAX_SEQUENCE_LENGTH = 32
    text = tokenizer.texts_to_sequences(text.values)
    text = pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    print('Shape of data tensor:', text.shape)
    return text

In [16]:
X_train = embedding(X_train, tokenizer)
X_val = embedding(X_val, tokenizer)
X_test = embedding(X_test, tokenizer)
# Y_train = embedding(Y_train)
# Y_val = embedding(Y_val)
# Y_test = embedding(Y_test)

Shape of data tensor: (43195, 32)
Shape of data tensor: (5399, 32)
Shape of data tensor: (5400, 32)


## Build Model

In [None]:
# def model_builder(hp):
#     model = Sequential()
#     model.add(keras.layers.Flatten(input_shape=(28, 28)))

#     # Tune the number of units in the first Dense layer
#     # Choose an optimal value between 32-512
#     hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
#     model.add(keras.layers.Dense(units=hp_units, activation='relu'))
#     model.add(keras.layers.Dense(10))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                 loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#                 metrics=['accuracy'])

#     return model

In [44]:
class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        # TODO: build the model, 
        # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION


        model=Sequential()
        model.add(Embedding(MAX_NB_WORDS,100,input_length=MAX_SEQUENCE_LENGTH))
        hp_units_1 = hp.Int('units1', min_value=32, max_value=32, step=32)
        model.add(LSTM(hp_units_1,input_shape=(64,),activation='relu',return_sequences=True))
        model.add(Dropout(0.2))
        hp_units_2 = hp.Int('units2', min_value=32, max_value=32, step=32)
        model.add(LSTM(hp_units_2,activation='relu'))
        model.add(Dropout(0.2))
        # for units in [128,128,64,32]:
        # model.add(Dense(units,activation='relu'))
        # model.add(Dropout(0.2))
        model.add(Dense(32,activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(27,activation='sigmoid'))

        print(model.summary())

        hp_learning_rate = hp.Choice('learning_rate', values=[1e-4, 1e-5, 1e-6])

        model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                    loss=keras.losses.CategoricalCrossentropy(),
                    metrics=['accuracy'])
        return model

    # def compile_model(model):
    #     # TODO: compile the model
    #     # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION



    # #     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #     return model

    def fit(self, hp, model, *args, **kwargs):
        # TODO: train the model
        # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION

        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [128]),
            **kwargs)


#     def eval_model(self, model, X_test, Y_test):
#         # TODO: evaluate the model
#         # DO NOT CHANGE THE INPUTS OR OUTPUTS TO THIS FUNCTION

#         test_loss, test_accuracy = model.evaluate(X_test, Y_test)
#         return test_loss, test_accuracy

In [49]:
tuner = kt.Hyperband(MyHyperModel(),
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=2,
                     directory='my_dir',
                     project_name='intro_to_kt')
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

INFO:tensorflow:Reloading Oracle from existing project my_dir/intro_to_kt/oracle.json
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 32, 100)           300000    
                                                                 
 lstm_2 (LSTM)               (None, 32, 32)            17024     
                                                                 
 dropout_3 (Dropout)         (None, 32, 32)            0         
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                  

In [50]:
tuner.search(X_train,
        Y_train,
#         batch_size=128,
        epochs=30,
        verbose=2,
        validation_data=(X_val, Y_val),
        callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first LSTM
layer is {best_hps.get('units1')}, the optimal number of units in the second LSTM
layer is {best_hps.get('units2')}, best batch_size is {best_hps.get('batch_size')}, and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

Trial 5 Complete [00h 00m 36s]
val_accuracy: 0.3059825897216797

Best val_accuracy So Far: 0.3059825897216797
Total elapsed time: 00h 02m 37s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. The optimal number of units in the first LSTM
layer is 32, the optimal number of units in the second LSTM
layer is 32, best batch_size is 128, and the optimal learning rate for the optimizer
is 1e-05.



In [51]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train,
        Y_train,
        batch_size=128,
        epochs=5,
        verbose=2,
        validation_data=(X_val, Y_val))

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 32, 100)           300000    
                                                                 
 lstm_2 (LSTM)               (None, 32, 32)            17024     
                                                                 
 dropout_3 (Dropout)         (None, 32, 32)            0         
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320      
                                                                 
 dropout_4 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                1056      
                                                                 
 dropout_5 (Dropout)         (None, 32)               