In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emoji-prediction-1/val_set_processed.csv
/kaggle/input/emoji-prediction-1/train_set_processed.csv
/kaggle/input/emoji-prediction-1/test_set_processed.csv


In [2]:
import numpy as np
import pandas as pd
import getopt
import logging
#import nltk
import os
import re
import sys

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MaxAbsScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, SimpleRNN
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [3]:
train_proc = pd.read_csv('../input/emoji-prediction-1/train_set_processed.csv')
val_proc = pd.read_csv('../input/emoji-prediction-1/val_set_processed.csv')
test_proc = pd.read_csv('../input/emoji-prediction-1/test_set_processed.csv')

In [4]:
def emoji_to_int(labels: list):
    return [emoji_map[emoji] for emoji in labels]


def keep_top_10(data, top_10: list): 
    """
    Function that checks, whether Tweet consists of one of the top ten emojis.
    If, and only if, Tweet consists one of the most frequent emojis, 
    Tweet will be used for further analysis.
    Else: Line will be dropped.
    """
    idx_drop = []
    for index, row in data.iterrows():
        if row[1] not in top_10:
            idx_drop.append(index)
    return data.drop(data.index[idx_drop])

In [5]:
top_10_test = test_proc['label'].value_counts()[:10].index.to_list()
print(top_10_test)

['😍', '😂', '❤️', '💕', '😊', '😘', '😭', '💖', '😎', '✨']


In [6]:
emoji_map = {emoji: i for i, emoji in enumerate(top_10_test)}
idx_emoji = {i: emoji for i, emoji in enumerate(top_10_test)}

In [7]:
train_data = keep_top_10(train_proc, top_10_test)
print("Number of Tweets in the train data set: {}".format(len(train_data)))
test_data = keep_top_10(test_proc, top_10_test)
print("Number of Tweets in the test data set: {}".format(len(test_data)))
val_data = keep_top_10(val_proc, top_10_test)
print("Number of Tweets in the validation data set: {}".format(len(val_data)))

Number of Tweets in the train data set: 81236
Number of Tweets in the test data set: 7646
Number of Tweets in the validation data set: 7613


In [8]:
def tweets_cleaning(tweets, labels, use_stopwords = False, train = False, use_bigrams = False, 
                    lowercase = True, stemming = False, min_df = 2, embedding = False):
    """
    Text cleaning function that performs all necessary text preprocessing steps.
    Function only keeps characters, that are alphanumerical (non-alphanumerical values are discarded).
    Digits are treated by regular expressions.
    Lower-casing is performed to reduce noise and normalize the text (convert it into a uniform representation).
    Stemming is performed to only keep the stem of each word token but not any other deviated form. 
    Stop words (i.e., words that occur more frequently than other words in a given corpus) are removed.
    """
    if stemming:
        # initialize Lancaster stemmer
        st = LancasterStemmer()
    if use_stopwords:
        # create list of stopwords
        stopwords = list(set(stopwords.words('english')))
    cleaned_data = []
    cleaned_labels = []
    
    all_bigrams = [] # serves as place-holder
    bigrams_dict = dict()
    vocab = dict()
    
    for tweet, label in zip(tweets, labels):
        tweet = re.sub(r'&amp\S+','', tweet)
        tweet = re.sub(r' & ', ' and ', tweet)
        tweet = re.sub(r'!+', ' ! ', tweet)
        tweet = re.sub(r'[?]+', ' ? ', tweet)
        tweet = re.sub('@.+', '@user', tweet)
        tweet = re.sub('#', '# ', tweet)

        # Create spaces instead of some punctuation marks, but not if it's part of an emoticon
        tweet = ' '.join([word if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word)
            else re.sub('[,.;\-_:/\n\t]+', ' ', word) for word in tweet.split()])
        
        tweet = tweet.split(" ")
        
        cleaned_tweet = []
        for word in tweet:
            
            #if emoticon is in word, keep the emoticon
            if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word):
                cleaned_word = word
            else:
                # keep special characters which might carry important information
                # perform lower-casing to normalize the text and reduce noise
                cleaned_word = ''.join([char for char in word if re.search('[<>$#€£!?@=]', char) or
                                        char.isalnum()])
            if lowercase:
                cleaned_word = cleaned_word.lower()
                
            if "<3" not in cleaned_word:
                cleaned_word = re.sub('[0-9]', '0', cleaned_word)
  
            # removes each \n (i.e., new line) or \t (i.e., tab) -> pipe char denotes a disjunction
            cleaned_word = re.sub(r'( \n| \t)+', '', cleaned_word)
            
            if stemming:
                cleaned_word = st.stem(cleaned_word)
            
            if len(cleaned_word) > 0:
                if not use_stopwords:
                    cleaned_tweet.append(cleaned_word)
                elif(cleaned_word not in stopwords):
                    cleaned_tweet.append(cleaned_word)

                if train:
                    if cleaned_word in vocab:
                        vocab[cleaned_word] += 1
                    else:
                        vocab[cleaned_word] = 1
            
        # only append tweets with more than 1 word per tweet
        if len(cleaned_tweet) > 1:
            
            if train and use_bigrams:
                
                bigrams = [' '.join([cleaned_tweet[i-1], cleaned_tweet[i]]) 
                           for i, _ in enumerate(cleaned_tweet) if i > 0]
                
                for bigram in bigrams:
                    
                    if bigram in bigrams_dict:
                        bigrams_dict[bigram] += 1
                    else:
                        bigrams_dict[bigram] = 1 

            cleaned_tweet = ' '.join(cleaned_tweet)
            cleaned_data.append(cleaned_tweet)
            cleaned_labels.append(label)
            
    if train and embedding and not use_bigrams:
        
        word2index = dict()
        i = 1
        for word in vocab.keys():
            word2index[word] = i
            i += 1
            
        word2index.update({'UNK': len(word2idx) + 1})
        
        assert len(cleaned_data) == len(cleaned_labels)

        return cleaned_data, cleaned_labels, word2index
                
    if train:
        vocab = [word for word, freq in vocab.items() if freq >= min_df]  
        if use_bigrams:
            all_bigrams = [bigram for bigram, freq in bigrams_dict.items() if freq >= min_df]
            vocab.extend(all_bigrams)
        
    assert len(cleaned_data) == len(cleaned_labels)
    
    return cleaned_data, cleaned_labels, sorted(vocab), sorted(all_bigrams)

In [9]:
cleaned_train_data, train_labels, vocab, bigrams = tweets_cleaning(train_data.text, 
                                                                   train_data.label, 
                                                                   use_stopwords = False, 
                                                                   train = True, 
                                                                   use_bigrams = True, 
                                                                   lowercase = True,
                                                                   min_df = 2)

cleaned_test_data, test_labels, _, _ = tweets_cleaning(test_data.text, 
                                                       test_data.label, 
                                                       use_stopwords = False, 
                                                       lowercase = True)

cleaned_val_data, val_labels, _, _ = tweets_cleaning(val_data.text, 
                                                     val_data.label, 
                                                     use_stopwords = False, 
                                                     lowercase = True)

In [10]:
print("Number of unique tokens in the vocabulary: {}".format(len(vocab)))
print("Number of Tweets per data set after text cleaning was computed:")
print("Train: {}".format(len(cleaned_train_data)))
print("Test: {}".format(len(cleaned_test_data)))
print("Validation: {}".format(len(cleaned_val_data)))

Number of unique tokens in the vocabulary: 55855
Number of Tweets per data set after text cleaning was computed:
Train: 68750
Test: 6539
Validation: 6505


In [11]:
y_train = emoji_to_int(train_labels)
y_test = emoji_to_int(test_labels)
y_val = emoji_to_int(val_labels)

In [12]:
def bag_of_words(train: list, test: list, val: list, ngram: tuple, vocab = None, 
                 n_best_factor = 0.7):
    """
    Create a weighted bag-of-words unigram or bigram representation of provided tweets.
    Ngram is set to unigram by default. If bigram bag-of-words should be created, pass tuple (2, 2).
    
    Vocabulary argument is set to None by default. 
    You can pass a vocabulary to this function, which may then be used for TfidfVectorizer. 
    If you do not pass a vocabulary to this function, TfidfVectorizer will create a vocabulary itself.
    """ 
    
    vectorizer = CountVectorizer(encoding = 'utf-8', ngram_range = ngram, analyzer = 'word', 
                                 vocabulary = vocab, max_df = 0.9)
    
    train_BoW = vectorizer.fit_transform(train) #.toarray()
    test_BoW = vectorizer.transform(test) #.toarray()
    val_BoW = vectorizer.transform(val) #.toarray()
    

    return train_BoW, test_BoW, val_BoW

def to_cat_matrix(y):

    """ 
    Binary one-hot encoding using an indicator matrix.
    This function converts labels to a categorical matrix which is of size N x K.
    Each row is a row vector with k-1 zeros and a single 1.
    """
    N = len(y)
    K = len(set(y))
    ind_matrix = np.zeros((N,K), dtype = int)
    
    for i, cat in enumerate(y):
        ind_matrix[i, int(cat)] = 1
        
    return ind_matrix

In [13]:
X_train, X_test, X_val = bag_of_words(cleaned_train_data, cleaned_test_data, cleaned_val_data, ngram = (1, 2), vocab = vocab)

In [14]:
y_train = emoji_to_int(train_data.iloc[:68750,:].label)
y_test = emoji_to_int(test_data.iloc[:6539,:].label)
y_val = emoji_to_int(val_data.iloc[:6505,:].label)

In [15]:
X_train, X_test, X_val = bag_of_words(train_data.iloc[:68750,:].text, test_data.iloc[:6539,:].text, val_data.iloc[:6505,:].text, ngram = (1, 2), vocab= None)

In [16]:
X_train.shape

(68750, 268399)

In [17]:
def get_model(hidden_units: int, input_dims: int, n_labels: int):
    model = Sequential()
    model.add(Dense(hidden_units, input_dim = input_dims, activation = 'relu'))
    model.add(Dropout(0.5)) # dropout is important to prevent model from overfitting
    model.add(Dense(n_labels, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

def preds_to_labels(ypred):
    """
    Firstly, extract the predicted label from a vector of probability distributions.
    Secondly, retrieve index of highest value (i.e., highest probability).
    """
    num_labels = [np.argmax(pred) for pred in ypred]
    return np.array(num_labels)

def accuracy_top_n(y_true, y_preds, top_n = 3):
    """
    If the correct label / emoji is among the top n (e.g., two, three) predictions,
    we consider the prediction as correctly labeled.
    """
    n_correct = 0
    n_total = 0
    
    for i, pred in enumerate(y_preds):
        top_3 = np.argsort(pred)[-top_n:]
        if y_true[i] in top_3:
            n_correct += 1
        n_total += 1
        
    ratio = n_correct / n_total
    return round(ratio, 4)

In [18]:
# get indicator matrix with one-hot-encoded vectors per label (of all labels)
y_train = to_cat_matrix(y_train)
y_val = to_cat_matrix(y_val)

In [19]:
X_train.sort_indices()
X_val.sort_indices()

In [20]:
# set number of hidden units, epochs and batch size
n_units = 60
n_epochs = 6
n_batches = 32

model = get_model(n_units, X_train.shape[1], y_train.shape[1])

es = EarlyStopping(monitor='val_acc', mode='max', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

2022-11-17 16:35:32.199578: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 16:35:32.326062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 16:35:32.327157: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 16:35:32.329037: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [21]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = n_epochs, 
          batch_size = n_batches, callbacks = [es, mc])

Epoch 1/6


  "shape. This may consume a large amount of memory." % value)
2022-11-17 16:35:35.552816: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f7785e51dd0>

In [22]:
# load best model
model.save('best_model.h5')
saved_model = load_model('best_model.h5')
# get predictions
y_pred_test = saved_model.predict(X_test)
# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [23]:
# if true label is among the top 3 predictions, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 3))
# if true label is among the top 2 predictions, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 2))
# if true label is among the top 1 prediction, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 1))

0.6908
0.5958
0.454


In [24]:
# f1_score(y_test, y_pred_labels, average = 'weighted')
# f1_score(y_test, y_pred_labels, average = 'micro')

In [25]:
print(classification_report(y_test, y_pred_labels, target_names=top_10_test))

              precision    recall  f1-score   support

           😍       0.48      0.59      0.53      1353
           😂       0.55      0.64      0.59      1233
          ❤️       0.42      0.49      0.45      1151
           💕       0.34      0.31      0.33       635
           😊       0.39      0.29      0.33       585
           😘       0.47      0.26      0.33       331
           😭       0.37      0.28      0.32       333
           💖       0.41      0.22      0.29       316
           😎       0.41      0.31      0.35       306
           ✨       0.45      0.38      0.41       296

    accuracy                           0.45      6539
   macro avg       0.43      0.38      0.39      6539
weighted avg       0.45      0.45      0.44      6539



In [26]:
y_train = emoji_to_int(train_labels)
y_test = emoji_to_int(test_labels)

In [27]:
# Print out example tweets
start = 100
finish = 150
for tweet, preds, true in zip(cleaned_test_data[start:finish], y_pred_test[start:finish], test_labels[start:finish]):
    print(tweet)
    pred = np.argsort(preds)
    print("true label:", true)
    print("prediction:", idx_emoji[pred[-1]])
    print("prediction:", idx_emoji[pred[-2]])
    print("prediction:", idx_emoji[pred[-3]])    
    print()

lovely evening w @user
true label: 💕
prediction: 😍
prediction: 😘
prediction: 😂

the shit @user
true label: 😂
prediction: 😍
prediction: 😎
prediction: 😂

i want swole but i enjoyed both
true label: 💕
prediction: ❤️
prediction: 😊
prediction: 😘

vibinn till i fall asleep ily lil yachty
true label: ❤️
prediction: 😂
prediction: ✨
prediction: 😘

final preparation for our # yearofmercy # interfaith event tomorrow ! # badges
true label: 💕
prediction: 😊
prediction: 😍
prediction: ❤️

what do you call a fish with no eyes ? fsh !
true label: 😂
prediction: 😍
prediction: 😂
prediction: 😭

when hes extra sweet to you but you use to dogs so you dont know how to accept him being sweet
true label: 😂
prediction: 💕
prediction: 😘
prediction: ❤️

tagged by @user well
true label: 😂
prediction: 💕
prediction: 😍
prediction: ❤️

endlessly and untiringly falling for your dimples
true label: 😍
prediction: 😂
prediction: 😭
prediction: 😍

00 months with this beautiful girl i love you sky
true label: ❤️
prediction: 😍
pr

In [28]:
# Count occurances of Emojis in the predictions
freq = {}
for pred in y_pred_labels:
    if idx_emoji[pred] in freq:
        freq[idx_emoji[pred]] += 1
    else:
        freq[idx_emoji[pred]] = 1 
print(freq)

{'💕': 586, '❤️': 1340, '😂': 1437, '😍': 1657, '😊': 438, '✨': 248, '😘': 179, '😭': 250, '😎': 234, '💖': 170}


In [29]:
# Count occurances of Emojis in the test set
freq = {}
for y_true in y_test:
    if idx_emoji[y_true] in freq:
        freq[idx_emoji[y_true]] += 1
    else:
        freq[idx_emoji[y_true]] = 1 
print(freq)

{'✨': 326, '😊': 519, '❤️': 1193, '😎': 307, '😭': 343, '😂': 1194, '😍': 1367, '💕': 650, '😘': 326, '💖': 314}
