In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/twitter-data/newData.csv


In [3]:
# Load the tweet dataset; later cells read its 'label_to_emoji', 'newLabel' and 'pre_punc' columns.
df = pd.read_csv('../input/twitter-data/newData.csv')
# Emoji that sentiment_map treats as positive (mapped to 1).
# NOTE(review): '😢' (crying face) is listed here as positive — confirm this is intended.
positive =  ['😊','😉','😀','😃','🌚','😇','🎉','😅','😆','😎','😏','😈','😋','😜','😝','😌','😂','😁','😄','💕','💖','💗','💙','💛','💜','💚','💞','😍','😘','😢','✅','💯','🔥','👍','👏','👌','👀','💪','🙏','🙌','🙈','✨','🎶','👇','👑','😬','😱','😳']
# Emoji regarded as negative. sentiment_map never reads this list: any emoji
# that is not in `positive` maps to 0, whether or not it appears here.
negative  =  ['😭','😡','😤','😑','😒','💀','😣','😩','😪','😫','💔','😴','😐','😔','😕','😞']
def sentiment_map(emoji):
    """Return 1 when ``emoji`` appears in the module-level ``positive`` list, else 0."""
    return 1 if emoji in positive else 0
# Derive a binary 'sentiment' column (1 = emoji in `positive`, 0 = anything else) from 'label_to_emoji'.
df['sentiment'] = df['label_to_emoji'].map(sentiment_map)

In [4]:
# Lookup table assigning each of the 64 emoji a distinct integer index from 0 to 63.
emoji_to_idx = {'✅': 0, '✨': 1, '🌚': 2, '🎉': 3, '🎶': 4, '👀': 5, '👇': 6, '👌': 7, '👍': 8, '👏': 9, '👑': 10, '💀': 11, '💔': 12, '💕': 13, '💖': 14, '💗': 15, '💙': 16,\
                    '💚': 17, '💛': 18, '💜': 19, '💞': 20, '💪': 21, '💯': 22, '🔥': 23, '😀': 24, '😁': 25, '😂': 26, '😃': 27, '😄': 28, '😅': 29, '😆': 30, '😇': 31, '😈': 32,\
                    '😉': 33, '😊': 34, '😋': 35, '😌': 36, '😍': 37, '😎': 38, '😏': 39, '😐': 40, '😑': 41, '😒': 42, '😔': 43, '😕': 44, '😘': 45, '😜': 46, '😝': 47, '😞': 48,\
                    '😡': 49, '😢': 50, '😣': 51, '😤': 52, '😩': 53, '😪': 54, '😫': 55, '😬': 56, '😭': 57, '😱': 58, '😳': 59, '😴': 60, '🙈': 61, '🙌': 62, '🙏': 63}

In [5]:
# Tabulate the emoji -> index lookup: one row per emoji, in dictionary order.
mappings = pd.DataFrame(
    {
        'emoticons': list(emoji_to_idx.keys()),
        'number': list(emoji_to_idx.values()),
    }
)

In [6]:
mappings

Unnamed: 0,emoticons,number
0,✅,0
1,✨,1
2,🌚,2
3,🎉,3
4,🎶,4
...,...,...
59,😳,59
60,😴,60
61,🙈,61
62,🙌,62


In [7]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# errors='ignore' keeps this cell re-runnable: because `df` is overwritten, a
# second execution would otherwise raise KeyError on the already-dropped column.
df  = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [8]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing, stratified on newLabel so both parts keep
# the same class proportions. A fixed random_state makes the split (and every
# number derived from it) reproducible across kernel restarts.
train_data, test_data = train_test_split(df, test_size = 0.2, stratify = df.newLabel, random_state = 42)

In [9]:
# Row counts of the two splits; train_length is later used to slice the combined encoded corpus.
train_length, test_length = len(train_data), len(test_data)

In [10]:
# tokenize the sentences
def tokenize(tweets):
    """Collect the items of ``tweets`` into a new list, in iteration order.

    Despite the name, no splitting is performed here: every tweet is copied
    through unchanged.
    """
    return [tweet for tweet in tweets]

In [11]:
from keras.preprocessing.text import Tokenizer

In [12]:
def encod_tweets(tweets):
    """Fit a Keras ``Tokenizer`` on ``tweets`` and encode those same tweets.

    The tokenizer lower-cases the text, splits on single spaces and strips the
    punctuation characters listed in ``filters``.

    Returns:
        A ``(tokenizer, sequences)`` pair: the fitted tokenizer and the result
        of ``texts_to_sequences`` applied to ``tweets``.
    """
    fitted = Tokenizer(
        lower=True,
        split=" ",
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    )
    fitted.fit_on_texts(tweets)
    sequences = fitted.texts_to_sequences(tweets)
    return fitted, sequences

In [13]:
from keras.preprocessing.sequence import pad_sequences

In [21]:
def format_data(encoded_tweets, max_length, labels, num_classes=7):
    """Pad the encoded tweets and one-hot encode their labels.

    Args:
        encoded_tweets: Integer sequences, one per tweet.
        max_length: ``maxlen`` passed to ``pad_sequences``; padding is applied
            at the end of each sequence (``padding='post'``).
        labels: Iterable of integer class labels, one per tweet.
        num_classes: Width of each one-hot vector. Defaults to 7, the value
            that used to be hard-coded.

    Returns:
        ``(x, y)``: ``x`` is the array returned by ``pad_sequences`` and ``y``
        is a float array of shape ``(len(labels), num_classes)``.

    Raises:
        IndexError: If ``label - 1`` falls outside ``[-num_classes, num_classes)``.

    Note:
        Label ``k`` sets column ``k - 1``. A label of 0 therefore becomes
        index -1 and wraps to the LAST column rather than raising.
        Predictions must be decoded with this same convention.
    """
    x = pad_sequences(encoded_tweets, maxlen= max_length, padding='post')
    # An explicit integer dtype keeps the fancy index valid for empty input too.
    columns = np.asarray(list(labels), dtype=np.intp) - 1
    y = np.zeros((columns.size, num_classes))
    # Set one cell per row in a single vectorised assignment.
    y[np.arange(columns.size), columns] = 1
    return x, y

In [22]:
# create weight matrix from pre trained embeddings
def create_weight_matrix(vocab, raw_embeddings, embedding_dim=300):
    """Build an embedding matrix whose row ``idx`` holds the vector of the word mapped to ``idx``.

    Args:
        vocab: Mapping of word -> integer row index.
        raw_embeddings: Container supporting ``word in raw_embeddings`` and
            ``raw_embeddings[word]``, yielding a vector of length
            ``embedding_dim``.
        embedding_dim: Number of columns in the matrix. Defaults to 300, the
            width that used to be hard-coded.

    Returns:
        A float array of shape ``(len(vocab) + 1, embedding_dim)``. Rows for
        words missing from ``raw_embeddings`` stay all-zero.
    """
    # One more row than there are words.
    # presumably indices run 1..len(vocab) with row 0 reserved for padding — verify against the tokenizer
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding

In [24]:
# final model
def final_model(weight_matrix, vocab_size, max_length, x, y, epochs = 1, x_eval = None, y_eval = None):
    """Build, train and evaluate the stacked bidirectional-LSTM classifier.

    Architecture: a trainable, zero-masking ``Embedding`` (width 300) initialised
    from ``weight_matrix``, two ``Bidirectional(LSTM(128, dropout=0.2))`` layers
    (the first returns full sequences) and a 7-way softmax ``Dense`` head.
    Compiled with categorical cross-entropy, Adam and the accuracy metric.

    Args:
        weight_matrix: Initial embedding weights.
        vocab_size: Number of rows of the embedding layer.
        max_length: ``input_length`` of the embedding layer.
        x, y: Training inputs and one-hot targets; 25% is held out by
            ``fit`` as validation data (``validation_split=0.25``).
        epochs: Number of training epochs.
        x_eval, y_eval: Data passed to ``model.evaluate``. When both are
            omitted the function falls back to the notebook-level ``x_test`` /
            ``y_test`` that it previously always read implicitly.

    Returns:
        ``(model, score, acc)``: the trained model and the two values returned
        by ``model.evaluate``.

    Raises:
        ValueError: If only one of ``x_eval`` / ``y_eval`` is supplied.
    """
    if x_eval is None and y_eval is None:
        # Backward compatibility: existing callers depend on these globals.
        x_eval, y_eval = x_test, y_test
    elif x_eval is None or y_eval is None:
        raise ValueError('x_eval and y_eval must be supplied together')
    embedding_layer = Embedding(vocab_size, 300, weights=[weight_matrix], input_length=max_length, trainable=True, mask_zero=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.2)))
    model.add(Dense(7, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, epochs = epochs, validation_split = 0.25)
    score, acc = model.evaluate(x_eval, y_eval)
    return model, score, acc

In [25]:
import math

In [26]:
# Build one corpus with the train tweets first and the test tweets after them;
# later cells rely on this order when slicing at train_length.
tokenized_tweets = tokenize(train_data['pre_punc']) + tokenize(test_data['pre_punc'])
# Despite its name, max_length is the MEAN number of space-separated words per
# tweet, rounded up — not the length of the longest tweet.
total_words = sum(len(tweet.split(" ")) for tweet in tokenized_tweets)
max_length = math.ceil(total_words / len(tokenized_tweets))
tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(17, 567758)

In [27]:
# The first train_length encoded tweets are the training rows, because the corpus
# was built as train tweets followed by test tweets.
x, y = format_data(encoded_tweets[:train_length], max_length, train_data['newLabel'])
# Display both lengths as the cell output to confirm inputs and targets line up.
len(x), len(y)

(454206, 454206)

In [28]:
# Everything after the first train_length encoded tweets belongs to the test split.
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, test_data['newLabel'])
# Display both lengths as the cell output to confirm inputs and targets line up.
len(x_test), len(y_test)

(113552, 113552)

In [29]:
# The fitted tokenizer's word_index; create_weight_matrix iterates it as word -> row index pairs.
vocab = tokenizer.word_index

In [30]:
from gensim.models.keyedvectors import KeyedVectors

In [31]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line holds a word followed by its space-separated coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt','r',encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2196017 word vectors.


In [32]:
# Look up a pre-trained vector for every vocabulary word; words absent from the
# GloVe dictionary keep an all-zero row.
weight_matrix = create_weight_matrix(vocab, embeddings_index)
# Display the row count, which is len(vocab) + 1.
len(weight_matrix)

108443

In [33]:
# Train for 5 epochs. vocab_size is len(vocab)+1, the same row count that
# create_weight_matrix allocates for weight_matrix.
model, score, acc = final_model(weight_matrix, len(vocab)+1, max_length, x, y, epochs = 5)
# Display the (model, score, acc) tuple as the cell output.
model, score, acc

2022-11-25 08:44:44.967226: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-25 08:44:45.054867: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-25 08:44:45.055651: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-25 08:44:45.058394: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/5


2022-11-25 08:45:03.388361: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
  172/10646 [..............................] - ETA: 4:59 - loss: 1.0055 - accuracy: 0.6399

KeyboardInterrupt: 

In [None]:
# Make sure the output directory exists (no error if it already does), then
# write the trained model beneath it.
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model_newLabel')

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Raw model outputs for the test inputs: one softmax vector (7 values) per tweet.
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Collapse the per-class probabilities into one class index per tweet.
# The ndim guard keeps this cell idempotent: y_pred is overwritten, so without
# it a second run would take argmax of scalars and turn every prediction into 0.
if np.ndim(y_pred) == 2:
    y_pred = np.argmax(y_pred, axis=1)
# Decode the ground truth from the same one-hot targets used for training.
# format_data stores label k at index k - 1, so the raw newLabel values are NOT
# in the same space as the argmax indices; comparing them directly mis-scores
# every class.
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred))

In [None]:
# Show a sample of test tweets next to their predicted and actual labels.
sample_tweets = test_data['pre_punc']
for i in range(200, min(300, len(y_pred))):
    # .iloc is positional, matching the row order of y_pred / y_true.
    # Plain [i] looks up the index LABEL i, which after the shuffled split either
    # returns a different tweet or raises KeyError (previously swallowed by a
    # bare except, silently skipping rows).
    test_tweet = sample_tweets.iloc[i]
    pred_label = y_pred[i]
    actual_label = y_true[i]
    print('tweet: ', test_tweet)
    print('pred label: ', pred_label)
    print('actual label: ', actual_label)
    print('-'*50)

In [None]:
# emoji_pred = [mappings[mappings['number'] == pred]['emoticons'] for pred in y_pred]

In [None]:
# for i in range(150, 200):
#     try:
#         test_tweet = test_data['pre_punc'][i]
#         pred_label = y_pred[i]
#         pred_emoji = emoji_pred[i]
#         print('tweet: ', test_tweet)
#         print('pred emoji: ', pred_emoji)
#         print('-'*50)
#     except:
#         continue

In [None]:
# Recursively zip the ./saved_model directory into an archive named newLabel.
! zip -r newLabel ./saved_model