In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/another50k/Mapping.csv
/kaggle/input/another50k/Train.csv
/kaggle/input/another50k/Test.csv


In [2]:
# Read the train split, the test split and the label -> emoji mapping table.
train_data = pd.read_csv("../input/another50k/Train.csv")
test_data = pd.read_csv("../input/another50k/Test.csv")
mappings = pd.read_csv("../input/another50k/Mapping.csv")

# Preview the first rows of each frame, in the same order they were loaded.
for frame in (train_data, test_data, mappings):
    print(frame.head())

                                                TEXT  Label
0  Vacation wasted ! #vacation2017 #photobomb #ti...      0
1  Oh Wynwood, you’re so funny! : @user #Wynwood ...      1
2  Been friends since 7th grade. Look at us now w...      2
3  This is what it looks like when someone loves ...      3
4  RT @user this white family was invited to a Bl...      3
                                                TEXT  Label
0  A little throwback with my favourite person @ ...      0
1  glam on @user yesterday for #kcon makeup using...      7
2  Democracy Plaza in the wake of a stunning outc...     11
3   Then &amp; Now. VILO @ Walt Disney Magic Kingdom      0
4               Who never... @ A Galaxy Far Far Away      2
   Unnamed: 0 emoticons  number
0           0         😜       0
1           1         📸       1
2           2         😍       2
3           3         😂       3
4           4         😉       4


In [3]:
# (rows, columns) of the train, test and emoji-mapping frames.
train_data.shape, test_data.shape, mappings.shape

((70000, 2), (50006, 2), (20, 3))

In [4]:
# Row counts of both splits; train_length is used further down to cut the
# jointly encoded tweets back into their train and test parts.
train_length, test_length = len(train_data), len(test_data)
train_length, test_length

(70000, 50006)

In [5]:
from nltk.corpus import stopwords

In [6]:
# NLTK's English stop-word list; show the first five entries as a sanity check.
stop_words = stopwords.words("english")
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [7]:
# tokenize the sentences
def tokenize(tweets, stop_words=None):
    """Strip @handles, stop words and hashtag markers from raw tweets.

    Args:
        tweets: Iterable of tweet strings.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English list (``stopwords.words("english")``) is used.
            Matching is case-sensitive, exactly as the words appear in the
            tweet.

    Returns:
        list[str]: One cleaned string per tweet, in input order. Every kept
        word is followed by a single space, so a non-empty result ends with a
        trailing space.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests; the plain list would be
    # scanned once per word of every tweet.
    stop_words = set(stop_words)

    tokenized_tweets = []
    for tweet in tweets:
        kept = []
        # split all words in the tweet
        for word in tweet.split(" "):
            # Repeated, leading or trailing spaces yield empty strings, which
            # would raise IndexError on word[0] below.
            if not word:
                continue
            # remove @handles -> useless -> no information; drop stop words too
            if word[0] == "@" or word in stop_words:
                continue
            # if a hashtag, remove # -> adds no new information
            if word[0] == "#":
                word = word[1:]
            kept.append(word + " ")
        tokenized_tweets.append("".join(kept))
    return tokenized_tweets

In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [9]:
# translate tweets to a sequence of numbers
def encod_tweets(tweets):
    """Fit a Keras ``Tokenizer`` on ``tweets`` and encode those same tweets.

    Text is lower-cased, split on single spaces, and stripped of the
    punctuation characters listed in ``filters``.

    Args:
        tweets: List of tweet strings.

    Returns:
        tuple: ``(tokenizer, sequences)`` where ``tokenizer`` is the fitted
        ``Tokenizer`` and ``sequences`` is the result of
        ``texts_to_sequences(tweets)``.
    """
    fitted_tokenizer = Tokenizer(
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        split=" ",
        lower=True,
    )
    fitted_tokenizer.fit_on_texts(tweets)
    sequences = fitted_tokenizer.texts_to_sequences(tweets)
    return fitted_tokenizer, sequences
def format_data(encoded_tweets, max_length, labels, num_classes=20):
    """Build model-ready inputs and one-hot targets.

    Args:
        encoded_tweets: List of integer sequences, one per tweet.
        max_length: Length every sequence is padded or truncated to.
        labels: Iterable of integer class ids, one per tweet.
        num_classes: Width of the one-hot target vectors (default 20, the
            value that was previously hard-coded).

    Returns:
        tuple: ``(x, y)`` where ``x`` is the output of ``pad_sequences`` and
        ``y`` is a float array of shape ``(len(labels), num_classes)`` with a
        single 1 per row.

    Raises:
        IndexError: If a label falls outside ``[-num_classes, num_classes)``.
    """
    # Zeros are appended after the tokens (padding='post').
    # NOTE(review): the truncation side is left at the Keras default - confirm
    # which end of over-long tweets gets cut.
    x = pad_sequences(encoded_tweets, maxlen=max_length, padding='post')

    # Vectorised one-hot encoding: one fancy-index assignment instead of a
    # Python loop, and a well-formed (0, num_classes) result for empty input.
    label_ids = np.asarray(labels, dtype=int).reshape(-1)
    y = np.zeros((label_ids.shape[0], num_classes))
    y[np.arange(label_ids.shape[0]), label_ids] = 1
    return x, y
def create_weight_matrix(vocab, raw_embeddings, embedding_dim=300):
    """Assemble an embedding matrix aligned with the tokenizer vocabulary.

    Args:
        vocab: Mapping of word -> integer row index.
        raw_embeddings: Container supporting ``word in raw_embeddings`` and
            ``raw_embeddings[word]``, returning a vector of length
            ``embedding_dim``.
        embedding_dim: Width of each embedding vector (default 300, the value
            that was previously hard-coded).

    Returns:
        numpy.ndarray: Array of shape ``(len(vocab) + 1, embedding_dim)``.
        Rows for words missing from ``raw_embeddings`` stay all-zero.
    """
    # One extra row beyond the vocabulary size.
    # NOTE(review): presumably row 0 is the padding index and word ids start
    # at 1 - confirm against the tokenizer.
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in vocab.items():
        if word in raw_embeddings:
            weight_matrix[idx] = raw_embeddings[word]
    return weight_matrix

In [10]:
# final model
def final_model(weight_matrix, vocab_size, max_length, x, y,
                x_eval=None, y_eval=None, epochs=1):
    """Build, train and evaluate the two-layer bidirectional LSTM classifier.

    Args:
        weight_matrix: 2-D array of initial embedding weights with shape
            ``(vocab_size, embedding_dim)``.
        vocab_size: Number of rows in the embedding layer.
        max_length: Length of every input sequence.
        x: Training inputs.
        y: One-hot training targets, shape ``(n_samples, n_classes)``.
        x_eval: Inputs used for the final ``model.evaluate`` call. When both
            ``x_eval`` and ``y_eval`` are omitted, the notebook-level
            ``x_test`` / ``y_test`` globals are used, matching the original
            behaviour.
        y_eval: One-hot targets matching ``x_eval``.
        epochs: Number of training epochs (default 1, as before).

    Returns:
        tuple: ``(model, score, acc)`` - the trained model and the loss and
        accuracy returned by ``model.evaluate`` on the evaluation set.

    Raises:
        ValueError: If only one of ``x_eval`` / ``y_eval`` is supplied.
    """
    if (x_eval is None) != (y_eval is None):
        raise ValueError("x_eval and y_eval must be provided together")
    if x_eval is None:
        # Backward-compatible fallback to the globals the original relied on.
        x_eval, y_eval = x_test, y_test

    # Derive sizes from the data instead of hard-coding 300 and 20.
    embedding_dim = np.shape(weight_matrix)[1]
    num_classes = np.shape(y)[1]

    embedding_layer = Embedding(vocab_size, embedding_dim, weights=[weight_matrix], input_length=max_length, trainable=True, mask_zero=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # A quarter of the training data is held out for validation during fitting.
    model.fit(x, y, epochs=epochs, validation_split=0.25)
    score, acc = model.evaluate(x_eval, y_eval)
    return model, score, acc

In [11]:
import math

In [12]:
# Clean train and test tweets and keep them in one list (train first) so a
# single tokenizer is fitted on both.
tokenized_tweets = tokenize(train_data['TEXT']) + tokenize(test_data['TEXT'])

# Despite its name, max_length is the *mean* number of space-separated pieces
# per cleaned tweet, rounded up; it is the length sequences are padded to.
piece_counts = [len(cleaned.split(" ")) for cleaned in tokenized_tweets]
max_length = math.ceil(sum(piece_counts) / len(tokenized_tweets))

tokenizer, encoded_tweets = encod_tweets(tokenized_tweets)
max_length, len(tokenized_tweets)

(10, 120006)

In [13]:
# The first train_length encoded tweets are the training rows, because the
# train tweets were tokenized first.
x, y = format_data(encoded_tweets[:train_length], max_length, train_data['Label'])
len(x), len(y)

(70000, 70000)

In [14]:
# Everything after the first train_length encoded tweets belongs to the test set.
x_test, y_test = format_data(encoded_tweets[train_length:], max_length, test_data['Label'])
len(x_test), len(y_test)

(50006, 50006)

In [15]:
# word -> integer index mapping held by the fitted tokenizer, and its size.
vocab = tokenizer.word_index
len(vocab)

97462

In [16]:
from gensim.models.keyedvectors import KeyedVectors

In [17]:
# %pip installs into the running kernel's environment; the version is pinned
# so a re-run resolves the same gdown release.
%pip install -q gdown==4.5.3
# Fetch the pretrained word-vector file from Google Drive into the working directory.
!gdown 0B13VF_-CUsHPN0dveFZBODlUU00

Collecting gdown
  Downloading gdown-4.5.3.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.5.3-py3-none-any.whl size=14821 sha256=cc0022838cf4dee74c5dfcfde7b8231f914785466d2e437a7a864b62e06ed1ed
  Stored in directory: /root/.cache/pip/wheels/94/8d/0b/bdcd83555c3555f91a33f6c2384428d9f163c7d75ab0d272b4
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.5.3
[0mDownloading...
From: https://drive.google.com/uc?id=0B13VF_-CUsHPN0dveFZBODlUU00
To: /kaggle/working/model_swm_300-6-10-low.w2v
100%|█████████████████████████████████████████| 687M/687M [00:04<00:00, 155MB/s]


In [18]:
# Load the word vectors from the working directory; binary=False means the
# file is parsed as word2vec text format.
raw_embeddings = KeyedVectors.load_word2vec_format('model_swm_300-6-10-low.w2v', binary=False)

In [19]:
# One embedding row per vocabulary index plus one extra row; show the row count.
weight_matrix = create_weight_matrix(vocab, raw_embeddings)
len(weight_matrix)

97463

In [20]:
# Train on (x, y). score and acc are the loss and accuracy that final_model
# obtains from model.evaluate after fitting.
model, score, acc = final_model(weight_matrix, len(vocab)+1, max_length, x, y)
model, score, acc

2022-11-17 13:56:18.361785: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 13:56:18.464044: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 13:56:18.465115: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-17 13:56:18.467210: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil



(<keras.engine.sequential.Sequential at 0x7fb54af52d50>,
 3.773322105407715,
 0.06711194664239883)

In [21]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 300)           29238900  
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 256)           439296    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 20)                5140      
Total params: 30,077,576
Trainable params: 30,077,576
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Softmax output for every test tweet: one row per tweet, one column per class.
y_pred = model.predict(x_test)

In [23]:
from sklearn.metrics import classification_report
# Collapse each probability row to the id of its most likely class.
# The ndim guard makes this cell safe to re-run: once y_pred already holds
# class ids, taking argmax again would silently turn every prediction into 0.
y_pred = np.asarray(y_pred)
if y_pred.ndim > 1:
    y_pred = y_pred.argmax(axis=1)
y_true = np.array(test_data['Label'])
# zero_division=0 reports 0.0 for classes that were never predicted, without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     10760
           1       0.04      0.01      0.01      5280
           2       0.05      0.10      0.07      5241
           3       0.02      0.08      0.04      2886
           4       0.00      0.00      0.00      2518
           5       0.03      0.02      0.02      2317
           6       0.03      0.01      0.02      2049
           7       0.03      0.04      0.03      1894
           8       0.03      0.00      0.00      1796
           9       0.04      0.42      0.08      1671
          10       0.00      0.00      0.00      1544
          11       0.51      0.59      0.55      1528
          12       0.42      0.46      0.44      1462
          13       0.02      0.01      0.01      1346
          14       0.05      0.02      0.02      1377
          15       0.02      0.01      0.01      1250
          16       0.04      0.03      0.04      1306
          17       0.01    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Translate predicted class ids into emoji characters.
# The lookup table is built once from the mapping frame, so the frame is not
# filtered again for every prediction, and each result is a plain emoji
# string (None for an unmapped id) rather than a one-element pandas Series.
number_to_emoji = dict(zip(mappings['number'], mappings['emoticons']))
emoji_pred = [number_to_emoji.get(int(pred)) for pred in y_pred]