In [1]:
import pandas as pd
# Widen the column display so long tweet strings are not cut off in outputs.
# NOTE(review): -1 is the legacy "no limit" sentinel; newer pandas releases
# expect None here instead — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

import re
import os
import wordsegment as ws
import preprocessor as p
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from collections import Counter

from keras.layers import Embedding, Dense, Input, MaxPooling2D, Dropout, LSTM, Bidirectional, Reshape
from keras.models import Model,Sequential
from keras.preprocessing.text import Tokenizer
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras import backend as K

# NOTE(review): this assignment comes after the keras imports above, and the
# cell output already reports the backend in use. Presumably the variable is
# only read when keras is first imported, so it may have no effect here —
# confirm, and move it above the keras imports if it is meant to matter.
os.environ['KERAS_BACKEND']='tensorflow'

Using TensorFlow backend.


## Loading data

In [23]:
# Tab-separated test input (per the file name: SemEval-2018 Task 3, subtask B).
# The raw tweet is read from the 'tweet text' column by the cells below.
data = pd.read_csv("./datasets/test/SemEval2018-T3_input_test_taskB_emoji.txt", sep="\t")
# Presumably loads wordsegment's corpus data, which ws.segment() (used in
# get_hashtags) needs before it can be called — confirm against wordsegment docs.
ws.load()

In [24]:
def get_hashtags(tweet):
    """Extract the hashtags of a tweet as space-separated, word-segmented text.

    The tweet is lower-cased and parsed with ``p.parse``; each hashtag has its
    leading character (the ``#``) dropped and is split into words with
    ``ws.segment``.

    Returns a 3-tuple:
        - all segmented hashtags joined by single spaces (``""`` if none),
        - the number of hashtags found,
        - a dict mapping each segmented hashtag to its number of occurrences.
    """
    found = p.parse(tweet.lower()).hashtags
    if found is None:
        # The parser reports "no hashtags" as None rather than an empty list.
        return "", 0, {}

    segmented = []
    counts = {}
    for tag in found:
        # tag.match holds the raw hashtag text including the leading '#'.
        words = ws.segment(tag.match[1:].lower())
        phrase = " ".join(words)
        counts[phrase] = counts.get(phrase, 0) + 1
        segmented.append(phrase)

    return " ".join(segmented), len(segmented), counts

def get_text(tweet):
    """Return the tweet body cleaned, stripped of punctuation and lower-cased.

    ``p.clean`` is applied first (in the sample output of this notebook the
    resulting text no longer contains mentions, hashtags or URLs); afterwards
    every character that is neither a word character nor whitespace is
    removed, and the result is lower-cased.
    """
    without_entities = p.clean(tweet)
    # Note that '_' counts as a word character and therefore survives.
    letters_only = re.sub(r'[^\w\s]', '', without_entities)
    return letters_only.lower()


def get_emotion(tweet):
    """Collect emoji aliases of the form ``:word_word:`` from a tweet.

    Only aliases containing at least one underscore between the colons match
    the pattern, so ``:smile:`` is ignored while ``:red_heart:`` is found.

    Returns a 2-tuple:
        - the alias names (colons stripped) in order of appearance,
        - a dict mapping each alias name to how often it occurs.
    """
    emotions = []
    emotion_keys = {}
    for token in re.findall(r":\w+_\w+:", tweet):
        name = token[1:-1]  # drop the surrounding colons
        emotions.append(name)
        emotion_keys[name] = emotion_keys.get(name, 0) + 1
    return emotions, emotion_keys

In [26]:
# get_hashtags returns one (text, count, dict) tuple per tweet; zip(*...)
# transposes those per-row tuples into three column-sized sequences.
data['hashtags'], data['length'], data['hashtag_dict'] = zip(*data['tweet text'].map(get_hashtags)) 
# Cleaned, punctuation-free, lower-cased tweet body.
data["tweet"] = data['tweet text'].map(get_text)
# Emoji-alias extraction is currently disabled:
# data['emotion'], data['emotion_dict'] = zip(*data['tweet'].map(get_emotion))
data.head()

Unnamed: 0,tweet index,tweet text,hashtags,length,hashtag_dict,tweet
0,1,@Callisto1947 Can U Help?||More conservatives needed on #TSU + get paid 4 posting stuff like this!||YOU $ can go to http://t.co/JUmMWi0AyT,tsu,1,{'tsu': 1},can u helpmore conservatives needed on get paid 4 posting stuff like thisyou can go to
1,2,"Just walked in to #Starbucks and asked for a ""tall blonde"" Hahahaha",starbucks,1,{'starbucks': 1},just walked in to and asked for a tall blonde hahahaha
2,3,GONNA WIN http://t.co/Mc9ebqjAqj,,0,{},gonna win
3,4,@mickymantell He is exactly that sort of person. Weirdo!,,0,{},he is exactly that sort of person weirdo
4,5,So much at work mate 10/10 #boring 100% #dead mate full on #shit absolutely #sleeping mate can't handle the,boring dead shit sleeping,4,"{'sleeping': 1, 'dead': 1, 'boring': 1, 'shit': 1}",so much at work mate 1010 100 mate full on absolutely mate cant handle the


## Preprocessing the data

In [5]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')

# Padded sequence lengths for the tweet text and the hashtag text.
# They are used by the vectorisation and model-building cells, so they must
# be defined here for the notebook to run on a fresh kernel. The values are
# fixed to the input shapes shown in the model summary further down
# (input_1: (None, 32), input_2: (None, 23)) so that data prepared here fits
# that model. They were originally derived from a dataset with:
#   max(len(x.split(' ')) for x in data['tweet'].tolist())
#   max(len(x.split(' ')) for x in data['hashtags'].tolist())
TEXT_LENGTH = 32
HASHTAG_LENGTH = 23

MAX_NUM_WORDS = 20000      # upper bound on vocabulary size per tokenizer
EMBEDDING_DIM = 100        # must match the GloVe file that is loaded (100d)
VALIDATION_SPLIT = 0.2
NUM_CLASSES = 4

In [6]:
# first, build index mapping words in the embeddings set
# to their embedding vector
import io

print('Indexing word vectors.')

# token -> float32 vector, one entry per line of the GloVe file.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.twitter.27B.100d.txt')
# The context manager closes the file even if a line fails to parse,
# which a bare open()/close() pair would not.
with io.open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 1193514 word vectors.


In [27]:
# vectorize the text samples into a 2D integer tensor
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def vectorize_data(text, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH):
    """Tokenize ``text`` and turn it into fixed-length integer sequences.

    A fresh ``Tokenizer`` (limited to ``MAX_NUM_WORDS``) is fitted on ``text``
    itself, the texts are converted to id sequences, and the sequences are
    padded/truncated to ``MAX_SEQUENCE_LENGTH`` with ``pad_sequences``.

    Returns a 2-tuple:
        - the padded sequences (ids exactly as produced by the tokenizer),
        - a dict mapping each token to its tokenizer index minus one, i.e.
          the returned indices are shifted down by one relative to the ids
          stored in the padded sequences.
    """
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(text)
    token_ids = tokenizer.texts_to_sequences(text)

    vocabulary = tokenizer.word_index
    print('Found %s unique tokens.' % len(vocabulary))

    padded = pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)
    # Shift every index down by one; the sequences above are left untouched.
    shifted_index = {token: rank - 1 for token, rank in vocabulary.items()}
    return padded, shifted_index

# labels = to_categorical(np.asarray(data['Label']))
# Tweet text and hashtag text are vectorised independently: every call to
# vectorize_data fits its own Tokenizer, so the two vocabularies are separate.
# NOTE(review): the vocabularies are built from this file's contents — confirm
# the resulting ids line up with the vocabulary the model weights were trained on.
x_tweet, tweet_token_index = vectorize_data(data['tweet'],MAX_NUM_WORDS,TEXT_LENGTH)
x_hashtags, ht_token_index = vectorize_data(data['hashtags'],MAX_NUM_WORDS,HASHTAG_LENGTH)

Found 2866 unique tokens.
Found 1016 unique tokens.


In [10]:
# Shuffle the rows of both inputs and the labels with one shared permutation.
# (The actual train/validation split below is commented out.)
# `indices` is kept so the original row order can be restored later.
# NOTE(review): the shuffle is unseeded, so the permutation differs per run.
# NOTE(review): `labels` is not assigned by any active line in the visible
# cells (its assignment is commented out) — this cell relies on kernel state.
indices = np.arange(x_tweet.shape[0])
np.random.shuffle(indices)
x_tweet = x_tweet[indices]
x_hashtags = x_hashtags[indices]
labels = labels[indices]
# num_validation_samples = int(VALIDATION_SPLIT * x_tweet.shape[0])

# x_tweet_train = x_tweet[:-num_validation_samples]
# x_hashtags_train = x_hashtags[:-num_validation_samples]
# y_train = labels[:-num_validation_samples]
# x_tweet_val = x_tweet[-num_validation_samples:]
# x_hashtags_val = x_hashtags[-num_validation_samples:]
# y_val = labels[-num_validation_samples:]

In [28]:
# prepare embedding matrix
def get_embedding_matrix(word_index):
    """Build the weight matrix for an Embedding layer from the GloVe vectors.

    Parameters
    ----------
    word_index : dict
        Token -> zero-based index, as returned by ``vectorize_data`` (which
        subtracts one from every tokenizer index).

    Returns
    -------
    numpy.ndarray of shape (num_words, EMBEDDING_DIM)
        Row ``k`` holds the vector of the token whose id in the padded
        sequences is ``k``. Rows for tokens missing from ``embeddings_index``
        stay all-zero, as does row 0 (the sequences are padded with 0 by
        pad_sequences' default).
    """
    num_words = min(MAX_NUM_WORDS, len(word_index)+1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        # word_index is zero-based, but the padded sequences still carry the
        # tokenizer's original ids, so the matching row is i + 1.
        # (Writing to i - 1 put the most frequent word in row -1, i.e. the
        # last row, and misaligned every other vector by two rows.)
        row = i + 1
        if row >= num_words:
            # Outside the truncated vocabulary; no row exists for this token.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[row] = embedding_vector
    return embedding_matrix

# One embedding matrix per vocabulary (tweet text and hashtag text).
tweet_emb = get_embedding_matrix(tweet_token_index)
hashtag_emb = get_embedding_matrix(ht_token_index)

## Building the Model

In [29]:
def blstm_maxpool(x, word_index, emb_matrix, num_filters, max_seq_len, learn_rate):
    """Encode an id sequence with frozen embeddings, a BiLSTM and max-over-time.

    Parameters
    ----------
    x : input tensor of token ids, ``max_seq_len`` ids per sample.
    word_index : vocabulary dict; only its size is used here.
    emb_matrix : initial (non-trainable) weights for the Embedding layer.
    num_filters : number of LSTM units per direction.
    max_seq_len : sequence length, also the pooling window.
    learn_rate : accepted for interface compatibility; not used in the body.

    Returns a tensor with ``2 * num_filters`` features per sample.
    """
    vocab_rows = min(MAX_NUM_WORDS, len(word_index)+1)
    encoded_width = 2 * num_filters

    embedded = Embedding(vocab_rows, EMBEDDING_DIM, weights=[emb_matrix],
                         input_length=max_seq_len, trainable=False)(x)
    states = Bidirectional(LSTM(num_filters, return_sequences=True))(embedded)

    # Add a trailing channel axis so MaxPooling2D can pool across all time
    # steps at once (window height == sequence length, width 1).
    as_image = Reshape((max_seq_len, encoded_width, 1))(states)
    pooled = MaxPooling2D(pool_size=(max_seq_len, 1), strides=None, padding='valid')(as_image)

    # Drop the singleton axes again, leaving one vector per sample.
    return Reshape((encoded_width,))(pooled)

In [30]:
# LSTM units per direction; each encoder therefore outputs 2 * NUM_FILTERS features.
NUM_FILTERS = 100
# Passed to blstm_maxpool as learn_rate, which does not use it in its body.
LEARNING_RATE = 0.005
# One integer-id input per text stream, with its own padded length.
tweet = Input(batch_shape=(None,TEXT_LENGTH), dtype='int32')
hashtag = Input(batch_shape=(None,HASHTAG_LENGTH), dtype='int32')

# Both streams use the same encoder architecture but separate layers/weights,
# so the two resulting vectors have the same width.
tweet_lstm_vec = blstm_maxpool(tweet,tweet_token_index,tweet_emb, NUM_FILTERS, TEXT_LENGTH, LEARNING_RATE)
ht_lstm_vec = blstm_maxpool(hashtag,ht_token_index,hashtag_emb, NUM_FILTERS, HASHTAG_LENGTH, LEARNING_RATE)

In [14]:
import tensorflow as tf

def holographic_merge(inp):
    """Combine two tensors by circular correlation, computed via the FFT.

    ``inp`` is a pair ``[a, b]``. The result is
    ``real(ifft(conj(fft(a)) * fft(b)))`` cast to float32, i.e. the circular
    correlation of ``a`` with ``b`` evaluated in the frequency domain.
    Assumes both tensors are real-valued and have the same shape — TODO confirm
    for callers other than the tweet/hashtag encoders in this notebook.
    """
    left, right = inp
    # Promote the real inputs to complex (zero imaginary part) for the FFT.
    left_spectrum = tf.fft(tf.complex(left, 0.0))
    right_spectrum = tf.fft(tf.complex(right, 0.0))
    correlated = tf.ifft(tf.conj(left_spectrum) * right_spectrum)
    return tf.cast(tf.real(correlated), 'float32')

In [15]:
from keras.layers import Lambda

# Merge the tweet and hashtag encodings with holographic_merge. No name is
# given, so Keras auto-names the layer; get_holographic_output later looks it
# up as 'lambda_1', which presumes this is the first Lambda layer created in
# the session — confirm when re-running cells.
h_circ = Lambda(holographic_merge)([tweet_lstm_vec,ht_lstm_vec])

In [16]:
# Dropout and dense layer

# Regularise the merged vector, then classify it into NUM_CLASSES classes.
# Note: h_circ is rebound here to the dropout output, not the raw merge output.
h_circ = Dropout(0.3)(h_circ)
preds = Dense(NUM_CLASSES, activation='softmax')(h_circ)

In [17]:
# Two-input model: (tweet ids, hashtag ids) -> class probabilities.
model = Model([tweet,hashtag],preds)

## Training

In [18]:
BATCH_SIZE = 20
EPOCHS = 10

# NOTE(review): the optimizer is given by name, so LEARNING_RATE is not
# applied here — confirm whether the default Adam learning rate is intended.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

print('Training')
# Trains on all rows of x_tweet / x_hashtags; no validation data is passed.
# NOTE(review): `labels` is not assigned by any active line in the visible
# cells — this call relies on kernel state from an earlier session/cell.
model.fit([x_tweet, x_hashtags], labels,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 32, 100)      822600      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 23, 100)      282500      input_2[0][0]                    
__________________________________________________________________________________________________
bidirectio

<keras.callbacks.History at 0x1f502db9f98>

## Testing on validation set

In [45]:
# NOTE(review): x_tweet_val, x_hashtags_val and y_val are only assigned in
# commented-out lines in the visible cells — this cell relies on kernel state.
temp = model.predict([x_tweet_val,x_hashtags_val])
# Collapse class probabilities / one-hot rows to class ids.
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(y_val, 1)
# average=None yields one score per class rather than a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred, digits=4))

Precision: [ 0.6614786   0.55795678]

Recall: [ 0.43037975  0.76549865]

f1_score: [ 0.52147239  0.64545455]

[[170 225]
 [ 87 284]]
:: Classification Report
             precision    recall  f1-score   support

          0     0.6615    0.4304    0.5215       395
          1     0.5580    0.7655    0.6455       371

avg / total     0.6113    0.5927    0.5815       766



## Getting holographic embeddings

In [19]:
def get_holographic_output(x_tweet, x_hashtags):
    """Return the output of the 'lambda_1' layer for the given inputs.

    Builds a sub-model that shares the inputs of the notebook-level ``model``
    and stops at its layer named 'lambda_1', runs ``predict`` on
    ``[x_tweet, x_hashtags]`` and returns the result as a numpy array.
    """
    merge_output = model.get_layer('lambda_1').output
    feature_extractor = Model(inputs=model.input, outputs=merge_output)
    features = feature_extractor.predict([x_tweet, x_hashtags])
    return np.array(list(features))

In [20]:
# Undo the earlier shuffle: sort the rows by their original position stored in
# `indices`, then unzip back into four lists. The values in `indices` are
# unique (a permutation of arange), so the sort is decided by the first tuple
# element alone and the array elements are never compared.
# After this line all four names are Python lists, not numpy arrays.
indices, x_tweet, x_hashtags, labels = (list(t) for t in zip(*sorted(zip(indices, x_tweet, x_hashtags, labels))))

In [31]:
# x_tweet / x_hashtags are lists at this point, so convert them back to arrays
# before feeding them to the model.
X_test = get_holographic_output(np.array(x_tweet),np.array(x_hashtags))

In [32]:
# Persist the merged-layer features and the labels; np.save appends '.npy'
# to the given file names.
np.save('taskB_test_holographic',X_test)
# NOTE(review): `labels` is not assigned by any active line in the visible
# cells — confirm it holds the intended values before relying on this file.
np.save('taskB_labels',labels)