In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import re
import os
import wordsegment as ws
import preprocessor as p
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from collections import Counter

from keras.layers import Embedding, Dense, Input, Flatten, MaxPooling1D, Embedding, Merge, Dropout, LSTM, Bidirectional
from keras.models import Model,Sequential
from keras.preprocessing.text import Tokenizer
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras import backend as K

os.environ['KERAS_BACKEND']='tensorflow'

In [4]:
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA.txt", sep="\t")
ws.load()

In [5]:
def get_hashtags(tweet):
    parsed_tweet = p.parse(tweet.lower())
    parsed_hashtags = parsed_tweet.hashtags
    all_hashtags = {}
    
    hashtags = []
    if parsed_hashtags is not None:
        for h in parsed_hashtags:
            temp = h.match[1:].lower()
            hashtag = " ".join(ws.segment(temp))
            if hashtag in all_hashtags:
                all_hashtags[hashtag] += 1
            else:
                all_hashtags[hashtag] = 1
            hashtags.append(hashtag)

    hashtags_str = (" ").join(hashtags)
    return hashtags_str, len(hashtags), all_hashtags

def get_text(tweet):
    clean_tweet = p.clean(tweet)
    clean_tweet = re.sub(r'[^\w\s]','',clean_tweet)
    return clean_tweet.lower()


def get_emotion(tweet):
    emotion_keys = {}
    result = re.findall(r":\w+_\w+:",tweet)
    if result is not None:
        emotions = []
        for i in range(len(result)):
            emotion = result[i][1:-1]
            emotions.append(emotion)
            if emotion in emotion_keys:
                emotion_keys[emotion] += 1
            else:
                emotion_keys[emotion] = 1
    return emotions, emotion_keys 

In [7]:
data['hashtags'], data['length'], data['hashtag_dict'] = zip(*data['Tweet text'].map(get_hashtags)) 
data["tweet"] = data['Tweet text'].map(get_text)
data['emotion'], data['emotion_dict'] = zip(*data['tweet'].map(get_emotion))
data.head()

Unnamed: 0,Tweet index,Label,Tweet text,hashtags,length,hashtag_dict,tweet,emotion,emotion_dict
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR,imagine no religion,2,"{'imagine': 1, 'no religion': 1}",sweet united nations video just in time for christmas,[],{}
1,2,1,@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing ;),,0,{},we are rumored to have talked to ervs agent and the angels asked about ed escobar thats hardly nothing,[],{}
2,3,1,Hey there! Nice to see you Minnesota/ND Winter Weather,,0,{},hey there nice to see you minnesotand winter weather,[],{}
3,4,0,3 episodes left I'm dying over here,,0,{},3 episodes left im dying over here,[],{}
4,5,1,I can't breathe! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian,,0,{},i cant breathe was chosen as the most notable quote of the year in an annual list released by a yale university librarian,[],{}


In [61]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')
TEXT_LENGTH = max(len(x.split(' ')) for x in data['tweet'].tolist())
HASHTAG_LENGTH = max(len(x.split(' ')) for x in data['hashtags'].tolist())
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
NUM_CLASSES = 2

In [29]:
# first, build index mapping words in the embeddings set
# to their embedding vector
import io

print('Indexing word vectors.')

embeddings_index = {}
f = io.open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.100d.txt'),encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 1193514 word vectors.


In [37]:
# vectorize the text samples into a 2D integer tensor
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def vectorize_data(text, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

labels = to_categorical(np.asarray(data['Label']))
x_tweet, tweet_token_size = vectorize_data(data['tweet'],MAX_NUM_WORDS,TEXT_LENGTH)
x_hashtags, ht_token_size = vectorize_data(data['hashtags'],MAX_NUM_WORDS,HASHTAG_LENGTH)

Found 8562 unique tokens.
Found 2833 unique tokens.


In [36]:
# split the data into a training set and a validation set
indices = np.arange(x_tweet.shape[0])
np.random.shuffle(indices)
x_tweet = x_tweet[indices]
x_hashtags = x_hashtags[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * x_tweet.shape[0])

x_tweet_train = x_tweet[:-num_validation_samples]
x_hashtags_train = x_hashtags[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_tweet_val = x_tweet[-num_validation_samples:]
x_hashtags_val = x_hashtags[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [43]:
# prepare embedding matrix
def get_embedding_matrix(word_index):
    num_words = min(MAX_NUM_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i-1] = embedding_vector
    return embedding_matrix

tweet_emb = get_embedding_matrix(tweet_token_size)
hashtag_emb = get_embedding_matrix(ht_token_size)

In [64]:
def blstm_maxpool(x, word_index, emb_matrix, num_filters, max_seq_len, learn_rate):
    num_words = min(MAX_NUM_WORDS, len(word_index))
    y = Embedding(num_words,EMBEDDING_DIM,weights=[emb_matrix],
                                        input_length=max_seq_len,trainable=False)(x)
    y = Bidirectional(LSTM(num_filters, return_sequences=True))(y)
    y = MaxPooling1D(pool_size=max_seq_len, strides=None, padding='valid')(y)
    return y

In [65]:
NUM_FILTERS = 100
LEARNING_RATE = 0.005
tweet = Input(shape=(TEXT_LENGTH,), dtype='int32')
hashtag = Input(shape=(HASHTAG_LENGTH,), dtype='int32')

tweet_lstm_vec = blstm_maxpool(tweet,tweet_token_size,tweet_emb, NUM_FILTERS, TEXT_LENGTH, LEARNING_RATE)
ht_lstm_vec = blstm_maxpool(hashtag,ht_token_size,hashtag_emb, NUM_FILTERS, HASHTAG_LENGTH, LEARNING_RATE)

In [66]:
import tensorflow as tf

class HolographicMerge(Layer):

    def __init__(self, **kwargs):
        super(HolographicMerge, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1],),
                                      initializer='random_normal',
                                      trainable=True)
        super(HolographicMerge, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))
        
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


def holographic_merge(a,b):
    a_fft = tf.fft(tf.complex(a, 0.0))
    b_fft = tf.fft(tf.complex(b, 0.0))
    ifft = tf.ifft(tf.conj(a_fft) * b_fft)
    return tf.cast(tf.real(ifft), 'float32') 

In [67]:
h_circ = holographic_merge(tweet_lstm_vec,ht_lstm_vec)

In [68]:
# Dropout and dense layer

h_circ = Dropout(0.3)(h_circ)
preds = Dense(NUM_CLASSES, activation='softmax')(h_circ)

In [69]:
model = Model([tweet,hashtag],preds)

AttributeError: 'Tensor' object has no attribute '_keras_history'