In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import re
import os
import wordsegment as ws
import preprocessor as p
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from collections import Counter

from keras.layers import Embedding, Dense, Input, MaxPooling2D, Dropout, LSTM, Bidirectional, Reshape
from keras.models import Model,Sequential
from keras.preprocessing.text import Tokenizer
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras import backend as K

os.environ['KERAS_BACKEND']='tensorflow'

Using TensorFlow backend.


## Loading data

In [3]:
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA.txt", sep="\t")
ws.load()

In [4]:
def get_hashtags(tweet):
    parsed_tweet = p.parse(tweet.lower())
    parsed_hashtags = parsed_tweet.hashtags
    all_hashtags = {}
    
    hashtags = []
    if parsed_hashtags is not None:
        for h in parsed_hashtags:
            temp = h.match[1:].lower()
            hashtag = " ".join(ws.segment(temp))
            if hashtag in all_hashtags:
                all_hashtags[hashtag] += 1
            else:
                all_hashtags[hashtag] = 1
            hashtags.append(hashtag)

    hashtags_str = (" ").join(hashtags)
    return hashtags_str, len(hashtags), all_hashtags

def get_text(tweet):
    clean_tweet = p.clean(tweet)
    clean_tweet = re.sub(r'[^\w\s]','',clean_tweet)
    return clean_tweet.lower()


def get_emotion(tweet):
    emotion_keys = {}
    result = re.findall(r":\w+_\w+:",tweet)
    if result is not None:
        emotions = []
        for i in range(len(result)):
            emotion = result[i][1:-1]
            emotions.append(emotion)
            if emotion in emotion_keys:
                emotion_keys[emotion] += 1
            else:
                emotion_keys[emotion] = 1
    return emotions, emotion_keys 

In [5]:
data['hashtags'], data['length'], data['hashtag_dict'] = zip(*data['Tweet text'].map(get_hashtags)) 
data["tweet"] = data['Tweet text'].map(get_text)
data['emotion'], data['emotion_dict'] = zip(*data['tweet'].map(get_emotion))
data.head()

Unnamed: 0,Tweet index,Label,Tweet text,hashtags,length,hashtag_dict,tweet,emotion,emotion_dict
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR,imagine no religion,2,"{'no religion': 1, 'imagine': 1}",sweet united nations video just in time for christmas,[],{}
1,2,1,@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing ;),,0,{},we are rumored to have talked to ervs agent and the angels asked about ed escobar thats hardly nothing,[],{}
2,3,1,Hey there! Nice to see you Minnesota/ND Winter Weather,,0,{},hey there nice to see you minnesotand winter weather,[],{}
3,4,0,3 episodes left I'm dying over here,,0,{},3 episodes left im dying over here,[],{}
4,5,1,I can't breathe! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian,,0,{},i cant breathe was chosen as the most notable quote of the year in an annual list released by a yale university librarian,[],{}


## Preprocessing the data

In [6]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')
TEXT_LENGTH = max(len(x.split(' ')) for x in data['tweet'].tolist())
HASHTAG_LENGTH = max(len(x.split(' ')) for x in data['hashtags'].tolist())
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
NUM_CLASSES = 2

In [7]:
# first, build index mapping words in the embeddings set
# to their embedding vector
import io

print('Indexing word vectors.')

embeddings_index = {}
f = io.open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.100d.txt'),encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 1193514 word vectors.


In [8]:
# vectorize the text samples into a 2D integer tensor
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def vectorize_data(text, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH):
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    
    for word,idx in word_index.items():
        word_index[word] = idx - 1

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

labels = to_categorical(np.asarray(data['Label']))
x_tweet, tweet_token_index = vectorize_data(data['tweet'],MAX_NUM_WORDS,TEXT_LENGTH)
x_hashtags, ht_token_index = vectorize_data(data['hashtags'],MAX_NUM_WORDS,HASHTAG_LENGTH)

Found 8562 unique tokens.
Found 2833 unique tokens.


In [65]:
# split the data into a training set and a validation set
indices = np.arange(x_tweet.shape[0])
np.random.shuffle(indices)
x_tweet = x_tweet[indices]
x_hashtags = x_hashtags[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * x_tweet.shape[0])

x_tweet_train = x_tweet[:-num_validation_samples]
x_hashtags_train = x_hashtags[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_tweet_val = x_tweet[-num_validation_samples:]
x_hashtags_val = x_hashtags[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [85]:
with np.load('./datasets/taskA-punctuation.npz') as d:
    w_hash = d['num_hash']
    w_at = d['num_at']
    w_exc = d['num_exclaim']
    
W_hash_tr = w_hash[:-num_validation_samples]
W_hash_dev = w_hash[-num_validation_samples:]
W_at_tr = w_at[:-num_validation_samples]
W_at_dev = w_at[-num_validation_samples:]
W_exc_tr = w_exc[:-num_validation_samples]
W_exc_dev = w_exc[-num_validation_samples:]

In [18]:
# prepare embedding matrix
def get_embedding_matrix(word_index):
    num_words = min(MAX_NUM_WORDS, len(word_index)+1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i-1] = embedding_vector
    return embedding_matrix

tweet_emb = get_embedding_matrix(tweet_token_index)
hashtag_emb = get_embedding_matrix(ht_token_index)

## Saving/loading preprocessed data

## Building the Model

In [58]:
def blstm_maxpool(x, word_index, emb_matrix, num_filters, max_seq_len, learn_rate):
    num_words = min(MAX_NUM_WORDS, len(word_index)+1)
    y = Embedding(num_words,EMBEDDING_DIM,weights=[emb_matrix],
                                        input_length=max_seq_len,trainable=False)(x)
    y = Bidirectional(LSTM(num_filters, return_sequences=True))(y)
    y = Reshape((max_seq_len,2*num_filters,1))(y)
    y = MaxPooling2D(pool_size=(max_seq_len,1), strides=None, padding='valid')(y)
    y = Reshape((2*num_filters,))(y)
    return y

In [73]:
NUM_FILTERS = 50
LEARNING_RATE = 0.005
tweet = Input(shape=(TEXT_LENGTH,), dtype='int32')
hashtag = Input(shape=(HASHTAG_LENGTH,), dtype='int32')

tweet_lstm_vec = blstm_maxpool(tweet,tweet_token_index,tweet_emb, NUM_FILTERS, TEXT_LENGTH, LEARNING_RATE)
ht_lstm_vec = blstm_maxpool(hashtag,ht_token_index,hashtag_emb, NUM_FILTERS, HASHTAG_LENGTH, LEARNING_RATE)

In [74]:
import tensorflow as tf

def holographic_merge(inp):
    [a, b] = inp
    a_fft = tf.fft(tf.complex(a, 0.0))
    b_fft = tf.fft(tf.complex(b, 0.0))
    ifft = tf.ifft(tf.conj(a_fft) * b_fft)
    return tf.cast(tf.real(ifft), 'float32') 

In [75]:
from keras.layers import Lambda

h_circ = Lambda(holographic_merge)([tweet_lstm_vec,ht_lstm_vec])

In [76]:
# Dropout and dense layer

h_circ = Dropout(0.3)(h_circ)
preds = Dense(NUM_CLASSES, activation='softmax')(h_circ)

In [77]:
model = Model([tweet,hashtag],preds)

## Training

In [78]:
BATCH_SIZE = 20
EPOCHS = 10

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

print('Training')
model.fit([x_tweet_train, x_hashtags_train], y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 172)          0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 23)           0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 172, 100)     856300      input_11[0][0]                   
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 23, 100)      283400      input_12[0][0]                   
__________________________________________________________________________________________________
bidirectio

<keras.callbacks.History at 0x2641e87f2e8>

## Saving/Loading the trained model

In [46]:
## Saving
model.save('./keras_models/model1.h5')

## Loading
# from keras.models import load_model
# model = load_model('./keras_models/model1.h5')

## Testing on validation set

In [79]:
temp = model.predict([x_tweet_val,x_hashtags_val])
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(y_val, 1)
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred, digits=4))

Precision: [ 0.57305936  0.64      ]

Recall: [ 0.68206522  0.52658228]

f1_score: [ 0.62282878  0.57777778]

[[251 117]
 [187 208]]
:: Classification Report
             precision    recall  f1-score   support

          0     0.5731    0.6821    0.6228       368
          1     0.6400    0.5266    0.5778       395

avg / total     0.6077    0.6016    0.5995       763



In [82]:
def get_holographic_output(x_tweet, x_hashtags):
    intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('lambda_5').output)
    intermediate_output = intermediate_layer_model.predict([x_tweet, x_hashtags])
    return np.array(list(intermediate_output))

In [86]:
X_train_xgb = np.concatenate((get_holographic_output(x_tweet_train, x_hashtags_train),
                              W_hash_tr,W_at_tr,W_exc_tr),axis=1)
X_val_xgb = np.concatenate((get_holographic_output(x_tweet_val, x_hashtags_val),
                            W_hash_dev,W_at_dev,W_exc_dev),axis=1)
y_train_xgb = np.argmax(y_train, axis=1)
y_val_xgb = np.argmax(y_val, axis=1)

ValueError: all the input arrays must have same number of dimensions

(3071,)

In [84]:
import xgboost as xgb

logreg = xgb.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=False, 
                           objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, 
                           gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, 
                           colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, 
                           random_state=0, seed=None, missing=None)
logreg.fit(X_train_xgb, y_train_xgb)

y_pred = logreg.predict(X_val_xgb)
y_true = y_val_xgb
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred, digits=4))

Precision: [ 0.584       0.61597938]

Recall: [ 0.5951087   0.60506329]

f1_score: [ 0.58950202  0.61047254]

[[219 149]
 [156 239]]
:: Classification Report
             precision    recall  f1-score   support

          0     0.5840    0.5951    0.5895       368
          1     0.6160    0.6051    0.6105       395

avg / total     0.6006    0.6003    0.6004       763

