In [1]:
import pandas as pd
import re
import preprocessor as p
pd.set_option('display.max_colwidth', -1)
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import preprocessor as p
from collections import Counter
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix 
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
import os
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model,Sequential
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers

Using Theano backend.


In [2]:
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA.txt", sep="\t")

In [3]:
all_hashtags = {}
def get_hashtags(tweet):
    parsed_tweet = p.parse(tweet.decode('ascii', 'ignore').encode('ascii').lower())
    parsed_hashtags = parsed_tweet.hashtags
    
    hashtags = []
    if parsed_hashtags is not None:
        for hashtag in parsed_hashtags:
            temp = hashtag.match[1:].lower()
            if temp in all_hashtags:
                all_hashtags[temp] += 1
            else:
                all_hashtags[temp] = 1
            hashtags.append(temp)

    hashtags_str = (" ").join(hashtags)
    return hashtags_str, len(hashtags)

def get_clean_tweet(tweet):
    p.set_options(p.OPT.URL)
    clean_tweet = p.clean(tweet)
    return clean_tweet.lower().replace("#"," ")


emotion_keys = {}
def get_emotion(tweet):
    result = re.findall(r":\w+_\w+:",tweet)
    if result is not None:
        emotions = []
        for i in range(len(result)):
            emotion = result[i][1:-1]
            emotions.append(emotion)
            if emotion in emotion_keys:
                emotion_keys[emotion] += 1
            else:
                emotion_keys[emotion] = 1
    return (" ").join(emotions)

In [4]:
data['hashtags'], data['length'] = zip(*data['Tweet text'].map(get_hashtags)) 
data["tweet"] = data['Tweet text'].map(get_clean_tweet)
data['emotion'] = data['tweet'].map(get_emotion)
data.head()

Unnamed: 0,Tweet index,Label,Tweet text,hashtags,length,tweet,emotion
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR,imagine noreligion,2,sweet united nations video. just in time for christmas. imagine noreligion,
1,2,1,@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing ;),,0,@mrdahl87 we are rumored to have talked to erv's agent... and the angels asked about ed escobar... that's hardly nothing ;),
2,3,1,Hey there! Nice to see you Minnesota/ND Winter Weather,,0,hey there! nice to see you minnesota/nd winter weather,
3,4,0,3 episodes left I'm dying over here,,0,3 episodes left i'm dying over here,
4,5,1,I can't breathe! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian,,0,i can't breathe! was chosen as the most notable quote of the year in an annual list released by a yale university librarian,


In [5]:
class AttLayer(Layer):

    def __init__(self, **kwargs):
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1],),
                                      initializer='random_normal',
                                      trainable=True)
        super(AttLayer, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))
        
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

def blstm_atten(inp_dim, vocab_size, embed_size, num_classes, learn_rate):
    model = Sequential()
    model.add(Embedding(vocab_size, embed_size, input_length=inp_dim))
    model.add(Dropout(0.25))
    model.add(Bidirectional(LSTM(embed_size, return_sequences=True)))
    model.add(AttLayer())
    model.add(Dropout(0.50))
    model.add(Dense(num_classes, activation='softmax'))
    adam = optimizers.Adam(lr=learn_rate, beta_1=0.9, beta_2=0.999)
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    model.summary()
    return model

In [6]:
max_document_length = 38
num_classes = 2
embed_size = 50
n_epoch = 10
batch_size = 16
learn_rate = 0.01

In [7]:
x_text = data["tweet"].tolist()
labels =  data["Label"].tolist()
X_train, X_test, Y_train, Y_test = train_test_split(x_text, labels, random_state=121, test_size=0.10)

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, 2)
vocab_processor = vocab_processor.fit(x_text)

vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))
vocab = vocab_processor.vocabulary_._mapping

trainX = np.array(list(vocab_processor.transform(X_train)))
testX = np.array(list(vocab_processor.transform(X_test)))

trainY = np.asarray(Y_train)
testY = np.asarray(Y_test)

trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

trainY = to_categorical(trainY, nb_classes=num_classes)
testY = to_categorical(testY, nb_classes=num_classes)


Vocabulary Size: 2315


In [8]:
model = blstm_atten(trainX.shape[1], vocab_size, embed_size, num_classes, learn_rate)
model.fit(trainX, trainY, epochs=n_epoch, shuffle=True, batch_size=batch_size, 
                  verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 38, 50)            115750    
_________________________________________________________________
dropout_1 (Dropout)          (None, 38, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 38, 100)           40400     
_________________________________________________________________
att_layer_1 (AttLayer)       (None, 100)               100       
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 156,452
Trainable params: 156,452
Non-trainable params: 0
_________________________________________________________________
Epoc

<keras.callbacks.History at 0x121d51e50>

In [9]:
temp = model.predict(testX)
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(testY, 1)
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

Precision: [ 0.6127451   0.62921348]

Recall: [ 0.65445026  0.58638743]

f1_score: [ 0.63291139  0.60704607]

[[125  66]
 [ 79 112]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.61      0.65      0.63       191
          1       0.63      0.59      0.61       191

avg / total       0.62      0.62      0.62       382



In [10]:
import xgboost as xgb
from sklearn.utils import shuffle
from string import punctuation
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support

In [14]:
def gen_data(x_text, labels):
    x_text = np.array(list(vocab_processor.transform(x_text)))
    embedding_weights = model.layers[0].get_weights()[0]
    X, y = [], []
    for i in range(len(x_text)):
        emb = np.zeros(embed_size)
        for word in x_text[i]:
            try:
                emb += embedding_weights[word]
            except:
                print "Here"
                pass
        emb /= len(x_text[i])
        X.append(emb)
        y.append(labels[i])
    X = np.array(X)
    y = np.array(y)
    return X, y

def classification_model(X, Y):
    NO_OF_FOLDS=10
    X, Y = shuffle(X, Y, random_state=42)
    logreg = xgb.XGBClassifier()
    scores2 = cross_val_score(logreg, X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print "Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2)
    scores1 = cross_val_score(logreg, X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print "Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2)    
    scores3 = cross_val_score(logreg, X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print "F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2)
    print(scores1, scores2, scores3)

In [15]:
X, Y = gen_data(x_text, labels)
classification_model(X,Y)

Recall(avg): 0.860 (+/- 0.032)
Precision(avg): 0.861 (+/- 0.032)
F1-score(avg): 0.860 (+/- 0.032)
(array([ 0.84082122,  0.87697031,  0.85475616,  0.8544143 ,  0.87958115,
        0.83578377,  0.8726527 ,  0.8481269 ,  0.86098766,  0.88241741]), array([ 0.84073107,  0.87696335,  0.85340314,  0.85340314,  0.87958115,
        0.83507853,  0.87139108,  0.84776903,  0.86089239,  0.88188976]), array([ 0.84072456,  0.87696082,  0.85323414,  0.85332274,  0.87958115,
        0.83496655,  0.87129355,  0.84773756,  0.86088664,  0.88185558]))


In [12]:
trainX, trainY = gen_data(X_train, Y_train)
testX, testY = gen_data(X_test, Y_test)

In [13]:
logreg = xgb.XGBClassifier()
logreg.fit(trainX, trainY)

y_pred = logreg.predict(testX)
y_true = Y_test
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

Precision: [ 0.59375     0.59473684]

Recall: [ 0.59685864  0.59162304]

f1_score: [ 0.59530026  0.59317585]

[[114  77]
 [ 78 113]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.59      0.60      0.60       191
          1       0.59      0.59      0.59       191

avg / total       0.59      0.59      0.59       382

