In [3]:
import pandas as pd
import re
import preprocessor as p
pd.set_option('display.max_colwidth', -1)
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import preprocessor as p
from collections import Counter
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix 
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
import os
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model,Sequential
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers

In [None]:
# Running tally of how often each hashtag has been seen by get_hashtags().
all_hashtags = {}


def get_hashtags(tweet):
    """Collect the hashtags of one tweet and update the global tally.

    Non-ASCII characters are dropped from ``tweet`` and the text is
    lower-cased before it is handed to ``p.parse``.  For every parsed hashtag
    the first character of its ``match`` text is removed (presumably the
    leading ``#``), the remainder is lower-cased, counted in ``all_hashtags``
    and collected.

    Returns:
        A ``(hashtags_str, count)`` tuple: the collected tags joined by single
        spaces, and the number of tags.
    """
    ascii_tweet = tweet.decode('ascii', 'ignore').encode('ascii')
    found = p.parse(ascii_tweet.lower()).hashtags

    hashtags = []
    # The parser result is guarded against None; treat that as "no hashtags".
    for tag in (found if found is not None else []):
        name = tag.match[1:].lower()
        all_hashtags[name] = all_hashtags.get(name, 0) + 1
        hashtags.append(name)

    return " ".join(hashtags), len(hashtags)

def get_clean_tweet(tweet):
    """Return a cleaned, lower-cased copy of ``tweet`` with '#' turned into spaces.

    The preprocessor is configured with only ``p.OPT.URL`` before ``p.clean``
    is called (presumably so that only URLs are removed -- confirm against the
    preprocessor documentation).  Note that ``p.set_options`` is invoked on
    every call.
    """
    p.set_options(p.OPT.URL)
    lowered = p.clean(tweet).lower()
    return lowered.replace("#", " ")


# Running tally of every emotion token seen by get_emotion().
emotion_keys = {}


def get_emotion(tweet):
    """Extract ``:word_word:`` style tokens from ``tweet`` and count them.

    Every non-overlapping match of ``:\\w+_\\w+:`` is stripped of its
    surrounding colons, counted in the module-level ``emotion_keys`` dict and
    collected in order of appearance.

    Args:
        tweet: text to scan.

    Returns:
        The collected token names joined by single spaces ("" when the tweet
        contains none).
    """
    emotions = []
    # re.findall always returns a list (possibly empty), so no None check is
    # needed and ``emotions`` is always bound when we reach the return.
    for token in re.findall(r":\w+_\w+:", tweet):
        emotion = token[1:-1]  # drop the leading and trailing ':'
        emotions.append(emotion)
        emotion_keys[emotion] = emotion_keys.get(emotion, 0) + 1
    return " ".join(emotions)

In [None]:
# The following cells address columns by name ('Tweet text', 'Label'), so the
# first row has to be parsed as the header; with header=None pandas would
# assign integer column labels and those lookups would raise KeyError.
# Assumes the file's first line is the column header row -- confirm.
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA.txt", sep="\t", header=0)

In [None]:
# get_hashtags yields one (hashtags_str, count) pair per tweet; unzip the pairs
# into two new columns.
hashtag_pairs = data['Tweet text'].map(get_hashtags)
data['hashtags'], data['length'] = zip(*hashtag_pairs)

# Cleaned text first, then the emotion tokens found in that cleaned text.
data["tweet"] = data['Tweet text'].map(get_clean_tweet)
data['emotion'] = data['tweet'].map(get_emotion)

data.head()

In [None]:
# Cleaned tweet texts and the 'Label' column as plain Python lists.
# NOTE(review): this cell binds `labels`, while the pickle-loading cell below
# rebinds `x_text` and binds `label` (singular) -- keep the two apart.
x_text = data["tweet"].tolist()
labels =  data["Label"].tolist()

In [4]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
# The file is opened in binary mode (pickle data is a byte stream) and the
# `with` block closes it even if unpickling fails.
# Two objects were dumped back to back: the texts, then their labels.
with open("./datasets/SARC/data.pkl", "rb") as f:
    x_text = pickle.load(f)
    label = pickle.load(f)

In [5]:
def get_embedding_weights(filename, sep):
    """Read a text embedding file into a dict.

    Each line is stripped and split on ``sep``; the first field becomes the
    key and the remaining fields (still strings) become the value.

    Args:
        filename: path of the embedding file.
        sep: field separator used inside each line.

    Returns:
        dict mapping the first field of every line to the list of its
        remaining fields.
    """
    embed_dict = {}
    # `with` guarantees the handle is closed even if a line fails to parse;
    # iterating the file object avoids loading the whole file into memory.
    with open(filename, 'r') as embed_file:
        for line in embed_file:
            row = line.strip().split(sep)
            embed_dict[row[0]] = row[1:]
    print('Loaded from file: ' + str(filename))
    return embed_dict

def map_embedding_weights(embed, vocab, embed_size):
    """Build an embedding matrix for ``vocab`` from the vectors in ``embed``.

    Args:
        embed: mapping word -> vector (any sequence numpy can assign to a row
            of length ``embed_size``).
        vocab: mapping word -> row index in the resulting matrix.
        embed_size: number of columns of the matrix.

    Returns:
        ``numpy`` array of shape ``(len(vocab), embed_size)``.  Rows of words
        that have no usable vector in ``embed`` stay all-zero.
    """
    vocab_size = len(vocab)
    embeddingWeights = np.zeros((vocab_size, embed_size))
    n = 0
    # .items() works on both Python 2 and Python 3 dicts.
    for word, index in vocab.items():
        try:
            embeddingWeights[index] = embed[word]
        except (KeyError, ValueError, IndexError):
            # KeyError: word has no pretrained vector; ValueError: vector has
            # the wrong length or is not numeric; IndexError: row index lies
            # outside the matrix.  All are counted as "missed" (best effort).
            n += 1
    # Single formatted string so the message prints the same on Python 2 and 3.
    print("%d embedding missed of %d" % (n, vocab_size))
    return embeddingWeights

In [6]:
class AttLayer(Layer):
    """Attention-style pooling layer that collapses axis 1 of its input.

    A single trainable vector ``W`` scores every position along axis 1; the
    scores are normalised over that axis and used to form a weighted sum of
    the input.

    NOTE(review): ``call`` uses ``dimshuffle``, a Theano tensor method, so the
    layer depends on the Theano backend that the imports cell selects through
    ``KERAS_BACKEND``.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base ``Layer``."""
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector ``W`` of length ``input_shape[-1]``."""
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1],),
                                      initializer='random_normal',
                                      trainable=True)
        super(AttLayer, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        ``mask`` is accepted but not used.
        Assumes ``x`` is (batch, timesteps, features) -- TODO confirm.
        """
        # Unnormalised score per position: tanh(x . W).
        eij = K.tanh(K.dot(x, self.W))
        
        # Exponentiate and divide by the sum over axis 1 so the weights of
        # each sample sum to one (the summed axis is re-added for broadcasting).
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        # Add a trailing broadcast axis to the weights, scale the input, then
        # sum over axis 1.
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions only."""
        return (input_shape[0], input_shape[-1])

def blstm_atten(inp_dim, vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the bidirectional-LSTM + attention classifier.

    Layer stack: Embedding -> Dropout(0.25) -> Bidirectional LSTM returning
    the full sequence -> AttLayer -> Dropout(0.25) -> softmax Dense.

    Args:
        inp_dim: input sequence length given to the Embedding layer.
        vocab_size: number of rows of the embedding matrix.
        embed_size: embedding dimension; also used as the LSTM unit count.
        num_classes: number of softmax outputs.
        learn_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed).
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embed_size, input_length=inp_dim))
    model.add(Dropout(0.25))
    model.add(Bidirectional(LSTM(embed_size, return_sequences=True)))
    model.add(AttLayer())
    model.add(Dropout(0.25))
    model.add(Dense(num_classes, activation='softmax'))
    # Pass an optimizer instance so that `learn_rate` is actually honoured;
    # the string 'adam' would silently fall back to the optimizer's default.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(lr=learn_rate),
                  metrics=['accuracy'])
    model.summary()
    return model


def blstm(inp_dim,vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the plain bidirectional-LSTM classifier.

    Layer stack: trainable Embedding -> Dropout(0.25) -> Bidirectional LSTM ->
    Dropout(0.50) -> softmax Dense.

    Args:
        inp_dim: input sequence length given to the Embedding layer.
        vocab_size: number of rows of the embedding matrix.
        embed_size: embedding dimension; also used as the LSTM unit count.
        num_classes: number of softmax outputs.
        learn_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embed_size, input_length=inp_dim, trainable=True))
    model.add(Dropout(0.25))
    model.add(Bidirectional(LSTM(embed_size)))
    model.add(Dropout(0.50))
    model.add(Dense(num_classes, activation='softmax'))
    # Pass an optimizer instance so that `learn_rate` is actually honoured;
    # the string 'adam' would silently fall back to the optimizer's default.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(lr=learn_rate),
                  metrics=['accuracy'])

    return model


In [7]:
# Hyper-parameters shared by the cells below.
max_document_length = 50  # sequence length given to VocabularyProcessor and pad_sequences
num_classes = 2  # number of one-hot label columns / softmax outputs
embed_size = 50  # embedding dimension; the model builders also use it as the LSTM unit count
n_epoch = 50  # epochs passed to model.fit
batch_size = 256  # batch size passed to model.fit
learn_rate = 0.01  # forwarded to the model builder as `learn_rate`

In [8]:

# Hold out 10% of the corpus for testing (fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(x_text, label, test_size=0.10, random_state=42)

# The vocabulary is fitted on the full corpus (train and test texts alike).
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, 2)
vocab_processor = vocab_processor.fit(x_text)
vocab = vocab_processor.vocabulary_._mapping
vocab_size = len(vocab_processor.vocabulary_)
print("Vocabulary Size: {:d}".format(vocab_size))

# Texts -> word-id sequences -> padded to max_document_length.
train_ids = np.array(list(vocab_processor.transform(X_train)))
test_ids = np.array(list(vocab_processor.transform(X_test)))
trainX = pad_sequences(train_ids, maxlen=max_document_length, value=0.)
testX = pad_sequences(test_ids, maxlen=max_document_length, value=0.)

# Labels -> one-hot matrices with num_classes columns.
trainY = to_categorical(np.asarray(Y_train), nb_classes=num_classes)
testY = to_categorical(np.asarray(Y_test), nb_classes=num_classes)


Vocabulary Size: 29106


In [None]:
# Build the attention model for the padded sequence length and train it,
# holding back 10% of the training data for validation.
model = blstm_atten(
    inp_dim=trainX.shape[1],
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_classes=num_classes,
    learn_rate=learn_rate,
)
model.fit(
    trainX,
    trainY,
    epochs=n_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_split=0.10,
    verbose=1,
)

In [10]:
from keras.models import load_model
# Reload the model from disk; AttLayer is passed via custom_objects so the
# custom layer class defined in this notebook can be resolved by name.
# NOTE(review): in file order this load comes before the cell that writes
# "sarc_model.h5", so it relies on a file produced by an earlier run.
model = load_model("sarc_model.h5", custom_objects={'AttLayer':AttLayer})

In [9]:
# Score the held-out set: take the arg-max over axis 1 of both the predicted
# outputs and the one-hot targets to get class ids.
temp = model.predict(testX)
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(testY, 1)
# average=None keeps one score per class instead of a single averaged value.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): the name `f1_score` is later also imported from
# sklearn.metrics in the xgboost import cell; the two uses shadow each other.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

Precision: [ 0.63329937  0.62662137]

Recall: [ 0.62555168  0.63435986]

f1_score: [ 0.62940168  0.63046687]

[[8079 4836]
 [4678 8116]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.63      0.63      0.63     12915
          1       0.63      0.63      0.63     12794

avg / total       0.63      0.63      0.63     25709



In [10]:
# Persist the current model; the load_model cell above reads this same file.
model.save("sarc_model.h5")

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points and annotate each with its label.

    Args:
        low_dim_embs: array whose rows are (x, y) points; it must have at
            least as many rows as there are labels.
        labels: one text label per plotted point, in row order.
        filename: currently unused -- the figure is shown, not saved.

    Raises:
        AssertionError: if there are more labels than points.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(14, 14))  # in inches
    # zip stops after the last label, so surplus rows are simply not drawn.
    for point, text in zip(low_dim_embs, labels):
        x, y = point
        # One scatter call per point, exactly as many calls as labels.
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.show()
    
def plot_embedding(embedding, reverse_dictionary):
    """Project ``embedding`` to 2-D with t-SNE and plot it with word labels.

    Args:
        embedding: 2-D array with one embedding vector per row.
        reverse_dictionary: sequence (or int-keyed mapping) giving the label
            for each row index.

    Returns:
        The 2-D t-SNE coordinates, one row per embedding row.
    """
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    low_dim_embs = tsne.fit_transform(embedding)
    # Derive the label count from the arguments instead of the notebook-global
    # `vocab_size`, and never request more labels than there are points.
    n_labels = min(len(reverse_dictionary), low_dim_embs.shape[0])
    labels = [reverse_dictionary[i] for i in range(n_labels)]
    plot_with_labels(low_dim_embs, labels)
    return low_dim_embs


In [None]:
# Index -> word lookup taken from the fitted vocabulary processor.
reverse_dict = vocab_processor.vocabulary_._reverse_mapping
# First weight array of the model's first layer (the Embedding layer in the
# builders defined above).
embedding = model.layers[0].get_weights()[0]
low_dim_embs = plot_embedding(embedding, reverse_dict)

In [None]:
import xgboost as xgb
from sklearn.utils import shuffle
from string import punctuation
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support

In [None]:
def gen_data(x_text, labels):
    """Turn raw documents into averaged-embedding feature vectors.

    Each document is mapped to word ids with the notebook-global
    ``vocab_processor``; the corresponding rows of the first weight array of
    ``model.layers[0]`` are summed and divided by the number of ids in the
    document (padding positions included).

    Args:
        x_text: iterable of raw documents.
        labels: indexable collection with one label per document.

    Returns:
        ``(X, y)``: ``X`` is an array with one averaged embedding per
        document, ``y`` the labels as an array.
    """
    doc_ids = np.array(list(vocab_processor.transform(x_text)))
    embedding_weights = model.layers[0].get_weights()[0]
    n_rows = embedding_weights.shape[0]

    X, y = [], []
    skipped = 0
    for i in range(len(doc_ids)):
        ids = np.asarray(doc_ids[i], dtype=int)
        # Ids without an embedding row are skipped (best effort) but still
        # count towards the divisor; they are reported once at the end
        # instead of being swallowed by a bare `except`.
        in_range = ids[ids < n_rows]
        skipped += len(ids) - len(in_range)
        # The vector width comes from the weights themselves, so a stale
        # global `embed_size` can no longer produce all-zero features.
        emb = embedding_weights[in_range].sum(axis=0) / len(ids)
        X.append(emb)
        y.append(labels[i])
    if skipped:
        print("%d word ids had no embedding row and were skipped" % skipped)
    return np.array(X), np.array(y)

def classification_model(X, Y):
    """Report 10-fold cross-validated scores of an XGBoost classifier.

    The data is shuffled with a fixed seed, then weighted recall, precision
    and F1 are each cross-validated and printed as mean and two standard
    deviations, followed by the raw per-fold score arrays.

    Args:
        X: feature matrix.
        Y: labels aligned with ``X``.
    """
    NO_OF_FOLDS = 10
    X, Y = shuffle(X, Y, random_state=42)
    classifier = xgb.XGBClassifier()
    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and Python 3, matching the print calls in the other cells.
    scores2 = cross_val_score(classifier, X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2))
    scores1 = cross_val_score(classifier, X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print("Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2))
    scores3 = cross_val_score(classifier, X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2))
    print(scores1, scores2, scores3)

In [None]:
# Averaged-embedding features for the classifier cells below.
# NOTE(review): this rebinds trainX/trainY/testX/testY, which previously held
# the padded id sequences and one-hot labels used by the Keras model; the
# Keras evaluation cell cannot be re-run after this without rebuilding them.
trainX, trainY = gen_data(X_train, Y_train)
testX, testY = gen_data(X_test, Y_test)

In [None]:
# Fit a default XGBoost classifier on the averaged-embedding features.
# NOTE(review): despite its name, `logreg` is an XGBClassifier.
logreg = xgb.XGBClassifier()
logreg.fit(trainX, trainY)

# Per-class scores on the held-out split (average=None -> one value per class).
y_pred = logreg.predict(testX)
y_true = Y_test
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, hiding the function of the same name
# imported from sklearn.metrics in the import cell above.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

In [None]:
# Cross-validate on the full corpus.  `label` is the list loaded together with
# `x_text` from the pickle (and the one used for the train/test split);
# `labels` is only bound by the SemEval cell and belongs to a different
# `x_text`, so pairing it with the pickle texts would mismatch or raise
# NameError on a fresh kernel.
X, Y = gen_data(x_text, label)
classification_model(X,Y)