In [None]:
import pandas as pd
import re
import preprocessor as p
pd.set_option('display.max_colwidth', -1)
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import preprocessor as p
from collections import Counter
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix 
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
import os
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, Activation
from keras.models import Model,Sequential
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras.layers import Conv1D, GlobalMaxPooling1D, SpatialDropout1D
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.regularizers import L1L2
from keras.layers.merge import concatenate
import xgboost as xgb
from sklearn.utils import shuffle
from string import punctuation
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support

Load Irony Data

In [None]:
all_hashtags = {}
def get_hashtags(tweet):
    """Extract the hashtags of one tweet and update the global tally.

    The tweet is reduced to lower-cased ASCII (non-ASCII characters are
    dropped) before being handed to ``p.parse``. Every hashtag found is
    stripped of its first character, lower-cased and counted in the
    module-level ``all_hashtags`` dictionary.

    Returns:
        A ``(hashtags_str, count)`` tuple: the hashtags joined by single
        spaces, and how many there were. ``("", 0)`` when none are found.
    """
    ascii_text = tweet.decode('ascii', 'ignore').encode('ascii').lower()
    found = p.parse(ascii_text).hashtags
    if found is None:
        return "", 0

    tags = [item.match[1:].lower() for item in found]
    for tag in tags:
        # Running frequency of every hashtag seen so far.
        all_hashtags[tag] = all_hashtags.get(tag, 0) + 1

    return " ".join(tags), len(tags)

def get_clean_tweet(tweet):
    """Return a cleaned, lower-cased copy of ``tweet``.

    The tweet is passed through ``p.clean`` with only the ``p.OPT.URL``
    option enabled (presumably this strips URLs -- confirm against the
    preprocessor package), then lower-cased, and every ``#`` is replaced
    by a space so hashtag words stay in the text.
    """
    # The option is (re)set on every call, exactly as before.
    p.set_options(p.OPT.URL)
    without_urls = p.clean(tweet)
    lowered = without_urls.lower()
    return lowered.replace("#", " ")


emotion_keys = {}
def get_emotion(tweet):
    """Collect the ``:some_name:`` style emotion tokens found in ``tweet``.

    Only tokens matching ``:\\w+_\\w+:`` are picked up, i.e. names that
    contain at least one underscore between the colons. Each name (without
    the surrounding colons) is counted in the module-level ``emotion_keys``
    dictionary.

    Returns:
        The names joined by single spaces, in order of appearance; an empty
        string when there are none.
    """
    names = [token[1:-1] for token in re.findall(r":\w+_\w+:", tweet)]
    for name in names:
        emotion_keys[name] = emotion_keys.get(name, 0) + 1
    return " ".join(names)


In [None]:
def get_data(filename):
    """Read a tab-separated tweet file with a header row into a DataFrame.

    Each data row is expected to hold three tab-separated fields: an index
    (ignored), a label and the tweet text. The first non-blank line is
    treated as the header and skipped without being parsed. Blank lines are
    ignored, and any tab characters after the second one are kept as part
    of the tweet text.

    Args:
        filename: Path of the file to read.

    Returns:
        A DataFrame with the columns ``"Tweet text"`` and ``"Label"``; both
        hold strings exactly as read from the file.

    Raises:
        ValueError: If a data row has fewer than three fields.
    """
    tweets = []
    labels = []

    # Number of non-blank lines seen, header included.
    count = 0

    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            count += 1
            if count == 1:
                # Header row: skip it before trying to unpack any fields.
                continue
            # maxsplit=2 keeps tabs that occur inside the tweet text.
            _, label, tweet = line.split("\t", 2)
            tweets.append(tweet)
            labels.append(label)

    print("Lines read: " + str(count))

    data = pd.DataFrame()
    data["Tweet text"] = tweets
    data["Label"] = labels

    return data

In [None]:
# Build the irony DataFrame from the SemEval-2018 Task 3A training file and cache it.
data = get_data("./datasets/train/SemEval2018-T3-train-taskA.txt")
# get_hashtags returns a (hashtag string, hashtag count) pair per tweet;
# zip(*...) splits those pairs into the two new columns.
data['hashtags'], data['length'] = zip(*data['Tweet text'].map(get_hashtags)) 
data["tweet"] = data['Tweet text'].map(get_clean_tweet)
# Emotion names are extracted from the cleaned, lower-cased text.
data['emotion'] = data['tweet'].map(get_emotion)
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
data.head()
# Persist the processed frame so later cells can reload it without re-parsing.
data.to_pickle("Irony_data.pkl")

In [None]:
# Reload the processed irony DataFrame written by the previous cell.
data = pd.read_pickle("Irony_data.pkl")

In [None]:
# Cleaned tweet texts and their labels as NumPy arrays (row i of both refers to the same tweet).
x_text = np.array(data["tweet"].tolist())
label =  np.array(data["Label"].tolist())

Randomize indices

In [None]:
# Draw a random permutation of the row indices and store it in "split.pkl"
# so the later cells that load this file reuse the same ordering.
# NOTE(review): no random seed is set, so re-running this cell yields a different split.
indices = np.arange(x_text.shape[0])
np.random.shuffle(indices)
indices.dump("split.pkl")

Using n-gram (1 to 5) TF-IDF features

In [None]:
# Bag-of-n-grams counts (n = 1..5, at most 10000 features) re-weighted with L2-normalised TF-IDF.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# NOTE(review): no `analyzer` argument is passed below, so CountVectorizer's default
# analyzer is used -- confirm that it matches the "char n-grams" message.
print("Using char n-grams based features")
bow_transformer = CountVectorizer(max_features = 10000, ngram_range = (1,5)).fit(x_text)
comments_bow = bow_transformer.transform(x_text)
tfidf_transformer = TfidfTransformer(norm = 'l2').fit(comments_bow)
comments_tfidf = tfidf_transformer.transform(comments_bow)
# `features` is what the train/test split cell below consumes.
features = comments_tfidf

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
import scipy
# Irony data: reorder features/labels with the stored permutation, then hold out the last 20% as test set.
VALIDATION_SPLIT = 0.2
indices = np.load("split.pkl")
# NOTE(review): `features` and `label` are overwritten in place, so running this cell
# twice applies the permutation twice.
features = features[indices]
label = label[indices]
num_validation_samples = int(VALIDATION_SPLIT * features.shape[0])

# The sparse TF-IDF matrix is converted to dense matrices for the classifier below.
X_train = scipy.sparse.csr_matrix.todense(features[:-num_validation_samples])
Y_train = label[:-num_validation_samples]
X_test = scipy.sparse.csr_matrix.todense(features[-num_validation_samples:])
Y_test = label[-num_validation_samples:]

In [None]:
# Fit a class-balanced logistic regression on the training features and report test metrics.
logreg = LogisticRegression(class_weight="balanced")
logreg.fit(X_train, Y_train)

y_pred = logreg.predict(X_test)
y_true = Y_test
# average=None yields one score per class instead of a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, shadowing the function of the same name
# imported from sklearn.metrics at the top of the notebook.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
accuracy = metrics.accuracy_score(y_true, y_pred)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print("Accuracy: " + str(accuracy) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

In [None]:
# Store the permutation together with the dense train/test split for later reuse.
np.savez('trainedmodels/char_ngrams_count',indices, X_train, X_test, Y_train, Y_test)

Load SARC data

In [None]:
import pickle
# Load the SARC texts and labels; the file holds two consecutive pickles
# (texts first, then labels). This overwrites the irony `x_text` / `label`.
# NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
# The file is opened in binary mode and closed even if unpickling fails.
with open("./datasets/SARC/data_without_ancestor.pkl", "rb") as f:
    x_text = pickle.load(f)
    label = pickle.load(f)

Run Deep Neural Networks

In [None]:
def get_embedding_weights(filename, sep):
    """Load a text embedding file into a dictionary.

    Every line is split on ``sep``; the first field is the token and the
    remaining fields are its vector components.

    Args:
        filename: Path of the embedding file (one token per line).
        sep: Field separator used inside each line.

    Returns:
        A dict mapping each token to the list of its remaining fields. The
        components are kept as strings, exactly as read from the file.
    """
    embed_dict = {}
    # Stream the file line by line instead of reading it into memory at once;
    # the `with` block also closes the handle if parsing fails.
    with open(filename, 'r') as handle:
        for line in handle:
            row = line.strip().split(sep)
            embed_dict[row[0]] = row[1:]
    print('Loaded from file: ' + str(filename))
    return embed_dict

def map_embedding_weights(embed, vocab, embed_size):
    """Build an embedding matrix for ``vocab`` from pre-trained vectors.

    Args:
        embed: Mapping from token to its vector (any sequence NumPy can
            assign into a float row of length ``embed_size``).
        vocab: Mapping from token to its row index in the matrix.
        embed_size: Number of columns of the matrix.

    Returns:
        A ``(len(vocab), embed_size)`` float array. Rows of tokens that have
        no usable vector in ``embed`` are left as zeros.
    """
    vocab_size = len(vocab)
    embeddingWeights = np.zeros((vocab_size, embed_size))
    n = 0
    for k, v in vocab.items():
        try:
            embeddingWeights[v] = embed[k]
        except (KeyError, ValueError, IndexError):
            # KeyError: token has no pre-trained vector.
            # ValueError: vector is malformed or has the wrong length.
            # IndexError: vocabulary index falls outside the matrix.
            # In all three cases the row stays zero, as before.
            n += 1
    print("%d embedding missed of %d" % (n, vocab_size))
    return embeddingWeights

In [None]:
class AttLayer(Layer):
    """Attention pooling layer with a single trainable weight vector.

    For a rank-3 input it scores every position along axis 1 with
    ``tanh(x . W)``, normalises ``exp(score)`` over axis 1 and returns the
    weighted sum of the input over that axis.

    NOTE(review): ``call`` uses the tensors' ``dimshuffle`` method, so it
    presumably only works with the Theano backend selected at the top of the
    notebook -- confirm before switching backends.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base ``Layer``."""
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector ``W`` whose length is the last input dimension."""
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1],),
                                      initializer='random_normal',
                                      trainable=True)
        super(AttLayer, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        ``mask`` is accepted but not used.
        """
        # One score per (sample, position); tanh keeps it within [-1, 1].
        eij = K.tanh(K.dot(x, self.W))
        
        # Normalise the exponentiated scores over axis 1 so they sum to 1 per sample.
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        # Add a trailing axis to the weights so they broadcast against x, then sum over axis 1.
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions; the middle axis is summed out."""
        return (input_shape[0], input_shape[-1])

def blstm_atten(inp_dim, vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the bidirectional-LSTM + attention classifier.

    Layer order: Embedding -> SpatialDropout1D(0.25) -> Bidirectional LSTM
    (returning sequences) -> AttLayer -> Dropout(0.25) -> softmax Dense.

    Args:
        inp_dim: Length of the input token sequences.
        vocab_size: Number of rows of the embedding matrix.
        embed_size: Embedding width; also used as the LSTM unit count.
        num_classes: Size of the softmax output.
        learn_rate: Accepted but not used -- the model is compiled with the
            ``'adam'`` optimizer string.

    Returns:
        The compiled Sequential model (its summary is printed first).
    """
    layer_stack = [
        Embedding(vocab_size, embed_size, input_length=inp_dim),
        SpatialDropout1D(0.25),
        Bidirectional(LSTM(embed_size, return_sequences=True)),
        AttLayer(),
        Dropout(0.25),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

def blstm_atten_2(inp_dim, vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the two-layer BiLSTM + attention model (functional API).

    The attention layer receives the concatenation of the raw embeddings
    (taken before dropout) and the outputs of both stacked bidirectional
    LSTMs ("bi_lstm_0" and "bi_lstm_1").

    Args:
        inp_dim: Length of the input token sequences.
        vocab_size: Number of rows of the embedding matrix.
        embed_size: Embedding width; also used as the LSTM unit count.
        num_classes: Size of the softmax output.
        learn_rate: Accepted but not used -- the model is compiled with the
            ``'adam'`` optimizer string.

    Returns:
        The compiled Model named "Irony" (its summary is printed first).
    """
    token_ids = Input(shape=(inp_dim,), dtype='int32')
    embedded = Embedding(vocab_size, embed_size, input_length=inp_dim)(token_ids)
    embedded_dropped = Dropout(0.25)(embedded)

    first_pass = Bidirectional(
        LSTM(embed_size, return_sequences=True), name="bi_lstm_0")(embedded_dropped)
    second_pass = Bidirectional(
        LSTM(embed_size, return_sequences=True), name="bi_lstm_1")(first_pass)

    # Note: the un-dropped embeddings are what gets concatenated.
    stacked = concatenate([embedded, first_pass, second_pass])
    pooled = AttLayer()(stacked)
    pooled_dropped = Dropout(0.25)(pooled)
    probabilities = Dense(num_classes, activation='softmax')(pooled_dropped)

    model = Model(inputs=[token_ids], outputs=probabilities, name="Irony")
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

def cnn_lstm(inp_dim, vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the Conv1D + LSTM classifier.

    Layer order: Embedding -> Dropout(0.25) -> Conv1D -> MaxPooling1D ->
    LSTM -> softmax Dense.

    Args:
        inp_dim: Length of the input token sequences.
        vocab_size: Number of rows of the embedding matrix.
        embed_size: Embedding width.
        num_classes: Size of the softmax output.
        learn_rate: Accepted but not used -- the model is compiled with the
            ``'adam'`` optimizer string.

    Returns:
        The compiled Sequential model.
    """
    # Convolution
    kernel_size = 5
    filters = 64
    pool_size = 4

    # LSTM
    lstm_output_size = 70

    model = Sequential()
    model.add(Embedding(vocab_size, embed_size, input_length=inp_dim))
    model.add(Dropout(0.25))
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(num_classes, activation='softmax'))

    # The output is a softmax over one-hot targets, so use categorical
    # cross-entropy like the other model builders in this notebook. With two
    # classes this gives the same loss value as the former binary
    # cross-entropy; with more than two classes only the categorical form is
    # correct for both the loss and the 'accuracy' metric.
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

def blstm(inp_dim,vocab_size, embed_size, num_classes, learn_rate):
    """Build and compile the plain bidirectional-LSTM classifier.

    Layer order: trainable Embedding -> Dropout(0.25) -> Bidirectional LSTM
    -> Dropout(0.50) -> softmax Dense.

    Args:
        inp_dim: Length of the input token sequences.
        vocab_size: Number of rows of the embedding matrix.
        embed_size: Embedding width; also used as the LSTM unit count.
        num_classes: Size of the softmax output.
        learn_rate: Accepted but not used -- the model is compiled with the
            ``'adam'`` optimizer string.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Embedding(vocab_size, embed_size, input_length=inp_dim, trainable=True),
        Dropout(0.25),
        Bidirectional(LSTM(embed_size)),
        Dropout(0.50),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
# Settings for Irony Data
max_document_length = 50
num_classes = 2
embed_size = 50
n_epoch = 10
batch_size = 16
learn_rate = 0.01
max_features = 1

In [None]:
# Settings for SARC Data
max_document_length = 50
num_classes = 2
embed_size = 50
n_epoch = 15
batch_size = 256
learn_rate = 0.01
max_features = 5

In [None]:
#SARC
X_train, X_test, Y_train, Y_test = train_test_split(x_text, label, random_state=42, test_size=0.10)

In [None]:
#Irony
VALIDATION_SPLIT = 0.2
indices = np.load("split.pkl")
x_text = x_text[indices]
label = label[indices]
num_validation_samples = int(VALIDATION_SPLIT * x_text.shape[0])

X_train = x_text[:-num_validation_samples]
Y_train = label[:-num_validation_samples]
X_test = x_text[-num_validation_samples:]
Y_test = label[-num_validation_samples:]

In [None]:
# Restore the vocabulary processor saved under ./Models instead of fitting a new one
# on x_text (the fitting line is kept commented out).
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, max_features)
# vocab_processor = vocab_processor.fit(x_text)
vocab_processor = vocab_processor.restore("./Models/SARC_vocab_glove.pkl")

vocab_size = len(vocab_processor.vocabulary_._mapping)
print("Vocabulary Size: {:d}".format(vocab_size))
vocab = vocab_processor.vocabulary_


# Fresh processor that uses the current settings but reuses the restored vocabulary.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, max_features, vocabulary=vocab)

In [None]:
# Turn the raw texts into id sequences with the vocabulary processor.
trainX = np.array(list(vocab_processor.transform(X_train)))
testX = np.array(list(vocab_processor.transform(X_test)))

trainY = np.asarray(Y_train)
testY = np.asarray(Y_test)

# Pad to max_document_length, using 0 as the padding value.
trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
testX = pad_sequences(testX, maxlen=max_document_length, value=0.)

# One-hot encode the labels; the models below are trained on these encoded targets.
trainY = to_categorical(trainY, nb_classes=num_classes)
testY = to_categorical(testY, nb_classes=num_classes)

Train a new model

In [None]:
# Train blstm_atten_2 with its embedding layer initialised from the GloVe Twitter vectors.
filename = "word_vectors/glove.twitter.27B.50d.txt"
sep = " "
# From here on `vocab` is the token -> index mapping itself, not the vocabulary object.
vocab = vocab_processor.vocabulary_._mapping
model = blstm_atten_2(trainX.shape[1], vocab_size, embed_size, num_classes, learn_rate)
# layers[1] is expected to be the Embedding layer (layers[0] presumably being the
# input layer of the functional model -- confirm with model.summary()).
model.layers[1].set_weights([map_embedding_weights(get_embedding_weights(filename, sep), vocab, embed_size)])
# 5% of the training data is held out for validation during fitting.
model.fit(trainX, trainY, epochs=n_epoch, shuffle=True, batch_size=batch_size, validation_split= 0.05,
                  verbose=1)

In [None]:
# Same architecture as the previous cell, but without loading pre-trained embedding weights.
# Running this cell replaces the `model` trained above.
model = blstm_atten_2(trainX.shape[1], vocab_size, embed_size, num_classes, learn_rate)
model.fit(trainX, trainY, epochs=n_epoch, shuffle=True, batch_size=batch_size, validation_split= 0.05,verbose=1)

Test the trained model

In [None]:
# Evaluate the current `model` on the held-out test set.
temp = model.predict(testX)
# Convert predicted probabilities and one-hot targets back to class indices.
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(testY, 1)
# average=None yields one score per class instead of a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, shadowing the function of the same name
# imported from sklearn.metrics at the top of the notebook.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

In [None]:
# Persist the vocabulary processor and the trained model.
# NOTE(review): the model is written to "sarc_cnn_glove.h5", while the loading cells
# below read "Models/sarc_model_glove.h5" -- confirm the intended file name.
vocab_processor.save("Models/SARC_vocab_glove.pkl")
model.save("Models/sarc_cnn_glove.h5")

Load trained model

In [None]:
from keras.models import load_model
# The custom attention layer has to be passed explicitly so Keras can rebuild it.
model = load_model("Models/sarc_model_glove.h5", custom_objects={'AttLayer':AttLayer})

Transfer Learning Variant I -> Transfer all model parameters

In [None]:
# Continue training the loaded model (all of its parameters) on the current
# trainX / trainY, holding out 5% of it for validation.
model.fit(trainX, trainY, epochs=n_epoch, shuffle=True, batch_size=batch_size, validation_split= 0.05, verbose=1)

In [None]:
# Save the fine-tuned model together with the permutation and the encoded train/test arrays.
model.save("trainedmodels/sarc_transfer_all.h5")
np.savez('trainedmodels/sarc_transfer_all',indices,trainX,testX,trainY,testY)

Transfer Learning Variant II -> Embedding parameters transfer

In [None]:
# Transfer only the embedding weights of the saved SARC model into a fresh blstm_atten model.
# Take the token -> index mapping from the processor itself: `vocab` may already be
# that mapping (a plain dict) if an earlier cell ran, in which case `vocab._mapping`
# would fail; this form gives the same result and is safe to re-run.
vocab = vocab_processor.vocabulary_._mapping
trained_model = load_model("Models/sarc_model_glove.h5", custom_objects={'AttLayer':AttLayer})
# NOTE(review): layers[0] is assumed to be the Embedding layer of the saved model,
# which only holds if it was built as a Sequential model -- confirm.
embedding = trained_model.layers[0].get_weights()
model = blstm_atten(trainX.shape[1], vocab_size, embed_size, num_classes, learn_rate)
model.layers[0].set_weights(embedding)
model.fit(trainX, trainY, epochs=n_epoch, shuffle=True, batch_size=batch_size, validation_split= 0.05,
                  verbose=1)

Transfer Learning Variant III -> Directly test the trained model

In [None]:
# Evaluate the current `model` on the held-out test set.
temp = model.predict(testX)
# Convert predicted probabilities and one-hot targets back to class indices.
y_pred  = np.argmax(temp, 1)
y_true = np.argmax(testY, 1)
# average=None yields one score per class instead of a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, shadowing the function of the same name
# imported from sklearn.metrics at the top of the notebook.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

Plot Embeddings

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        low_dim_embs: Array whose row ``i`` holds the (x, y) position of
            ``labels[i]``; it must have at least ``len(labels)`` rows.
        labels: Texts to draw next to the points.
        filename: Accepted but not used -- the figure is shown, not saved.

    Raises:
        AssertionError: If there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(14, 14))  # in inches

    # Placement shared by every annotation: slightly offset from the point.
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)

    plt.show()
    
def plot_embedding(embedding, reverse_dictionary):
    """Project an embedding matrix to 2-D with t-SNE and plot it with labels.

    Args:
        embedding: Embedding matrix, one row per vocabulary index.
        reverse_dictionary: Index -> token lookup (indexable by
            ``0 .. len(reverse_dictionary) - 1``); entry ``i`` labels row
            ``i`` of the projection.

    Returns:
        The 2-D coordinates produced by t-SNE.
    """
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    low_dim_embs = tsne.fit_transform(embedding)
    # Size the label list from the lookup that was passed in rather than from
    # the notebook-global `vocab_size`, which may be stale or undefined.
    labels = [reverse_dictionary[i] for i in range(len(reverse_dictionary))]
    plot_with_labels(low_dim_embs, labels)
    return low_dim_embs


In [None]:
# Visualise the embedding weights of the current model.
reverse_dict = vocab_processor.vocabulary_._reverse_mapping
# NOTE(review): layers[0] is assumed to be the Embedding layer, which depends on
# which model `model` currently refers to -- confirm.
embedding = model.layers[0].get_weights()[0]
low_dim_embs = plot_embedding(embedding, reverse_dict)

XGBoost for classification

In [None]:
def gen_data(x_text, labels):
    """Turn texts into averaged-embedding feature vectors.

    Each text is converted to an id sequence with the notebook-level
    ``vocab_processor``; the rows of the first weight array of
    ``model.layers[0]`` for those ids are summed and divided by the sequence
    length. Ids that fall outside the weight matrix are skipped (but still
    count towards the divisor).

    Args:
        x_text: Iterable of raw texts.
        labels: Sequence of labels, aligned with ``x_text``.

    Returns:
        ``(X, y)``: an array with one ``embed_size``-long feature vector per
        text, and the labels as an array.
    """
    x_text = np.array(list(vocab_processor.transform(x_text)))
    embedding_weights = model.layers[0].get_weights()[0]
    X, y = [], []
    for i in range(len(x_text)):
        emb = np.zeros(embed_size)
        for word in x_text[i]:
            try:
                emb += embedding_weights[word]
            except IndexError:
                # Id has no row in the embedding matrix: skip it. Any other
                # error (e.g. a width mismatch) is no longer hidden.
                print("Here")
        # Divide by the full sequence length, padding positions included.
        emb /= len(x_text[i])
        X.append(emb)
        y.append(labels[i])
    X = np.array(X)
    y = np.array(y)
    return X, y


In [None]:
# Averaged-embedding features for the current train/test texts.
# NOTE(review): this rebinds trainX/trainY/testX/testY, replacing the padded
# sequences and one-hot labels used by the neural models above.
trainX, trainY = gen_data(X_train, Y_train)
testX, testY = gen_data(X_test, Y_test)

In [None]:
# Fit an XGBoost classifier on the averaged-embedding features and report test metrics.
# (The variable keeps the name `logreg` although it holds an XGBClassifier.)
logreg = xgb.XGBClassifier()
logreg.fit(trainX, trainY)

y_pred = logreg.predict(testX)
y_true = Y_test
# average=None yields one score per class instead of a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, shadowing the function of the same name
# imported from sklearn.metrics at the top of the notebook.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

10-fold cross-validation

In [None]:
def classification_model(X, Y):
    """Cross-validate an XGBoost classifier and print weighted scores.

    The data is shuffled with a fixed seed, then recall, precision and F1
    (all ``*_weighted``) are each estimated with a separate 10-fold
    ``cross_val_score`` run. Mean and two standard deviations are printed
    for each metric, followed by the raw per-fold scores.

    Args:
        X: Feature matrix.
        Y: Labels aligned with ``X``.

    Returns:
        None; results are only printed.
    """
    NO_OF_FOLDS = 10
    X, Y = shuffle(X, Y, random_state=42)
    clf = xgb.XGBClassifier()
    scores2 = cross_val_score(clf, X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2))
    scores1 = cross_val_score(clf, X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print("Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2))
    scores3 = cross_val_score(clf, X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2))
    # Per-fold scores in the order precision, recall, F1.
    print(scores1, scores2, scores3)

# Build averaged-embedding features for the whole data set and cross-validate on them.
# The notebook-level label array is called `label`; `labels` is never defined
# at this level, so the original call raised a NameError.
X, Y = gen_data(x_text, label)
classification_model(X,Y)

DeepMoji

In [None]:
# Settings for SARC Data
max_document_length = 50
#Irony
VALIDATION_SPLIT = 0.2
indices = np.load("split.pkl")
x_text = x_text[indices]
label = label[indices]
num_validation_samples = int(VALIDATION_SPLIT * x_text.shape[0])

X_train = x_text[:-num_validation_samples]
Y_train = label[:-num_validation_samples]
X_test = x_text[-num_validation_samples:]
Y_test = label[-num_validation_samples:]

In [None]:
import sys
# Make the local DeepMoji checkout importable.
sys.path.append("DeepMoji")
# NOTE(review): hard-coded, machine-specific absolute path -- will not exist on other machines.
sys.path.append("/Library/Python/2.7/site-packages")
# import example_helper
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH


In [None]:
def get_encoded_deepmoji(x_text, labels):
    """Encode texts with the pre-trained DeepMoji feature encoder.

    The texts are converted to unicode, tokenized with the vocabulary stored
    at ``VOCAB_PATH`` (sequence length ``max_document_length``) and passed
    through the encoder loaded from ``PRETRAINED_PATH``. The encoder is
    loaded anew on every call.

    Args:
        x_text: Iterable of raw texts.
        labels: Labels aligned with ``x_text``.

    Returns:
        ``(X, y)``: the encoder outputs and the labels, both as arrays.
    """
    # Decode data: fall back to explicit UTF-8 decoding for byte strings
    # that the plain unicode() conversion cannot handle.
    try:
        sentences = [unicode(text) for text in x_text]
    except UnicodeDecodeError:
        sentences = [text.decode('utf-8') for text in x_text]

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as vocab_file:
        vocab_map = json.load(vocab_file)
    tokenizer = SentenceTokenizer(vocab_map, max_document_length)
    token_ids = tokenizer.tokenize_sentences(sentences)[0]

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    encoder = deepmoji_feature_encoding(max_document_length, PRETRAINED_PATH)
    encoder.summary()

    print('Encoding texts..')
    features = encoder.predict(token_ids)

    return np.array(features), np.array(labels)


In [None]:
# DeepMoji encodings for the current train/test texts (rebinds trainX/trainY/testX/testY).
# Each call loads the pre-trained encoder again.
trainX, trainY = get_encoded_deepmoji(X_train, Y_train)
testX, testY = get_encoded_deepmoji(X_test, Y_test)

In [None]:
# Fit an XGBoost classifier on the DeepMoji features and report test metrics.
# (The variable keeps the name `logreg` although it holds an XGBClassifier.)
logreg = xgb.XGBClassifier()
logreg.fit(trainX, trainY)

y_pred = logreg.predict(testX)
y_true = Y_test
# average=None yields one score per class instead of a single aggregate.
precision = metrics.precision_score(y_true, y_pred, average=None)
recall = metrics.recall_score(y_true, y_pred, average=None)
# NOTE(review): this rebinds `f1_score`, shadowing the function of the same name
# imported from sklearn.metrics at the top of the notebook.
f1_score = metrics.f1_score(y_true, y_pred, average=None)
print("Precision: " + str(precision) + "\n")
print("Recall: " + str(recall) + "\n")
print("f1_score: " + str(f1_score) + "\n")
print(confusion_matrix(y_true, y_pred))
print(":: Classification Report")
print(classification_report(y_true, y_pred))

In [None]:
# Store the permutation together with the DeepMoji train/test features and labels.
np.savez('trainedmodels/deepmoji',indices,trainX,testX,trainY,testY)

Sentiment-based features

In [None]:
sid = SentimentIntensityAnalyzer()
def get_sentiment_score(tweet):
    """Return the ``[pos, neg, neu]`` polarity scores of ``tweet`` as an array.

    The scores come from the shared ``sid`` analyzer's ``polarity_scores``;
    the order of the three entries is positive, negative, neutral.
    """
    polarity = sid.polarity_scores(tweet)
    return np.array([polarity[key] for key in ("pos", "neg", "neu")])

def get_sentiment(x_text, labels):
    """Compute sentiment features for every text.

    Args:
        x_text: Iterable of tweets.
        labels: Labels aligned with ``x_text``.

    Returns:
        ``(X, y)``: one ``get_sentiment_score`` row per tweet, and the labels
        as an array.
    """
    score_rows = [get_sentiment_score(tweet) for tweet in x_text]
    return np.array(score_rows), np.array(labels)


In [None]:
# Sentiment features for the current train/test texts (rebinds trainX/trainY/testX/testY).
trainX, trainY = get_sentiment(X_train, Y_train)
testX, testY = get_sentiment(X_test, Y_test)

In [None]:
# Store the permutation together with the sentiment train/test features and labels.
np.savez('trainedmodels/sent_features',indices, trainX, testX, trainY, testY)  