In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix 
from scipy import sparse
import xgboost as xgb
from sklearn.utils import shuffle
from string import punctuation
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

import os
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, Activation
from keras.models import Model,Sequential
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from keras.models import load_model

Using Theano backend.


In [2]:
def get_data(filename):
    """Load one pre-computed feature set from an ``.npz`` archive.

    The archive is expected to hold five positionally-saved arrays
    (``arr_0`` .. ``arr_4``), which are exposed under descriptive keys.

    Parameters
    ----------
    filename : str
        Path to the ``.npz`` file.

    Returns
    -------
    dict
        Keys ``"indices"``, ``"X_train"``, ``"X_test"``, ``"y_train"`` and
        ``"y_test"`` mapped to ``arr_0`` .. ``arr_4`` respectively.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    KeyError
        If one of the expected ``arr_N`` entries is missing.
    """
    # Hand the path straight to np.load: an .npz is a binary zip archive, so
    # opening it with a text-mode open() is wrong on Python 3 / Windows.
    # Using the NpzFile as a context manager also closes the archive once the
    # arrays have been read (members are only loaded on access).
    with np.load(filename) as archive:
        return {
            "indices": archive['arr_0'],
            "X_train": archive['arr_1'],
            "X_test": archive['arr_2'],
            "y_train": archive['arr_3'],
            "y_test": archive['arr_4'],
        }

In [3]:
class AttLayer(Layer):
    """Attention-style pooling layer with one trainable weight vector.

    ``call`` scores the input against ``self.W``, normalises the
    exponentiated scores over axis 1 and returns the input summed over
    axis 1 using those weights.

    The implementation calls ``dimshuffle`` directly on the tensors, so it
    relies on the Theano backend selected at the top of the notebook.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base ``Layer``."""
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector ``self.W`` sized by the last input dimension."""
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='kernel', 
                                      shape=(input_shape[-1],),
                                      initializer='random_normal',
                                      trainable=True)
        super(AttLayer, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        ``mask`` is accepted but not used by this implementation.
        """
        # assumes x is (batch, steps, features) - TODO confirm against callers.
        # Score per position: tanh of the projection of x onto W.
        eij = K.tanh(K.dot(x, self.W))
        
        # Normalise exp(score) by its sum over axis 1; the sum is given a
        # broadcastable trailing axis via dimshuffle so the division lines up.
        # NOTE(review): exp is applied without subtracting a maximum first;
        # tanh bounds the scores, which presumably keeps this stable.
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        
        # Add a broadcastable last axis to the weights, scale x, and collapse
        # axis 1.
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions only."""
        return (input_shape[0], input_shape[-1])

# Load the saved model from disk. AttLayer is passed through custom_objects
# so load_model can resolve the custom layer class defined in this notebook.
model_sarc = load_model("trainedmodels/sarc_transfer_all.h5", custom_objects={'AttLayer':AttLayer})

In [4]:
# Registry of pre-computed feature sets, keyed by the short name that
# get_features / stack_features look up. Entries are loaded in the order
# written here.
data = {
    "char_tfidf": get_data('./trainedmodels/char_ngrams_tfidf.npz'),
    "holographic": get_data('./trainedmodels/holographic.npz'),
    "sarc": get_data('./trainedmodels/sarc_transfer_all.npz'),
    "deepmoji": get_data('./trainedmodels/deepmoji.npz'),
}

In [5]:
def get_sarc_features(x_text, labels):
    """Build one averaged-embedding feature vector per text.

    The embedding matrix is taken from the first weight array of the first
    layer of the module-level ``model_sarc``.

    Parameters
    ----------
    x_text : sequence of sequences
        Each item is a sequence of row indices into the embedding matrix
        (presumably word ids - verify against the caller).
    labels : sequence
        One label per item of ``x_text``; indexed positionally.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X[i]`` is the sum of the embedding rows for ``x_text[i]`` divided by
        ``len(x_text[i])`` (tokens that could not be looked up still count in
        the divisor); ``y`` is ``labels`` as an array.  An empty text yields
        an all-zero row rather than NaN.
    """
    embedding_weights = model_sarc.layers[0].get_weights()[0]
    embed_size = embedding_weights.shape[1]
    X, y = [], []
    for i in range(len(x_text)):
        tokens = x_text[i]
        emb = np.zeros(embed_size)
        for word in tokens:
            try:
                emb += embedding_weights[word]
            except (IndexError, TypeError, ValueError):
                # Best-effort: skip tokens that cannot be looked up, but no
                # longer swallow KeyboardInterrupt/SystemExit like the bare
                # ``except`` did.
                print("Here")
        # Guard the mean: dividing by zero for an empty text would fill the
        # row with NaN and poison any downstream classifier.
        if len(tokens) > 0:
            emb /= len(tokens)
        X.append(emb)
        y.append(labels[i])
    X = np.array(X)
    y = np.array(y)
    return X, y


In [6]:
def get_model(model_name):
    """Return a fresh, unfitted classifier for the given short name.

    ``"xgb"`` gives an ``xgb.XGBClassifier`` with default settings and
    ``"lr"`` gives a ``LogisticRegression`` with ``class_weight="balanced"``.
    Any other name yields ``None``.
    """
    if model_name == "lr":
        return LogisticRegression(class_weight="balanced")
    if model_name == "xgb":
        return xgb.XGBClassifier()
    # Unknown name: fall through to None, as callers currently receive.
    return None

In [10]:
def get_features(name):
    """Return the train/test split stored under ``name`` in the ``data`` registry.

    Parameters
    ----------
    name : str
        Key into the module-level ``data`` dict.

    Returns
    -------
    tuple or None
        ``(X_train, Y_train, X_test, Y_test)`` when ``name`` is present.
        When it is not, ``"No data found"`` is printed and ``None`` is
        returned, so callers that unpack the result should check for it.
    """
    if name not in data:
        # print() with a single argument behaves the same on Python 2 and 3,
        # matching the call style already used in evaluate_model.
        print("No data found")
        return None

    data_dict = data[name]
    return (
        data_dict["X_train"],
        data_dict["y_train"],
        data_dict["X_test"],
        data_dict["y_test"],
    )

In [12]:
def evaluate_model(model, testX, testY):
    """Print evaluation metrics for ``model`` on the given test data.

    Predictions come from ``model.predict(testX)`` and are compared with
    ``testY``.  Printed, in order: per-class precision, recall and F1
    (``average=None``), the confusion matrix, the classification report and
    the accuracy.  Nothing is returned.
    """
    predicted = model.predict(testX)
    expected = testY

    # Compute every score up front so that all printing happens afterwards.
    per_class_scores = [
        ("Precision: ", metrics.precision_score(expected, predicted, average=None)),
        ("Recall: ", metrics.recall_score(expected, predicted, average=None)),
        ("f1_score: ", metrics.f1_score(expected, predicted, average=None)),
    ]
    accuracy = metrics.accuracy_score(expected, predicted)

    for label, scores in per_class_scores:
        print(label + str(scores) + "\n")
    print(confusion_matrix(expected, predicted))
    print(":: Classification Report")
    print(classification_report(expected, predicted))
    print("Accuracy: " + str(accuracy) + "\n")

In [27]:
def stack_features(name_1, name_2):
    """Concatenate two registered feature sets column-wise.

    The train and test matrices of ``data[name_1]`` and ``data[name_2]`` are
    joined with ``np.hstack`` (``name_1`` columns first).  Labels are taken
    from ``name_1`` only.

    Returns ``(X_train, Y_train, X_test, Y_test)``.
    """
    first, second = data[name_1], data[name_2]
    stacked = {
        split: np.hstack((first[split], second[split]))
        for split in ("X_train", "X_test")
    }
    return stacked["X_train"], first["y_train"], stacked["X_test"], first["y_test"]

In [21]:
# Experiment: class-balanced logistic regression on the "deepmoji" feature
# set alone.
X_train, Y_train, X_test, Y_test = get_features("deepmoji")

model = get_model("lr")

model.fit(X_train, Y_train)

# Print the metrics for the test split returned above.
evaluate_model(model, X_test, Y_test)

Precision: [ 0.68065268  0.72072072]

Recall: [ 0.75844156  0.63660477]

f1_score: [ 0.71744472  0.67605634]

[[292  93]
 [137 240]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.68      0.76      0.72       385
          1       0.72      0.64      0.68       377

avg / total       0.70      0.70      0.70       762

Accuracy: 0.698162729659



In [28]:
# Experiment: class-balanced logistic regression on the "char_tfidf" and
# "deepmoji" feature sets stacked column-wise (labels come from "char_tfidf").
X_train, Y_train, X_test, Y_test = stack_features("char_tfidf","deepmoji")

model = get_model("lr")

model.fit(X_train, Y_train)

# Print the metrics for the stacked test split.
evaluate_model(model, X_test, Y_test)

Precision: [ 0.70095694  0.73255814]

Recall: [ 0.76103896  0.66843501]

f1_score: [ 0.72976339  0.69902913]

[[293  92]
 [125 252]]
:: Classification Report
             precision    recall  f1-score   support

          0       0.70      0.76      0.73       385
          1       0.73      0.67      0.70       377

avg / total       0.72      0.72      0.71       762

Accuracy: 0.715223097113



In [None]:
"""
While loading data in the dataframe, some lines are incorrectly read, i.e., their tweet length is >140 since multiple tweets are read as single record. I have removed these records as:
"""
# NOTE(review): this rebinds `data`, which the earlier cells use as the dict
# of feature sets read by get_features / stack_features. After this cell runs
# those helpers will see a DataFrame instead; consider a distinct name.
# Read the tab-separated training file into a DataFrame.
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA_emoji.txt", sep="\t")
# Keep only rows whose 'Tweet text' is at most 140 characters long.
data = data[data['Tweet text'].map(len)<=140]