In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix 

import xgboost as xgb
from sklearn.utils import shuffle
from string import punctuation
import re
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

import os
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, Activation
from keras.models import Model,Sequential
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers
from keras.models import load_model
from sklearn.linear_model import LogisticRegression

In [None]:
def get_data(filename):
    """Load a saved train/test split from an ``.npz`` archive.

    The archive is expected to hold five positionally named arrays
    (``arr_0`` .. ``arr_4``), which are exposed under descriptive keys.

    Args:
        filename: path to the ``.npz`` file.

    Returns:
        dict with the keys ``"indices"``, ``"X_train"``, ``"X_test"``,
        ``"y_train"`` and ``"y_test"``, each mapped to a numpy array.

    Raises:
        IOError/OSError: if the file cannot be opened.
        KeyError: if one of ``arr_0`` .. ``arr_4`` is missing from the archive.
    """
    # (result key, name of the array inside the archive)
    layout = (
        ("indices", "arr_0"),
        ("X_train", "arr_1"),
        ("X_test", "arr_2"),
        ("y_train", "arr_3"),
        ("y_test", "arr_4"),
    )
    # Pass the path straight to np.load: an .npz file is a binary zip archive,
    # so handing over a text-mode file object breaks on Python 3. Using the
    # returned NpzFile as a context manager closes the underlying file handle.
    with np.load(filename) as archive:
        data = {}
        for key, array_name in layout:
            # Indexing the archive reads the array into memory, so the values
            # stay valid after the archive has been closed.
            data[key] = archive[array_name]
    return data

In [None]:
class AttLayer(Layer):
    """Attention pooling layer that collapses axis 1 of a rank-3 input.

    A single trainable vector ``W`` (one entry per position of the last input
    axis) scores every slice along axis 1; the scores are normalised with a
    softmax over axis 1 and used to form a weighted sum of the input along
    that axis.

    The implementation calls ``dimshuffle`` and ``.sum`` directly on the
    tensors, i.e. it relies on the Theano backend.
    # assumes the input is laid out as (batch, steps, features) -- TODO confirm
    """

    def __init__(self, **kwargs):
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable scoring vector, sized by the last input axis."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='kernel',
            shape=(feature_dim,),
            initializer='random_normal',
            trainable=True,
        )
        # Delegate to the base class build, as the original layer does.
        super(AttLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        ``mask`` is accepted but not used.
        """
        # Unnormalised scores: exp(tanh(x . W)), one per (sample, step).
        scores = K.exp(K.tanh(K.dot(x, self.W)))

        # Softmax normalisation along axis 1; the per-sample total is
        # broadcast back over that axis.
        totals = K.sum(scores, axis=1).dimshuffle(0, 'x')
        attention = scores / totals

        # Broadcast the weights over the last axis, then pool over axis 1.
        pooled = x * attention.dimshuffle(0, 1, 'x')
        return pooled.sum(axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions only."""
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

# Restore the saved model from disk. AttLayer is supplied through
# custom_objects so that the custom layer class can be resolved when the
# file is deserialised.
model = load_model("trainedmodels/sarc_transfer_all.h5", custom_objects={'AttLayer':AttLayer})

In [None]:
# Load the four saved train/test splits, one per .npz file. Each result is
# the dict produced by get_data (indices, X_train, X_test, y_train, y_test).
# NOTE(review): presumably each file holds a different feature representation
# of the same samples, as the file names suggest -- confirm.
data_1 = get_data('./trainedmodels/char_ngrams_tfidf.npz')
data_2 = get_data('./trainedmodels/holographic.npz')
data_3 = get_data('./trainedmodels/sarc_transfer_all.npz')
data_4 = get_data('./trainedmodels/deepmoji.npz')

In [None]:
def get_results_XGB(data_1, data_2):
    """Fit an XGBoost classifier on two stacked feature sets and print test metrics.

    The ``X_train`` / ``X_test`` arrays of both inputs are concatenated
    column-wise with ``np.hstack``. Labels are taken from ``data_1`` only.

    Args:
        data_1: dict with ``"X_train"``, ``"X_test"``, ``"y_train"`` and
            ``"y_test"`` entries; supplies features and the labels.
        data_2: dict with ``"X_train"`` and ``"X_test"`` entries; supplies
            additional feature columns. If it also has ``"y_train"`` /
            ``"y_test"``, they are compared against ``data_1``'s labels.

    Returns:
        None. Per-class precision, recall and F1, the overall accuracy, the
        confusion matrix and the classification report are printed.
    """
    # Function-scope import keeps this cell independent of the import cell.
    import warnings

    # Labels come from data_1 only, so the stacked rows are meaningful only if
    # both inputs describe the same samples in the same order. Warn (rather
    # than raise) so an intentional mismatch does not stop the notebook.
    for split in ("y_train", "y_test"):
        if split in data_2 and not np.array_equal(data_1[split], data_2[split]):
            warnings.warn(
                "get_results_XGB: %s differs between data_1 and data_2; "
                "the stacked feature rows may not refer to the same samples."
                % split
            )

    train_x = np.hstack((data_1["X_train"], data_2["X_train"]))
    train_y = data_1["y_train"]
    test_x = np.hstack((data_1["X_test"], data_2["X_test"]))
    y_true = data_1["y_test"]

    classifier = xgb.XGBClassifier()
    classifier.fit(train_x, train_y)
    y_pred = classifier.predict(test_x)

    # average=None yields one score per class instead of a single aggregate.
    precision = metrics.precision_score(y_true, y_pred, average=None)
    recall = metrics.recall_score(y_true, y_pred, average=None)
    f1_score = metrics.f1_score(y_true, y_pred, average=None)
    # Accuracy is reported as well, matching the output of get_results_LR.
    accuracy = metrics.accuracy_score(y_true, y_pred)

    print("Precision: " + str(precision) + "\n")
    print("Recall: " + str(recall) + "\n")
    print("f1_score: " + str(f1_score) + "\n")
    print("Accuracy: " + str(accuracy) + "\n")
    print(confusion_matrix(y_true, y_pred))
    print(":: Classification Report")
    print(classification_report(y_true, y_pred))

In [None]:

def get_results_LR(data):
    """Fit a class-balanced logistic regression and print its test metrics.

    Args:
        data: dict with ``"X_train"``, ``"y_train"``, ``"X_test"`` and
            ``"y_test"`` entries.

    Returns:
        None. Per-class precision, recall and F1, the overall accuracy, the
        confusion matrix and the classification report are printed.
    """
    # Pull every split out up front so a missing key fails before training.
    train_features, train_labels = data["X_train"], data["y_train"]
    test_features, y_true = data["X_test"], data["y_test"]

    classifier = LogisticRegression(class_weight="balanced")
    classifier.fit(train_features, train_labels)
    y_pred = classifier.predict(test_features)

    # All scores are computed before anything is printed; average=None gives
    # one value per class.
    report_lines = [
        ("Precision: ", metrics.precision_score(y_true, y_pred, average=None)),
        ("Recall: ", metrics.recall_score(y_true, y_pred, average=None)),
        ("f1_score: ", metrics.f1_score(y_true, y_pred, average=None)),
        ("Accuracy: ", metrics.accuracy_score(y_true, y_pred)),
    ]
    for prefix, value in report_lines:
        print(prefix + str(value) + "\n")

    print(confusion_matrix(y_true, y_pred))
    print(":: Classification Report")
    print(classification_report(y_true, y_pred))

In [None]:
# Evaluate XGBoost on the features loaded from holographic.npz (data_2)
# stacked with those loaded from deepmoji.npz (data_4); labels come from data_2.
get_results_XGB(data_2, data_4)

In [None]:
"""
While loading the data into the DataFrame, some lines are read incorrectly: several tweets are merged into a single record, so the resulting tweet text is longer than 140 characters. These records are removed as follows:
"""
# NOTE(review): records that merge several tweets are often caused by
# unbalanced double quotes in the text; reading with quoting=csv.QUOTE_NONE
# may avoid the problem at the source. Confirm against the raw file before
# changing, since it alters which rows are kept.
data = pd.read_csv("./datasets/train/SemEval2018-T3-train-taskA_emoji.txt", sep="\t")
# Keep only records whose text fits in a single tweet (at most 140 characters).
# .str.len() returns NaN for a missing text instead of raising TypeError like
# map(len) does; NaN <= 140 is False, so such rows are dropped as well.
data = data[data['Tweet text'].str.len() <= 140]